Support RVV v0.12 intrinsics for row_rvv.cc & scale_rvv.cc

1. Add two defined marco LIBYUV_RVV_HAS_TUPLE_TYPE & LIBYUV_RVV_HAS_VXRM_ARG

Intrinsic v0.12 introduces
- tuple type in segment load & store
- vxrm argument in fixed-point intrinsics (e.g vnclip)

These two marcos are controled by __riscv_v_intrinsic.

2. Support RVV v0.12 intrinsics in row_rvv.cc & scale_rvv.cc

Change-Id: I921f91d9dc8fdda031e7b6647d0e296aa2793c39
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4767120
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 2f9c792..33a304e 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -841,8 +841,15 @@
 #endif
 
 #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#if __riscv_v_intrinsic > 11000
+// Since v0.12, TUPLE_TYPE is introduced for segment load and store.
+#define LIBYUV_RVV_HAS_TUPLE_TYPE
+// Since v0.12, VXRM(fixed-point rounding mode) is included in arguments of
+// fixed-point intrinsics.
+#define LIBYUV_RVV_HAS_VXRM_ARG
+#endif
+
 #define HAS_COPYROW_RVV
-#if __riscv_v_intrinsic == 11000
 #define HAS_AB64TOARGBROW_RVV
 #define HAS_ABGRTOYJROW_RVV
 #define HAS_ABGRTOYROW_RVV
@@ -900,7 +907,6 @@
 #define HAS_SPLITUVROW_RVV
 #define HAS_SPLITXRGBROW_RVV
 #endif
-#endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
 #if defined(VISUALC_HAS_AVX2)
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 02ed61c..cc3b8f6 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -184,7 +184,6 @@
 // #define HAS_SCALEARGBROWDOWNEVEN_RVV
 #define HAS_SCALEUVROWDOWN4_RVV
 #define HAS_SCALEUVROWDOWNEVEN_RVV
-#if __riscv_v_intrinsic == 11000
 #define HAS_SCALEARGBROWDOWN2_RVV
 #define HAS_SCALEARGBROWDOWN2BOX_RVV
 #define HAS_SCALEARGBROWDOWN2LINEAR_RVV
@@ -208,7 +207,6 @@
 #define HAS_SCALEUVROWUP2_BILINEAR_RVV
 #define HAS_SCALEUVROWUP2_LINEAR_RVV
 #endif
-#endif
 
 // Scale ARGB vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
diff --git a/riscv_script/riscv-clang.cmake b/riscv_script/riscv-clang.cmake
index e287941..35888ae 100644
--- a/riscv_script/riscv-clang.cmake
+++ b/riscv_script/riscv-clang.cmake
@@ -43,6 +43,7 @@
     list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc")
   endif()
 endif()
+add_compile_options("-Wuninitialized")
 message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}")
 
 set(CMAKE_C_FLAGS             "${RISCV_COMPILER_FLAGS} ${CMAKE_C_FLAGS}")
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 0bf2bef..39a5c0d 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -28,6 +28,20 @@
 extern "C" {
 #endif
 
+#ifdef LIBYUV_RVV_HAS_VXRM_ARG
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
+  {                                                              \
+    ub = yuvconst->kUVCoeff[0];                                  \
+    vr = yuvconst->kUVCoeff[1];                                  \
+    ug = yuvconst->kUVCoeff[2];                                  \
+    vg = yuvconst->kUVCoeff[3];                                  \
+    yg = yuvconst->kRGBCoeffBias[0];                             \
+    bb = yuvconst->kRGBCoeffBias[1] + 32;                        \
+    bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
+    br = yuvconst->kRGBCoeffBias[3] + 32;                        \
+  }
+#else
 // Fill YUV -> RGB conversion constants into vectors
 // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
 // register) is set to round-to-nearest-up mode(0).
@@ -43,7 +57,7 @@
     bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
     br = yuvconst->kRGBCoeffBias[3] + 32;                        \
   }
-
+#endif
 // Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422
 #define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
   {                                                              \
@@ -95,6 +109,15 @@
     v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl);                          \
   }
 
+#ifdef LIBYUV_RVV_HAS_VXRM_ARG
+// Convert from fixed point RGB To 8 bit RGB
+#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r)        \
+  {                                                                 \
+    v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, __RISCV_VXRM_RNU, vl); \
+    v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, __RISCV_VXRM_RNU, vl); \
+    v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, __RISCV_VXRM_RNU, vl); \
+  }
+#else
 // Convert from fixed point RGB To 8 bit RGB
 #define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
   {                                                          \
@@ -102,7 +125,53 @@
     v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl);            \
     v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl);            \
   }
+#endif
 
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv
+#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \
+  {                                                      \
+    vuint8m1x2_t v_tmp;                                  \
+    vuint8m1_t v_tmp0, v_tmp1;                           \
+    vuint8m2_t v_y;                                      \
+    vuint16m2_t v_u_16, v_v_16;                          \
+    vl = __riscv_vsetvl_e8m1((w + 1) / 2);               \
+    v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_uv, vl);       \
+    v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0);       \
+    v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1);       \
+    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);     \
+    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);     \
+    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);  \
+    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);  \
+    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);     \
+    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);     \
+    vl = __riscv_vsetvl_e8m2(w);                         \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl);                \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);        \
+  }
+
+// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu
+#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \
+  {                                                      \
+    vuint8m1x2_t v_tmp;                                  \
+    vuint8m1_t v_tmp0, v_tmp1;                           \
+    vuint8m2_t v_y;                                      \
+    vuint16m2_t v_u_16, v_v_16;                          \
+    vl = __riscv_vsetvl_e8m1((w + 1) / 2);               \
+    v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_vu, vl);       \
+    v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0);       \
+    v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1);       \
+    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);     \
+    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);     \
+    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);  \
+    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);  \
+    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);     \
+    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);     \
+    vl = __riscv_vsetvl_e8m2(w);                         \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl);                \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);        \
+  }
+#else
 // Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv
 #define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16)   \
   {                                                        \
@@ -140,6 +209,7 @@
     v_y = __riscv_vle8_v_u8m2(src_y, vl);                  \
     v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);          \
   }
+#endif
 
 #ifdef HAS_ARGBTOAR64ROW_RVV
 void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
@@ -160,6 +230,34 @@
 #endif
 
 #ifdef HAS_ARGBTOAB64ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
+  size_t avl = (size_t)width;
+  do {
+    vuint16m2x4_t v_dst_ab64;
+    vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
+    size_t vl = __riscv_vsetvl_e8m1(avl);
+    vuint8m1x4_t v_src_argb = __riscv_vlseg4e8_v_u8m1x4(src_argb, vl);
+    vuint8m1_t v_b = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 0);
+    vuint8m1_t v_g = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 1);
+    vuint8m1_t v_r = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 2);
+    vuint8m1_t v_a = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 3);
+    v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl);
+    v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl);
+    v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl);
+    v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl);
+    v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl);
+    v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl);
+    v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl);
+    v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl);
+    v_dst_ab64 = __riscv_vcreate_v_u16m2x4(v_r_16, v_g_16, v_b_16, v_a_16);
+    __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_ab64, vl);
+    avl -= vl;
+    src_argb += 4 * vl;
+    dst_ab64 += 4 * vl;
+  } while (avl > 0);
+}
+#else
 void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
   size_t avl = (size_t)width;
   do {
@@ -182,6 +280,7 @@
   } while (avl > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_AR64TOARGBROW_RVV
 void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
@@ -201,6 +300,26 @@
 #endif
 
 #ifdef HAS_AR64TOAB64ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void AR64ToAB64Row_RVV(const uint16_t* src_ar64,
+                       uint16_t* dst_ab64,
+                       int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e16m2(w);
+    vuint16m2x4_t v_argb16 = __riscv_vlseg4e16_v_u16m2x4(src_ar64, vl);
+    vuint16m2_t v_b = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 0);
+    vuint16m2_t v_g = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 1);
+    vuint16m2_t v_r = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 2);
+    vuint16m2_t v_a = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 3);
+    vuint16m2x4_t v_dst_abgr = __riscv_vcreate_v_u16m2x4(v_r, v_g, v_b, v_a);
+    __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_abgr, vl);
+    w -= vl;
+    src_ar64 += vl * 4;
+    dst_ab64 += vl * 4;
+  } while (w > 0);
+}
+#else
 void AR64ToAB64Row_RVV(const uint16_t* src_ar64,
                        uint16_t* dst_ab64,
                        int width) {
@@ -216,8 +335,31 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_AB64TOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
+  size_t avl = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e16m2(avl);
+    vuint16m2x4_t v_abgr16 = __riscv_vlseg4e16_v_u16m2x4(src_ab64, vl);
+    vuint16m2_t v_r_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 0);
+    vuint16m2_t v_g_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 1);
+    vuint16m2_t v_b_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 2);
+    vuint16m2_t v_a_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 3);
+    vuint8m1_t v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl);
+    vuint8m1_t v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl);
+    vuint8m1_t v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl);
+    vuint8m1_t v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl);
+    vuint8m1x4_t v_dst_argb = __riscv_vcreate_v_u8m1x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m1x4(dst_argb, v_dst_argb, vl);
+    avl -= vl;
+    src_ab64 += 4 * vl;
+    dst_argb += 4 * vl;
+  } while (avl > 0);
+}
+#else
 void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
   size_t avl = (size_t)width;
   do {
@@ -236,8 +378,28 @@
   } while (avl > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_RAWTOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2);
+    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_raw += vl * 3;
+    dst_argb += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
+}
+#else
 void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   size_t w = (size_t)width;
   size_t vl = __riscv_vsetvl_e8m2(w);
@@ -253,8 +415,28 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_RAWTORGBAROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2);
+    vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r);
+    __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl);
+    w -= vl;
+    src_raw += vl * 3;
+    dst_rgba += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
+}
+#else
 void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
   size_t w = (size_t)width;
   size_t vl = __riscv_vsetvl_e8m2(w);
@@ -270,8 +452,26 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_RAWTORGB24ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2);
+    vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
+    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
+    w -= vl;
+    src_raw += vl * 3;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#else
 void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
   size_t w = (size_t)width;
   do {
@@ -285,8 +485,27 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBTORAWROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
+    vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b);
+    __riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_raw += vl * 3;
+  } while (w > 0);
+}
+#else
 void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
   size_t w = (size_t)width;
   do {
@@ -300,8 +519,28 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBTORGB24ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
+                        uint8_t* dst_rgb24,
+                        int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
+    vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
+    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#else
 void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
@@ -317,8 +556,27 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBTOABGRROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
+    vuint8m2x4_t v_dst_abgr = __riscv_vcreate_v_u8m2x4(v_r, v_g, v_b, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_abgr, v_dst_abgr, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_abgr += vl * 4;
+  } while (w > 0);
+}
+#else
 void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
   size_t w = (size_t)width;
   do {
@@ -332,8 +590,27 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBTOBGRAROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
+    vuint8m2x4_t v_dst_bgra = __riscv_vcreate_v_u8m2x4(v_a, v_r, v_g, v_b);
+    __riscv_vsseg4e8_v_u8m2x4(dst_bgra, v_dst_bgra, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_bgra += vl * 4;
+  } while (w > 0);
+}
+#else
 void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
   size_t w = (size_t)width;
   do {
@@ -347,8 +624,27 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBTORGBAROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
+    vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r);
+    __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_rgba += vl * 4;
+  } while (w > 0);
+}
+#else
 void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
   size_t w = (size_t)width;
   do {
@@ -362,8 +658,27 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_RGBATOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3);
+    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_rgba += vl * 4;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
   size_t w = (size_t)width;
   do {
@@ -377,8 +692,30 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_RGB24TOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
+                        uint8_t* dst_argb,
+                        int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    vuint8m2x3_t v_src_rgb = __riscv_vlseg3e8_v_u8m2x3(src_rgb24, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 2);
+    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_rgb24 += vl * 3;
+    dst_argb += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
+}
+#else
 void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
@@ -396,8 +733,41 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I444TOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  vuint8m2x4_t v_dst_argb;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl;
+    src_v += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void I444ToARGBRow_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -427,8 +797,43 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I444ALPHATOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  size_t vl;
+  size_t w = (size_t)width;
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    vuint8m2x4_t v_dst_argb;
+    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+    v_a = __riscv_vle8_v_u8m2(src_a, vl);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    src_a += vl;
+    src_u += vl;
+    src_v += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -460,8 +865,40 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I444TORGB24ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  size_t vl;
+  size_t w = (size_t)width;
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    vuint8m2x3_t v_dst_rgb;
+    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
+    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl;
+    src_v += vl;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#else
 void I444ToRGB24Row_RVV(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -490,8 +927,41 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I422TOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  vuint8m2x4_t v_dst_argb;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl / 2;
+    src_v += vl / 2;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void I422ToARGBRow_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -521,8 +991,43 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I422ALPHATOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  size_t vl;
+  size_t w = (size_t)width;
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    vuint8m2x4_t v_dst_argb;
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+    v_a = __riscv_vle8_v_u8m2(src_a, vl);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    src_a += vl;
+    src_u += vl / 2;
+    src_v += vl / 2;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -554,8 +1059,41 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I422TORGBAROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgba,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  vuint8m2x4_t v_dst_rgba;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r);
+    __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl / 2;
+    src_v += vl / 2;
+    dst_rgba += vl * 4;
+  } while (w > 0);
+}
+#else
 void I422ToRGBARow_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -585,8 +1123,40 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I422TORGB24ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  size_t vl;
+  size_t w = (size_t)width;
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  vuint8m2x3_t v_dst_rgb;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
+    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl / 2;
+    src_v += vl / 2;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#else
 void I422ToRGB24Row_RVV(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -615,8 +1185,48 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_I400TOARGBROW_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void I400ToARGBRow_RVV(const uint8_t* src_y,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0);
+  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
+  vuint8m2x4_t v_dst_argb;
+  vuint16m4_t v_yb;
+  if (is_yb_positive) {
+    v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
+  } else {
+    v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl);
+  }
+  do {
+    vuint8m2_t v_y, v_out;
+    vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2;
+    vl = __riscv_vsetvl_e8m2(w);
+    v_y = __riscv_vle8_v_u8m2(src_y, vl);
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);
+    v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl);  // 257 * v_y
+    v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl);
+    if (is_yb_positive) {
+      v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl);
+    } else {
+      v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl);
+    }
+    v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, __RISCV_VXRM_RNU, vl);
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_out, v_out, v_out, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void I400ToARGBRow_RVV(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
@@ -656,8 +1266,25 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_J400TOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    vuint8m2_t v_y = __riscv_vle8_v_u8m2(src_y, vl);
+    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_y, v_y, v_y, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    dst_argb += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
+}
+#else
 void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   size_t w = (size_t)width;
   size_t vl = __riscv_vsetvl_e8m2(w);
@@ -673,6 +1300,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_COPYROW_RVV
 void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
@@ -689,6 +1317,36 @@
 #endif
 
 #ifdef HAS_NV12TOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void NV12ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  vuint8m2x4_t v_dst_argb;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    src_uv += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void NV12ToARGBRow_RVV(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
@@ -716,8 +1374,38 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_NV12TORGB24ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void NV12ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r;
+  vuint8m2x3_t v_dst_rgb;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
+    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
+    w -= vl;
+    src_y += vl;
+    src_uv += vl;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#else
 void NV12ToRGB24Row_RVV(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
@@ -744,8 +1432,39 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_NV21TOARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void NV21ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint8m2x4_t v_dst_argb;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_y += vl;
+    src_vu += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void NV21ToARGBRow_RVV(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
@@ -773,8 +1492,38 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_NV21TORGB24ROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void NV21ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r;
+  vuint8m2x3_t v_dst_rgb;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
+    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
+    w -= vl;
+    src_y += vl;
+    src_vu += vl;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#else
 void NV21ToRGB24Row_RVV(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
@@ -801,10 +1550,65 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 // Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
-
 #ifdef HAS_INTERPOLATEROW_RVV
+#ifdef LIBYUV_RVV_HAS_VXRM_ARG
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int dst_width,
+                        int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  size_t dst_w = (size_t)dst_width;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+  // Blend 100 / 0 - Copy row unchanged.
+  if (y1_fraction == 0) {
+    do {
+      size_t vl = __riscv_vsetvl_e8m8(dst_w);
+      __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
+      dst_w -= vl;
+      src_ptr += vl;
+      dst_ptr += vl;
+    } while (dst_w > 0);
+    return;
+  }
+  // Blend 50 / 50.
+  if (y1_fraction == 128) {
+    do {
+      size_t vl = __riscv_vsetvl_e8m8(dst_w);
+      vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
+      vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
+      vuint8m8_t row_out =
+          __riscv_vaaddu_vv_u8m8(row0, row1, __RISCV_VXRM_RNU, vl);
+      __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
+      dst_w -= vl;
+      src_ptr += vl;
+      src_ptr1 += vl;
+      dst_ptr += vl;
+    } while (dst_w > 0);
+    return;
+  }
+  // General purpose row blend.
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(dst_w);
+    vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
+    vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
+    vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
+    acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+    __riscv_vse8_v_u8m4(
+        dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, __RISCV_VXRM_RNU, vl), vl);
+    dst_w -= vl;
+    src_ptr += vl;
+    src_ptr1 += vl;
+    dst_ptr += vl;
+  } while (dst_w > 0);
+}
+#else
 void InterpolateRow_RVV(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
@@ -862,8 +1666,33 @@
   } while (dst_w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SPLITRGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void SplitRGBRow_RVV(const uint8_t* src_rgb,
+                     uint8_t* dst_r,
+                     uint8_t* dst_g,
+                     uint8_t* dst_b,
+                     int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x3_t v_src = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src, 1);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src, 2);
+    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+    w -= vl;
+    dst_r += vl;
+    dst_g += vl;
+    dst_b += vl;
+    src_rgb += vl * 3;
+  } while (w > 0);
+}
+#else
 void SplitRGBRow_RVV(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
@@ -885,8 +1714,31 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_MERGERGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void MergeRGBRow_RVV(const uint8_t* src_r,
+                     const uint8_t* src_g,
+                     const uint8_t* src_b,
+                     uint8_t* dst_rgb,
+                     int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+    vuint8m2x3_t v_dst = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b);
+    __riscv_vsseg3e8_v_u8m2x3(dst_rgb, v_dst, vl);
+    w -= vl;
+    src_r += vl;
+    src_g += vl;
+    src_b += vl;
+    dst_rgb += vl * 3;
+  } while (w > 0);
+}
+#else
 void MergeRGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
@@ -907,8 +1759,37 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SPLITARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void SplitARGBRow_RVV(const uint8_t* src_argb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      uint8_t* dst_a,
+                      int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src, 2);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src, 3);
+    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+    w -= vl;
+    dst_a += vl;
+    dst_r += vl;
+    dst_g += vl;
+    dst_b += vl;
+    src_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void SplitARGBRow_RVV(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
@@ -933,8 +1814,34 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_MERGEARGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void MergeARGBRow_RVV(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      const uint8_t* src_a,
+                      uint8_t* dst_argb,
+                      int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+    vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl);
+    vuint8m2x4_t v_dst = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst, vl);
+    w -= vl;
+    src_r += vl;
+    src_g += vl;
+    src_b += vl;
+    src_a += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void MergeARGBRow_RVV(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
@@ -958,8 +1865,33 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SPLITXRGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void SplitXRGBRow_RVV(const uint8_t* src_argb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src, 2);
+    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+    w -= vl;
+    dst_r += vl;
+    dst_g += vl;
+    dst_b += vl;
+    src_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void SplitXRGBRow_RVV(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
@@ -981,8 +1913,33 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_MERGEXRGBROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void MergeXRGBRow_RVV(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_argb,
+                      int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+    vuint8m2x4_t v_dst = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst, vl);
+    w -= vl;
+    src_r += vl;
+    src_g += vl;
+    src_b += vl;
+    dst_argb += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
+}
+#else
 void MergeXRGBRow_RVV(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
@@ -1006,8 +1963,29 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SPLITUVROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void SplitUVRow_RVV(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4x2_t v_src = __riscv_vlseg2e8_v_u8m4x2(src_uv, vl);
+    vuint8m4_t v_u = __riscv_vget_v_u8m4x2_u8m4(v_src, 0);
+    vuint8m4_t v_v = __riscv_vget_v_u8m4x2_u8m4(v_src, 1);
+    __riscv_vse8_v_u8m4(dst_u, v_u, vl);
+    __riscv_vse8_v_u8m4(dst_v, v_v, vl);
+    w -= vl;
+    dst_u += vl;
+    dst_v += vl;
+    src_uv += 2 * vl;
+  } while (w > 0);
+}
+#else
 void SplitUVRow_RVV(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
@@ -1026,8 +2004,28 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_MERGEUVROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void MergeUVRow_RVV(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4_t v_u = __riscv_vle8_v_u8m4(src_u, vl);
+    vuint8m4_t v_v = __riscv_vle8_v_u8m4(src_v, vl);
+    vuint8m4x2_t v_dst = __riscv_vcreate_v_u8m4x2(v_u, v_v);
+    __riscv_vsseg2e8_v_u8m4x2(dst_uv, v_dst, vl);
+    w -= vl;
+    src_u += vl;
+    src_v += vl;
+    dst_uv += 2 * vl;
+  } while (w > 0);
+}
+#else
 void MergeUVRow_RVV(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
@@ -1046,6 +2044,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 struct RgbConstants {
   uint8_t kRGBToY[4];
@@ -1080,6 +2079,41 @@
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored
 #ifdef HAS_ARGBTOYMATRIXROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
+                          uint8_t* dst_y,
+                          int width,
+                          const struct RgbConstants* rgbconstants) {
+  assert(width != 0);
+  size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors are to store RGBToY constant
+  vuint16m4_t v_addy;           // vector is to store kAddY
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+  do {
+    vuint8m2_t v_y;
+    vuint16m4_t v_y_u16;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
+    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+    w -= vl;
+    src_argb += 4 * vl;
+    dst_y += vl;
+  } while (w > 0);
+}
+#else
 void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
                           uint8_t* dst_y,
                           int width,
@@ -1110,6 +2144,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBTOYROW_RVV
 void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -1137,6 +2172,41 @@
 
 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
 #ifdef HAS_RGBATOYMATRIXROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
+                          uint8_t* dst_y,
+                          int width,
+                          const struct RgbConstants* rgbconstants) {
+  assert(width != 0);
+  size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors are to store RGBToY constant
+  vuint16m4_t v_addy;           // vector is to store kAddY
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+  do {
+    vuint8m2_t v_y;
+    vuint16m4_t v_y_u16;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3);
+    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+    w -= vl;
+    src_rgba += 4 * vl;
+    dst_y += vl;
+  } while (w > 0);
+}
+#else
 void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
                           uint8_t* dst_y,
                           int width,
@@ -1167,6 +2237,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_RGBATOYROW_RVV
 void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -1187,6 +2258,40 @@
 #endif
 
 #ifdef HAS_RGBTOYMATRIXROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
+                         uint8_t* dst_y,
+                         int width,
+                         const struct RgbConstants* rgbconstants) {
+  assert(width != 0);
+  size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors are to store RGBToY constant
+  vuint16m4_t v_addy;           // vector is to store kAddY
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+  do {
+    vuint8m2_t v_y;
+    vuint16m4_t v_y_u16;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x3_t v_src_rgb = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 2);
+    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+    w -= vl;
+    src_rgb += 3 * vl;
+    dst_y += vl;
+  } while (w > 0);
+}
+#else
 void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
                          uint8_t* dst_y,
                          int width,
@@ -1217,6 +2322,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_RGB24TOYJROW_RVV
 void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
@@ -1246,6 +2352,54 @@
 // dst_argb may be src_argb or src_argb1.
 // src_argb: RGB values have already been pre-multiplied by the a.
 #ifdef HAS_ARGBBLENDROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBBlendRow_RVV(const uint8_t* src_argb,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvlmax_e8m2();
+  // clamp255((((256 - a) * b) >> 8) + f)
+  // = b * (256 - a) / 256 + f
+  // = b - (b * a / 256) + f
+  vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl);
+  do {
+    vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r;
+    vuint8m2_t v_dst_b, v_dst_g, v_dst_r;
+    vuint8m2x4_t v_dst_argb;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src0_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_src0_b = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 0);
+    vuint8m2_t v_src0_g = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 1);
+    vuint8m2_t v_src0_r = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 2);
+    vuint8m2_t v_src0_a = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 3);
+    vuint8m2x4_t v_src1_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb1, vl);
+    vuint8m2_t v_src1_b = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 0);
+    vuint8m2_t v_src1_g = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 1);
+    vuint8m2_t v_src1_r = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 2);
+
+    v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl);
+    v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl);
+    v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl);
+
+    v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl);
+    v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl);
+    v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl);
+
+    v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl);
+    v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl);
+    v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl);
+
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_dst_b, v_dst_g, v_dst_r, v_255);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+
+    w -= vl;
+    src_argb += 4 * vl;
+    src_argb1 += 4 * vl;
+    dst_argb += 4 * vl;
+  } while (w > 0);
+}
+#else
 void ARGBBlendRow_RVV(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
@@ -1287,6 +2441,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_BLENDPLANEROW_RVV
 void BlendPlaneRow_RVV(const uint8_t* src0,
@@ -1323,6 +2478,41 @@
 
 // Attenuate: (f * a + 255) >> 8
 #ifdef HAS_ARGBATTENUATEROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width) {
+  size_t w = (size_t)width;
+  do {
+    vuint16m4_t v_ba_16, v_ga_16, v_ra_16;
+    vuint8m2x4_t v_dst_argb;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
+    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
+    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
+    // f * a
+    v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl);
+    v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl);
+    v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl);
+    // f * a + 255
+    v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl);
+    v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl);
+    v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl);
+    // (f * a + 255) >> 8
+    v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl);
+    v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl);
+    v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl);
+
+    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
+    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
@@ -1351,8 +2541,25 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBEXTRACTALPHAROW_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
+    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
+    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_a += vl;
+  } while (w > 0);
+}
+#else
 void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
@@ -1368,6 +2575,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_RVV
 void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc
index de037e4..5a6f6e5 100644
--- a/source/scale_rvv.cc
+++ b/source/scale_rvv.cc
@@ -67,6 +67,30 @@
 #endif
 
 #ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  (void)src_stride;
+  size_t w = (size_t)dst_width;
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  do {
+    size_t vl = __riscv_vsetvl_e32m4(w);
+    vuint32m4x2_t v_src = __riscv_vlseg2e32_v_u32m4x2(src, vl);
+    vuint32m4_t v_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src, 0);
+    vuint32m4_t v_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src, 1);
+    vuint8m4_t v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32);
+    vuint8m4_t v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32);
+    vuint8m4_t v_dst =
+        __riscv_vaaddu_vv_u8m4(v_even, v_odd, __RISCV_VXRM_RNU, vl * 4);
+    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+    w -= vl;
+    src += vl * 2;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
@@ -93,8 +117,45 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEARGBROWDOWN2BOX_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint32_t* src0 = (const uint32_t*)(src_argb);
+  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+  do {
+    size_t vl = __riscv_vsetvl_e32m4(w);
+    vuint32m4x2_t v_src0 = __riscv_vlseg2e32_v_u32m4x2(src0, vl);
+    vuint32m4x2_t v_src1 = __riscv_vlseg2e32_v_u32m4x2(src1, vl);
+    vuint32m4_t v_row0_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 0);
+    vuint32m4_t v_row0_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 1);
+    vuint32m4_t v_row1_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 0);
+    vuint32m4_t v_row1_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 1);
+    vuint8m4_t v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32);
+    vuint8m4_t v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32);
+    vuint8m4_t v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32);
+    vuint8m4_t v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32);
+    vuint16m8_t v_row0_sum =
+        __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4);
+    vuint16m8_t v_row1_sum =
+        __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4);
+    vuint16m8_t v_dst_16 =
+        __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+    vuint8m4_t v_dst =
+        __riscv_vnclipu_wx_u8m4(v_dst_16, 2, __RISCV_VXRM_RNU, vl * 4);
+    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+    w -= vl;
+    src0 += vl * 2;
+    src1 += vl * 2;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               uint8_t* dst_argb,
@@ -129,6 +190,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEARGBROWDOWNEVEN_RVV
 void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
@@ -152,6 +214,43 @@
 #endif
 
 #ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint32_t* src0 = (const uint32_t*)(src_argb);
+  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+  const int stride_byte = src_stepx * 4;
+  do {
+    size_t vl = __riscv_vsetvl_e32m4(w);
+    vuint32m4x2_t v_src0 = __riscv_vlsseg2e32_v_u32m4x2(src0, stride_byte, vl);
+    vuint32m4x2_t v_src1 = __riscv_vlsseg2e32_v_u32m4x2(src1, stride_byte, vl);
+    vuint32m4_t v_row0_low_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 0);
+    vuint32m4_t v_row0_high_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 1);
+    vuint32m4_t v_row1_low_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 0);
+    vuint32m4_t v_row1_high_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 1);
+    vuint8m4_t v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32);
+    vuint8m4_t v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32);
+    vuint8m4_t v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32);
+    vuint8m4_t v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32);
+    vuint16m8_t v_row0_sum =
+        __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4);
+    vuint16m8_t v_row1_sum =
+        __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4);
+    vuint16m8_t v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+    vuint8m4_t v_dst =
+        __riscv_vnclipu_wx_u8m4(v_sum, 2, __RISCV_VXRM_RNU, vl * 4);
+    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+    w -= vl;
+    src0 += vl * src_stepx;
+    src1 += vl * src_stepx;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#else
 void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
@@ -190,6 +289,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN2_RVV
 void ScaleRowDown2_RVV(const uint8_t* src_ptr,
@@ -212,6 +312,26 @@
 #endif
 
 #ifdef HAS_SCALEROWDOWN2LINEAR_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  size_t w = (size_t)dst_width;
+  (void)src_stride;
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4x2_t v_src = __riscv_vlseg2e8_v_u8m4x2(src_ptr, vl);
+    vuint8m4_t v_s0 = __riscv_vget_v_u8m4x2_u8m4(v_src, 0);
+    vuint8m4_t v_s1 = __riscv_vget_v_u8m4x2_u8m4(v_src, 1);
+    vuint8m4_t v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, __RISCV_VXRM_RNU, vl);
+    __riscv_vse8_v_u8m4(dst, v_dst, vl);
+    w -= vl;
+    src_ptr += 2 * vl;
+    dst += vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
@@ -234,8 +354,38 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN2BOX_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  size_t w = (size_t)dst_width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4x2_t v_s = __riscv_vlseg2e8_v_u8m4x2(s, vl);
+    vuint8m4x2_t v_t = __riscv_vlseg2e8_v_u8m4x2(t, vl);
+    vuint8m4_t v_s0 = __riscv_vget_v_u8m4x2_u8m4(v_s, 0);
+    vuint8m4_t v_s1 = __riscv_vget_v_u8m4x2_u8m4(v_s, 1);
+    vuint8m4_t v_t0 = __riscv_vget_v_u8m4x2_u8m4(v_t, 0);
+    vuint8m4_t v_t1 = __riscv_vget_v_u8m4x2_u8m4(v_t, 1);
+    vuint16m8_t v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl);
+    vuint16m8_t v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl);
+    vuint16m8_t v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl);
+    // Use round-to-nearest-up mode for vnclip
+    vuint8m4_t v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, __RISCV_VXRM_RNU, vl);
+    __riscv_vse8_v_u8m4(dst, v_dst, vl);
+    w -= vl;
+    s += 2 * vl;
+    t += 2 * vl;
+    dst += vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
@@ -266,8 +416,27 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN4_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleRowDown4_RVV(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_ptr,
+                       int dst_width) {
+  size_t w = (size_t)dst_width;
+  (void)src_stride;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl);
+    vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2);
+    __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl);
+    w -= vl;
+    src_ptr += (4 * vl);
+    dst_ptr += vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown4_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
@@ -285,8 +454,70 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN4BOX_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  size_t w = (size_t)dst_width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl);
+    vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0);
+    vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1);
+    vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2);
+    vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3);
+    vuint16m4_t v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl);
+    vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(src_ptr1, vl);
+    vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0);
+    vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1);
+    vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2);
+    vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3);
+    vuint16m4_t v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl);
+    vuint8m2x4_t v_u = __riscv_vlseg4e8_v_u8m2x4(src_ptr2, vl);
+    vuint8m2_t v_u0 = __riscv_vget_v_u8m2x4_u8m2(v_u, 0);
+    vuint8m2_t v_u1 = __riscv_vget_v_u8m2x4_u8m2(v_u, 1);
+    vuint8m2_t v_u2 = __riscv_vget_v_u8m2x4_u8m2(v_u, 2);
+    vuint8m2_t v_u3 = __riscv_vget_v_u8m2x4_u8m2(v_u, 3);
+    vuint16m4_t v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl);
+    vuint16m4_t v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl);
+    vuint16m4_t v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl);
+    vuint16m4_t v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl);
+    vuint16m4_t v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl);
+    vuint16m4_t v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl);
+    vuint8m2x4_t v_v = __riscv_vlseg4e8_v_u8m2x4(src_ptr3, vl);
+    vuint8m2_t v_v0 = __riscv_vget_v_u8m2x4_u8m2(v_v, 0);
+    vuint8m2_t v_v1 = __riscv_vget_v_u8m2x4_u8m2(v_v, 1);
+    vuint8m2_t v_v2 = __riscv_vget_v_u8m2x4_u8m2(v_v, 2);
+    vuint8m2_t v_v3 = __riscv_vget_v_u8m2x4_u8m2(v_v, 3);
+
+    vuint16m4_t v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl);
+    vuint16m4_t v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl);
+
+    vuint16m4_t v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl);
+    vuint16m4_t v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl);
+
+    vuint16m4_t v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl);
+    vuint16m4_t v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl);
+    vuint16m4_t v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl);
+    vuint8m2_t v_dst =
+        __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, __RISCV_VXRM_RNU, vl);
+    __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl);
+    w -= vl;
+    src_ptr += 4 * vl;
+    src_ptr1 += 4 * vl;
+    src_ptr2 += 4 * vl;
+    src_ptr3 += 4 * vl;
+    dst_ptr += vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
@@ -348,8 +579,29 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN34_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl);
+    vuint8m2_t v_0 = __riscv_vget_v_u8m2x4_u8m2(v_src, 0);
+    vuint8m2_t v_1 = __riscv_vget_v_u8m2x4_u8m2(v_src, 1);
+    vuint8m2_t v_3 = __riscv_vget_v_u8m2x4_u8m2(v_src, 3);
+    vuint8m2x3_t v_dst = __riscv_vcreate_v_u8m2x3(v_0, v_1, v_3);
+    __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl);
+    w -= vl;
+    src_ptr += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown34_RVV(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
@@ -366,8 +618,77 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN34_0_BOX_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  do {
+    vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
+    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+    vuint16m4_t v_u1_u16;
+    vuint8m2_t v_a0, v_a1, v_a2;
+    vuint8m2x3_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(s, vl);
+    vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0);
+    vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1);
+    vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2);
+    vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3);
+
+    if (src_stride == 0) {
+      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl);
+      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl);
+    } else {
+      vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(t, vl);
+      vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0);
+      vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1);
+      vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2);
+      vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3);
+      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl);
+      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl);
+      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl);
+      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl);
+      t += 4 * vl;
+    }
+
+    v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl);
+    v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl);
+    v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl);
+    v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl);
+
+    v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, __RISCV_VXRM_RNU, vl);
+    v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, __RISCV_VXRM_RNU, vl);
+    v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, __RISCV_VXRM_RNU, vl);
+    v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, __RISCV_VXRM_RNU, vl);
+    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl);
+    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl);
+    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+    v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, __RISCV_VXRM_RNU, vl);
+    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl);
+    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl);
+
+    v_dst = __riscv_vcreate_v_u8m2x3(v_a0, v_a1, v_a2);
+    __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl);
+
+    w -= vl;
+    s += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
@@ -434,8 +755,69 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN34_1_BOX_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  do {
+    vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
+    vuint16m4_t v_u1_u16;
+    vuint8m2_t v_a0, v_a1, v_a2;
+    vuint8m2x3_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(s, vl);
+    vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0);
+    vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1);
+    vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2);
+    vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3);
+
+    // Use round-to-nearest-up mode for vnclip & averaging add
+    if (src_stride == 0) {
+      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, __RISCV_VXRM_RNU, vl);
+      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, __RISCV_VXRM_RNU, vl);
+      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, __RISCV_VXRM_RNU, vl);
+      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, __RISCV_VXRM_RNU, vl);
+    } else {
+      vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(t, vl);
+      vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0);
+      vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1);
+      vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2);
+      vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3);
+      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, __RISCV_VXRM_RNU, vl);
+      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, __RISCV_VXRM_RNU, vl);
+      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, __RISCV_VXRM_RNU, vl);
+      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, __RISCV_VXRM_RNU, vl);
+      t += 4 * vl;
+    }
+    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl);
+    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl);
+
+    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+    v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, __RISCV_VXRM_RNU, vl);
+
+    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl);
+    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl);
+
+    v_dst = __riscv_vcreate_v_u8m2x3(v_a0, v_a1, v_a2);
+    __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl);
+
+    w -= vl;
+    s += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
@@ -490,8 +872,31 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN38_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  (void)src_stride;
+  assert(dst_width % 3 == 0);
+  do {
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    vuint8m1x8_t v_src = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl);
+    vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_src, 0);
+    vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_src, 3);
+    vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_src, 6);
+    vuint8m1x3_t v_dst = __riscv_vcreate_v_u8m1x3(v_s0, v_s3, v_s6);
+    __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown38_RVV(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
@@ -511,8 +916,77 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN38_2_BOX_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint16_t coeff_a = (65536u / 6u);
+  const uint16_t coeff_b = (65536u / 4u);
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  do {
+    vuint16m2_t v_e, v_f, v_g;
+    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+    vuint8m1x3_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    // s: e00, e10, e20, f00, f10, f20, g00, g10
+    vuint8m1x8_t v_s = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl);
+    vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_s, 0);
+    vuint8m1_t v_s1 = __riscv_vget_v_u8m1x8_u8m1(v_s, 1);
+    vuint8m1_t v_s2 = __riscv_vget_v_u8m1x8_u8m1(v_s, 2);
+    vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_s, 3);
+    vuint8m1_t v_s4 = __riscv_vget_v_u8m1x8_u8m1(v_s, 4);
+    vuint8m1_t v_s5 = __riscv_vget_v_u8m1x8_u8m1(v_s, 5);
+    vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_s, 6);
+    vuint8m1_t v_s7 = __riscv_vget_v_u8m1x8_u8m1(v_s, 7);
+    // t: e01, e11, e21, f01, f11, f21, g01, g11
+    vuint8m1x8_t v_t = __riscv_vlseg8e8_v_u8m1x8(src_ptr + src_stride, vl);
+    vuint8m1_t v_t0 = __riscv_vget_v_u8m1x8_u8m1(v_t, 0);
+    vuint8m1_t v_t1 = __riscv_vget_v_u8m1x8_u8m1(v_t, 1);
+    vuint8m1_t v_t2 = __riscv_vget_v_u8m1x8_u8m1(v_t, 2);
+    vuint8m1_t v_t3 = __riscv_vget_v_u8m1x8_u8m1(v_t, 3);
+    vuint8m1_t v_t4 = __riscv_vget_v_u8m1x8_u8m1(v_t, 4);
+    vuint8m1_t v_t5 = __riscv_vget_v_u8m1x8_u8m1(v_t, 5);
+    vuint8m1_t v_t6 = __riscv_vget_v_u8m1x8_u8m1(v_t, 6);
+    vuint8m1_t v_t7 = __riscv_vget_v_u8m1x8_u8m1(v_t, 7);
+    // Calculate sum of [e00, e21] to v_e
+    // Calculate sum of [f00, f21] to v_f
+    // Calculate sum of [g00, g11] to v_g
+    vuint16m2_t v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+    vuint16m2_t v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+    vuint16m2_t v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+    vuint16m2_t v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+    vuint16m2_t v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+    vuint16m2_t v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+    vuint16m2_t v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+    vuint16m2_t v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+
+    // Average in 16-bit fixed-point
+    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+
+    v_dst = __riscv_vcreate_v_u8m1x3(v_dst_e, v_dst_f, v_dst_g);
+    __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
@@ -569,8 +1043,101 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWDOWN38_3_BOX_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint16_t coeff_a = (65536u / 9u);
+  const uint16_t coeff_b = (65536u / 6u);
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  do {
+    vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
+    vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
+    vuint16m2_t v_g0, v_g1, v_g2, v_g;
+    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+    vuint8m1x3_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    // s: e00, e10, e20, f00, f10, f20, g00, g10
+    vuint8m1x8_t v_s = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl);
+    vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_s, 0);
+    vuint8m1_t v_s1 = __riscv_vget_v_u8m1x8_u8m1(v_s, 1);
+    vuint8m1_t v_s2 = __riscv_vget_v_u8m1x8_u8m1(v_s, 2);
+    vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_s, 3);
+    vuint8m1_t v_s4 = __riscv_vget_v_u8m1x8_u8m1(v_s, 4);
+    vuint8m1_t v_s5 = __riscv_vget_v_u8m1x8_u8m1(v_s, 5);
+    vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_s, 6);
+    vuint8m1_t v_s7 = __riscv_vget_v_u8m1x8_u8m1(v_s, 7);
+    // t: e01, e11, e21, f01, f11, f21, g01, g11
+    vuint8m1x8_t v_t = __riscv_vlseg8e8_v_u8m1x8(src_ptr + src_stride, vl);
+    vuint8m1_t v_t0 = __riscv_vget_v_u8m1x8_u8m1(v_t, 0);
+    vuint8m1_t v_t1 = __riscv_vget_v_u8m1x8_u8m1(v_t, 1);
+    vuint8m1_t v_t2 = __riscv_vget_v_u8m1x8_u8m1(v_t, 2);
+    vuint8m1_t v_t3 = __riscv_vget_v_u8m1x8_u8m1(v_t, 3);
+    vuint8m1_t v_t4 = __riscv_vget_v_u8m1x8_u8m1(v_t, 4);
+    vuint8m1_t v_t5 = __riscv_vget_v_u8m1x8_u8m1(v_t, 5);
+    vuint8m1_t v_t6 = __riscv_vget_v_u8m1x8_u8m1(v_t, 6);
+    vuint8m1_t v_t7 = __riscv_vget_v_u8m1x8_u8m1(v_t, 7);
+    // u: e02, e12, e22, f02, f12, f22, g02, g12
+    vuint8m1x8_t v_u = __riscv_vlseg8e8_v_u8m1x8(src_ptr + 2 * src_stride, vl);
+    vuint8m1_t v_u0 = __riscv_vget_v_u8m1x8_u8m1(v_u, 0);
+    vuint8m1_t v_u1 = __riscv_vget_v_u8m1x8_u8m1(v_u, 1);
+    vuint8m1_t v_u2 = __riscv_vget_v_u8m1x8_u8m1(v_u, 2);
+    vuint8m1_t v_u3 = __riscv_vget_v_u8m1x8_u8m1(v_u, 3);
+    vuint8m1_t v_u4 = __riscv_vget_v_u8m1x8_u8m1(v_u, 4);
+    vuint8m1_t v_u5 = __riscv_vget_v_u8m1x8_u8m1(v_u, 5);
+    vuint8m1_t v_u6 = __riscv_vget_v_u8m1x8_u8m1(v_u, 6);
+    vuint8m1_t v_u7 = __riscv_vget_v_u8m1x8_u8m1(v_u, 7);
+    // Calculate sum of [e00, e22]
+    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+    v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
+    v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);
+
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+    v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
+    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+    // Calculate sum of [f00, f22]
+    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+    v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
+    v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);
+
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+    v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
+    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+    // Calculate sum of [g00, g12]
+    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+    v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);
+
+    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+    v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);
+
+    // Average in 16-bit fixed-point
+    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+
+    v_dst = __riscv_vcreate_v_u8m1x3(v_dst_e, v_dst_f, v_dst_g);
+    __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#else
 void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
@@ -642,12 +1209,50 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 // ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms'
 // ScaleRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. Other
 // platforms only implement non-edge part of image and process edge with scalar.
 
 #ifdef HAS_SCALEROWUP2_LINEAR_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  size_t work_width = (size_t)dst_width - 1u;
+  size_t src_width = work_width >> 1u;
+  const uint8_t* work_src_ptr = src_ptr;
+  uint8_t* work_dst_ptr = dst_ptr + 1;
+  size_t vl = __riscv_vsetvlmax_e8m4();
+  vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl);
+  dst_ptr[0] = src_ptr[0];
+  while (src_width > 0) {
+    vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even;
+    vuint16m8_t v_src0_u16, v_src1_u16;
+    vuint8m4x2_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m4(src_width);
+    v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+    v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl);
+
+    v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl);
+    v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl);
+    v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl);
+    v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl);
+
+    v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl);
+    v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl);
+
+    v_dst = __riscv_vcreate_v_u8m4x2(v_dst_even, v_dst_odd);
+    __riscv_vsseg2e8_v_u8m4x2(work_dst_ptr, v_dst, vl);
+
+    src_width -= vl;
+    work_src_ptr += vl;
+    work_dst_ptr += 2 * vl;
+  }
+  dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];
+}
+#else
 void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
@@ -682,8 +1287,82 @@
   dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEROWUP2_BILINEAR_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              ptrdiff_t dst_stride,
+                              int dst_width) {
+  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+  size_t src_width = work_width >> 1u;
+  const uint8_t* work_s = src_ptr;
+  const uint8_t* work_t = src_ptr + src_stride;
+  const uint8_t* s = work_s;
+  const uint8_t* t = work_t;
+  uint8_t* d = dst_ptr;
+  uint8_t* e = dst_ptr + dst_stride;
+  uint8_t* work_d = d + 1;
+  uint8_t* work_e = e + 1;
+  size_t vl = __riscv_vsetvlmax_e16m4();
+  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+  d[0] = (3 * s[0] + t[0] + 2) >> 2;
+  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+  while (src_width > 0) {
+    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+    vuint16m4_t v_t0_u16_, v_t1_u16_;
+    vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+    vuint8m2x2_t v_dst0, v_dst1;
+    size_t vl = __riscv_vsetvl_e8m2(src_width);
+    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+    v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl);
+
+    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+    v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl);
+
+    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+    v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+    v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+    v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+    v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+    v_dst0 = __riscv_vcreate_v_u8m2x2(v_dst0_even, v_dst0_odd);
+    __riscv_vsseg2e8_v_u8m2x2(work_d, v_dst0, vl);
+    v_dst1 = __riscv_vcreate_v_u8m2x2(v_dst1_even, v_dst1_odd);
+    __riscv_vsseg2e8_v_u8m2x2(work_e, v_dst1, vl);
+    src_width -= vl;
+    work_s += vl;
+    work_t += vl;
+    work_d += 2 * vl;
+    work_e += 2 * vl;
+  }
+  d[dst_width - 1] =
+      (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2;
+  e[dst_width - 1] =
+      (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2;
+}
+#else
 void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
@@ -754,6 +1433,7 @@
       (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2;
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEUVROWDOWN2_RVV
 void ScaleUVRowDown2_RVV(const uint8_t* src_uv,
@@ -777,6 +1457,30 @@
 #endif
 
 #ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_uv,
+                               int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint16_t* src = (const uint16_t*)src_uv;
+  (void)src_stride;
+  do {
+    size_t vl = __riscv_vsetvl_e16m4(w);
+    vuint16m4x2_t v_src = __riscv_vlseg2e16_v_u16m4x2(src, vl);
+    vuint16m4_t v_u0v0_16 = __riscv_vget_v_u16m4x2_u16m4(v_src, 0);
+    vuint16m4_t v_u1v1_16 = __riscv_vget_v_u16m4x2_u16m4(v_src, 1);
+    vuint8m4_t v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
+    vuint8m4_t v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
+    vuint8m4_t v_avg =
+        __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, __RISCV_VXRM_RNU, vl * 2);
+    __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2);
+    w -= vl;
+    src += vl * 2;
+    dst_uv += vl * 2;
+  } while (w > 0);
+}
+#else
 void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
                                ptrdiff_t src_stride,
                                uint8_t* dst_uv,
@@ -803,8 +1507,50 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEUVROWDOWN2BOX_RVV
+#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG)
+void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_uv,
+                            int dst_width) {
+  const uint8_t* src_uv_row1 = src_uv + src_stride;
+  size_t w = (size_t)dst_width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_uv, vl);
+    vuint8m2_t v_u0_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0);
+    vuint8m2_t v_v0_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1);
+    vuint8m2_t v_u1_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2);
+    vuint8m2_t v_v1_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3);
+    vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(src_uv_row1, vl);
+    vuint8m2_t v_u0_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0);
+    vuint8m2_t v_v0_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1);
+    vuint8m2_t v_u1_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2);
+    vuint8m2_t v_v1_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3);
+
+    vuint16m4_t v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl);
+    vuint16m4_t v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl);
+    vuint16m4_t v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl);
+    vuint16m4_t v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl);
+    vuint16m4_t v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl);
+    vuint16m4_t v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl);
+    vuint8m2_t v_dst_u =
+        __riscv_vnclipu_wx_u8m2(v_sum0, 2, __RISCV_VXRM_RNU, vl);
+    vuint8m2_t v_dst_v =
+        __riscv_vnclipu_wx_u8m2(v_sum1, 2, __RISCV_VXRM_RNU, vl);
+
+    vuint8m2x2_t v_dst_uv = __riscv_vcreate_v_u8m2x2(v_dst_u, v_dst_v);
+    __riscv_vsseg2e8_v_u8m2x2(dst_uv, v_dst_uv, vl);
+
+    dst_uv += 2 * vl;
+    src_uv += 4 * vl;
+    w -= vl;
+    src_uv_row1 += 4 * vl;
+  } while (w > 0);
+}
+#else
 void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
                             ptrdiff_t src_stride,
                             uint8_t* dst_uv,
@@ -847,6 +1593,7 @@
   } while (w > 0);
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEUVROWDOWN4_RVV
 void ScaleUVRowDown4_RVV(const uint8_t* src_uv,
@@ -903,6 +1650,49 @@
 // scalar.
 
 #ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+  uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1;
+  const uint8_t* work_src_ptr = src_ptr;
+  size_t vl = __riscv_vsetvlmax_e8m4();
+  vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl);
+  dst_ptr[0] = src_ptr[0];
+  dst_ptr[1] = src_ptr[1];
+  while (work_width > 0) {
+    vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8;
+    vuint16m4_t v_dst_odd, v_dst_even;
+    vuint16m8_t v_uv0_u16, v_uv1_u16;
+    vuint16m4x2_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m4(work_width);
+    v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+    v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl);
+
+    v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl);
+    v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl);
+
+    v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl);
+    v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl);
+
+    v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl);
+    v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl);
+
+    v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8);
+    v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8);
+
+    v_dst = __riscv_vcreate_v_u16m4x2(v_dst_even, v_dst_odd);
+    __riscv_vsseg2e16_v_u16m4x2(work_dst_ptr, v_dst, vl / 2);
+
+    work_width -= vl;
+    work_src_ptr += vl;
+    work_dst_ptr += vl;
+  }
+  dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2];
+  dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1];
+}
+#else
 void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
@@ -943,8 +1733,98 @@
   dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1];
 }
 #endif
+#endif
 
 #ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
+#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
+void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                ptrdiff_t dst_stride,
+                                int dst_width) {
+  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+  const uint8_t* work_s = src_ptr;
+  const uint8_t* work_t = src_ptr + src_stride;
+  const uint8_t* s = work_s;
+  const uint8_t* t = work_t;
+  uint8_t* d = dst_ptr;
+  uint8_t* e = dst_ptr + dst_stride;
+  uint16_t* work_d = (uint16_t*)d + 1;
+  uint16_t* work_e = (uint16_t*)e + 1;
+  size_t vl = __riscv_vsetvlmax_e16m4();
+  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+  d[0] = (3 * s[0] + t[0] + 2) >> 2;
+  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+  d[1] = (3 * s[1] + t[1] + 2) >> 2;
+  e[1] = (s[1] + 3 * t[1] + 2) >> 2;
+  while (work_width > 0) {
+    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+    vuint16m4_t v_t0_u16_, v_t1_u16_;
+    vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8;
+    vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+    vuint16m2x2_t v_dst0, v_dst1;
+    size_t vl = __riscv_vsetvl_e8m2(work_width);
+    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+    v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl);
+
+    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+    v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl);
+
+    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+    v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+    v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+    v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+    v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+    v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8);
+    v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8);
+    v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8);
+    v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8);
+
+    v_dst0 = __riscv_vcreate_v_u16m2x2(v_dst0_even, v_dst0_odd);
+    __riscv_vsseg2e16_v_u16m2x2(work_d, v_dst0, vl / 2);
+    v_dst1 = __riscv_vcreate_v_u16m2x2(v_dst1_even, v_dst1_odd);
+    __riscv_vsseg2e16_v_u16m2x2(work_e, v_dst1, vl / 2);
+
+    work_width -= vl;
+    work_s += vl;
+    work_t += vl;
+    work_d += vl;
+    work_e += vl;
+  }
+  d[2 * dst_width - 2] =
+      (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >>
+      2;
+  e[2 * dst_width - 2] =
+      (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >>
+      2;
+  d[2 * dst_width - 1] =
+      (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >>
+      2;
+  e[2 * dst_width - 1] =
+      (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >>
+      2;
+}
+#else
 void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
@@ -1030,6 +1910,7 @@
       2;
 }
 #endif
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"