aboutsummaryrefslogtreecommitdiffstats
path: root/src/main/jni/libyuv/source
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/jni/libyuv/source')
-rw-r--r--src/main/jni/libyuv/source/compare.cc325
-rw-r--r--src/main/jni/libyuv/source/compare_common.cc42
-rw-r--r--src/main/jni/libyuv/source/compare_neon.cc103
-rw-r--r--src/main/jni/libyuv/source/compare_posix.cc158
-rw-r--r--src/main/jni/libyuv/source/compare_win.cc232
-rw-r--r--src/main/jni/libyuv/source/convert.cc1543
-rw-r--r--src/main/jni/libyuv/source/convert_argb.cc938
-rw-r--r--src/main/jni/libyuv/source/convert_from.cc1210
-rw-r--r--src/main/jni/libyuv/source/convert_from_argb.cc1133
-rw-r--r--src/main/jni/libyuv/source/convert_jpeg.cc392
-rw-r--r--src/main/jni/libyuv/source/convert_to_argb.cc327
-rw-r--r--src/main/jni/libyuv/source/convert_to_i420.cc383
-rw-r--r--src/main/jni/libyuv/source/cpu_id.cc293
-rw-r--r--src/main/jni/libyuv/source/format_conversion.cc554
-rw-r--r--src/main/jni/libyuv/source/mjpeg_decoder.cc566
-rw-r--r--src/main/jni/libyuv/source/mjpeg_validate.cc47
-rw-r--r--src/main/jni/libyuv/source/planar_functions.cc2291
-rw-r--r--src/main/jni/libyuv/source/rotate.cc1315
-rw-r--r--src/main/jni/libyuv/source/rotate_argb.cc209
-rw-r--r--src/main/jni/libyuv/source/rotate_mips.cc485
-rw-r--r--src/main/jni/libyuv/source/rotate_neon.cc533
-rw-r--r--src/main/jni/libyuv/source/rotate_neon64.cc540
-rw-r--r--src/main/jni/libyuv/source/row_any.cc602
-rw-r--r--src/main/jni/libyuv/source/row_common.cc2286
-rw-r--r--src/main/jni/libyuv/source/row_mips.cc994
-rw-r--r--src/main/jni/libyuv/source/row_neon.cc3148
-rw-r--r--src/main/jni/libyuv/source/row_neon64.cc3327
-rw-r--r--src/main/jni/libyuv/source/row_posix.cc6443
-rw-r--r--src/main/jni/libyuv/source/row_win.cc7402
-rw-r--r--src/main/jni/libyuv/source/row_x86.asm146
-rw-r--r--src/main/jni/libyuv/source/scale.cc1716
-rw-r--r--src/main/jni/libyuv/source/scale_argb.cc809
-rw-r--r--src/main/jni/libyuv/source/scale_common.cc1165
-rw-r--r--src/main/jni/libyuv/source/scale_mips.cc654
-rw-r--r--src/main/jni/libyuv/source/scale_neon.cc764
-rw-r--r--src/main/jni/libyuv/source/scale_neon64.cc789
-rw-r--r--src/main/jni/libyuv/source/scale_posix.cc1315
-rw-r--r--src/main/jni/libyuv/source/scale_win.cc1320
-rw-r--r--src/main/jni/libyuv/source/video_common.cc64
-rw-r--r--src/main/jni/libyuv/source/x86inc.asm1136
40 files changed, 0 insertions, 47699 deletions
diff --git a/src/main/jni/libyuv/source/compare.cc b/src/main/jni/libyuv/source/compare.cc
deleted file mode 100644
index dc715e019..000000000
--- a/src/main/jni/libyuv/source/compare.cc
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/compare.h"
-
-#include <float.h>
-#include <math.h>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
-
-// This module is for Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || \
- (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
-#define HAS_HASHDJB2_SSE41
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
-
-#if _MSC_VER >= 1700
-#define HAS_HASHDJB2_AVX2
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
-#endif
-
-#endif // HAS_HASHDJB2_SSE41
-
-// hash seed of 5381 recommended.
-LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
- const int kBlockSize = 1 << 15; // 32768;
- int remainder;
- uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
-#if defined(HAS_HASHDJB2_SSE41)
- if (TestCpuFlag(kCpuHasSSE41)) {
- HashDjb2_SSE = HashDjb2_SSE41;
- }
-#endif
-#if defined(HAS_HASHDJB2_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- HashDjb2_SSE = HashDjb2_AVX2;
- }
-#endif
-
- while (count >= (uint64)(kBlockSize)) {
- seed = HashDjb2_SSE(src, kBlockSize, seed);
- src += kBlockSize;
- count -= kBlockSize;
- }
- remainder = (int)(count) & ~15;
- if (remainder) {
- seed = HashDjb2_SSE(src, remainder, seed);
- src += remainder;
- count -= remainder;
- }
- remainder = (int)(count) & 15;
- if (remainder) {
- seed = HashDjb2_C(src, remainder, seed);
- }
- return seed;
-}
-
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SUMSQUAREERROR_NEON
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
-#endif
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_SUMSQUAREERROR_SSE2
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
-#endif
-// Visual C 2012 required for AVX2.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
-#define HAS_SUMSQUAREERROR_AVX2
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
-#endif
-
-// TODO(fbarchard): Refactor into row function.
-LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
- int count) {
- // SumSquareError returns values 0 to 65535 for each squared difference.
- // Up to 65536 of those can be summed and remain within a uint32.
- // After each block of 65536 pixels, accumulate into a uint64.
- const int kBlockSize = 65536;
- int remainder = count & (kBlockSize - 1) & ~31;
- uint64 sse = 0;
- int i;
- uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
- SumSquareError_C;
-#if defined(HAS_SUMSQUAREERROR_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SumSquareError = SumSquareError_NEON;
- }
-#endif
-#if defined(HAS_SUMSQUAREERROR_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
- // Note only used for multiples of 16 so count is not checked.
- SumSquareError = SumSquareError_SSE2;
- }
-#endif
-#if defined(HAS_SUMSQUAREERROR_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- // Note only used for multiples of 32 so count is not checked.
- SumSquareError = SumSquareError_AVX2;
- }
-#endif
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
-#endif
- for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
- sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
- }
- src_a += count & ~(kBlockSize - 1);
- src_b += count & ~(kBlockSize - 1);
- if (remainder) {
- sse += SumSquareError(src_a, src_b, remainder);
- src_a += remainder;
- src_b += remainder;
- }
- remainder = count & 31;
- if (remainder) {
- sse += SumSquareError_C(src_a, src_b, remainder);
- }
- return sse;
-}
-
-LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height) {
- uint64 sse = 0;
- int h;
- // Coalesce rows.
- if (stride_a == width &&
- stride_b == width) {
- width *= height;
- height = 1;
- stride_a = stride_b = 0;
- }
- for (h = 0; h < height; ++h) {
- sse += ComputeSumSquareError(src_a, src_b, width);
- src_a += stride_a;
- src_b += stride_b;
- }
- return sse;
-}
-
-LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
- double psnr;
- if (sse > 0) {
- double mse = (double)(count) / (double)(sse);
- psnr = 10.0 * log10(255.0 * 255.0 * mse);
- } else {
- psnr = kMaxPsnr; // Limit to prevent divide by 0
- }
-
- if (psnr > kMaxPsnr)
- psnr = kMaxPsnr;
-
- return psnr;
-}
-
-LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height) {
- const uint64 samples = width * height;
- const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
- src_b, stride_b,
- width, height);
- return SumSquareErrorToPsnr(sse, samples);
-}
-
-LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
- const uint8* src_u_a, int stride_u_a,
- const uint8* src_v_a, int stride_v_a,
- const uint8* src_y_b, int stride_y_b,
- const uint8* src_u_b, int stride_u_b,
- const uint8* src_v_b, int stride_v_b,
- int width, int height) {
- const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
- src_y_b, stride_y_b,
- width, height);
- const int width_uv = (width + 1) >> 1;
- const int height_uv = (height + 1) >> 1;
- const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
- src_u_b, stride_u_b,
- width_uv, height_uv);
- const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
- src_v_b, stride_v_b,
- width_uv, height_uv);
- const uint64 samples = width * height + 2 * (width_uv * height_uv);
- const uint64 sse = sse_y + sse_u + sse_v;
- return SumSquareErrorToPsnr(sse, samples);
-}
-
-static const int64 cc1 = 26634; // (64^2*(.01*255)^2
-static const int64 cc2 = 239708; // (64^2*(.03*255)^2
-
-static double Ssim8x8_C(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b) {
- int64 sum_a = 0;
- int64 sum_b = 0;
- int64 sum_sq_a = 0;
- int64 sum_sq_b = 0;
- int64 sum_axb = 0;
-
- int i;
- for (i = 0; i < 8; ++i) {
- int j;
- for (j = 0; j < 8; ++j) {
- sum_a += src_a[j];
- sum_b += src_b[j];
- sum_sq_a += src_a[j] * src_a[j];
- sum_sq_b += src_b[j] * src_b[j];
- sum_axb += src_a[j] * src_b[j];
- }
-
- src_a += stride_a;
- src_b += stride_b;
- }
-
- {
- const int64 count = 64;
- // scale the constants by number of pixels
- const int64 c1 = (cc1 * count * count) >> 12;
- const int64 c2 = (cc2 * count * count) >> 12;
-
- const int64 sum_a_x_sum_b = sum_a * sum_b;
-
- const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
- (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
-
- const int64 sum_a_sq = sum_a*sum_a;
- const int64 sum_b_sq = sum_b*sum_b;
-
- const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
- (count * sum_sq_a - sum_a_sq +
- count * sum_sq_b - sum_b_sq + c2);
-
- if (ssim_d == 0.0) {
- return DBL_MAX;
- }
- return ssim_n * 1.0 / ssim_d;
- }
-}
-
-// We are using a 8x8 moving window with starting location of each 8x8 window
-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
-// block boundaries to penalize blocking artifacts.
-LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height) {
- int samples = 0;
- double ssim_total = 0;
- double (*Ssim8x8)(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b) = Ssim8x8_C;
-
- // sample point start with each 4x4 location
- int i;
- for (i = 0; i < height - 8; i += 4) {
- int j;
- for (j = 0; j < width - 8; j += 4) {
- ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
- samples++;
- }
-
- src_a += stride_a * 4;
- src_b += stride_b * 4;
- }
-
- ssim_total /= samples;
- return ssim_total;
-}
-
-LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
- const uint8* src_u_a, int stride_u_a,
- const uint8* src_v_a, int stride_v_a,
- const uint8* src_y_b, int stride_y_b,
- const uint8* src_u_b, int stride_u_b,
- const uint8* src_v_b, int stride_v_b,
- int width, int height) {
- const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
- src_y_b, stride_y_b, width, height);
- const int width_uv = (width + 1) >> 1;
- const int height_uv = (height + 1) >> 1;
- const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
- src_u_b, stride_u_b,
- width_uv, height_uv);
- const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
- src_v_b, stride_v_b,
- width_uv, height_uv);
- return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/compare_common.cc b/src/main/jni/libyuv/source/compare_common.cc
deleted file mode 100644
index c546b5182..000000000
--- a/src/main/jni/libyuv/source/compare_common.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse = 0u;
- int i;
- for (i = 0; i < count; ++i) {
- int diff = src_a[i] - src_b[i];
- sse += (uint32)(diff * diff);
- }
- return sse;
-}
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
- uint32 hash = seed;
- int i;
- for (i = 0; i < count; ++i) {
- hash += (hash << 5) + src[i];
- }
- return hash;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/compare_neon.cc b/src/main/jni/libyuv/source/compare_neon.cc
deleted file mode 100644
index 55052c0ee..000000000
--- a/src/main/jni/libyuv/source/compare_neon.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n"
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
-
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
- return sse;
-}
-
-#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n"
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %2, %2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "bgt 1b \n"
-
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
- return sse;
-}
-
-#endif // __ARM_NEON__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/compare_posix.cc b/src/main/jni/libyuv/source/compare_posix.cc
deleted file mode 100644
index ac361190e..000000000
--- a/src/main/jni/libyuv/source/compare_posix.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse;
- asm volatile ( // NOLINT
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
- "lea " MEMLEA(0x10, 0) ",%0 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
- "lea " MEMLEA(0x10, 1) ",%1 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "jg 1b \n"
-
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
-
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- ); // NOLINT
- return sse;
-}
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-#define HAS_HASHDJB2_SSE41
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-static uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- uint32 hash;
- asm volatile ( // NOLINT
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "lea " MEMLEA(0x10, 0) ",%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "sub $0x10,%1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- ); // NOLINT
- return hash;
-}
-#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
diff --git a/src/main/jni/libyuv/source/compare_win.cc b/src/main/jni/libyuv/source/compare_win.cc
deleted file mode 100644
index 99831651f..000000000
--- a/src/main/jni/libyuv/source/compare_win.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- pxor xmm0, xmm0
- pxor xmm5, xmm5
-
- align 4
- wloop:
- movdqa xmm1, [eax]
- lea eax, [eax + 16]
- movdqa xmm2, [edx]
- lea edx, [edx + 16]
- sub ecx, 16
- movdqa xmm3, xmm1 // abs trick
- psubusb xmm1, xmm2
- psubusb xmm2, xmm3
- por xmm1, xmm2
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm0, xmm1
- paddd xmm0, xmm2
- jg wloop
-
- pshufd xmm1, xmm0, 0xee
- paddd xmm0, xmm1
- pshufd xmm1, xmm0, 0x01
- paddd xmm0, xmm1
- movd eax, xmm0
- ret
- }
-}
-
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- vpxor ymm0, ymm0, ymm0 // sum
- vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
- sub edx, eax
-
- align 4
- wloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + edx]
- lea eax, [eax + 32]
- sub ecx, 32
- vpsubusb ymm3, ymm1, ymm2 // abs difference trick
- vpsubusb ymm2, ymm2, ymm1
- vpor ymm1, ymm2, ymm3
- vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
- vpunpckhbw ymm1, ymm1, ymm5
- vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
- vpmaddwd ymm1, ymm1, ymm1
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm0, ymm0, ymm2
- jg wloop
-
- vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpermq ymm1, ymm0, 0x02 // high + low lane.
- vpaddd ymm0, ymm0, ymm1
- vmovd eax, xmm0
- vzeroupper
- ret
- }
-}
-#endif // _MSC_VER >= 1700
-
-#define HAS_HASHDJB2_SSE41
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-static uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
-// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
-// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
-// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
-// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
- _asm _emit 0x40 _asm _emit reg
-
-__declspec(naked) __declspec(align(16))
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- movd xmm0, [esp + 12] // seed
-
- pxor xmm7, xmm7 // constant 0 for unpck
- movdqa xmm6, kHash16x33
-
- align 4
- wloop:
- movdqu xmm1, [eax] // src[0-15]
- lea eax, [eax + 16]
- pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
- movdqa xmm5, kHashMul0
- movdqa xmm2, xmm1
- punpcklbw xmm2, xmm7 // src[0-7]
- movdqa xmm3, xmm2
- punpcklwd xmm3, xmm7 // src[0-3]
- pmulld(0xdd) // pmulld xmm3, xmm5
- movdqa xmm5, kHashMul1
- movdqa xmm4, xmm2
- punpckhwd xmm4, xmm7 // src[4-7]
- pmulld(0xe5) // pmulld xmm4, xmm5
- movdqa xmm5, kHashMul2
- punpckhbw xmm1, xmm7 // src[8-15]
- movdqa xmm2, xmm1
- punpcklwd xmm2, xmm7 // src[8-11]
- pmulld(0xd5) // pmulld xmm2, xmm5
- movdqa xmm5, kHashMul3
- punpckhwd xmm1, xmm7 // src[12-15]
- pmulld(0xcd) // pmulld xmm1, xmm5
- paddd xmm3, xmm4 // add 16 results
- paddd xmm1, xmm2
- sub ecx, 16
- paddd xmm1, xmm3
-
- pshufd xmm2, xmm1, 0x0e // upper 2 dwords
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0x01
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- jg wloop
-
- movd eax, xmm0 // return hash
- ret
- }
-}
-
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
-__declspec(naked) __declspec(align(16))
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- movd xmm0, [esp + 12] // seed
- movdqa xmm6, kHash16x33
-
- align 4
- wloop:
- vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
- pmulld xmm0, xmm6 // hash *= 33 ^ 16
- vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
- pmulld xmm3, kHashMul0
- vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
- pmulld xmm4, kHashMul1
- vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
- pmulld xmm2, kHashMul2
- lea eax, [eax + 16]
- pmulld xmm1, kHashMul3
- paddd xmm3, xmm4 // add 16 results
- paddd xmm1, xmm2
- sub ecx, 16
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 0x0e // upper 2 dwords
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0x01
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- jg wloop
-
- movd eax, xmm0 // return hash
- ret
- }
-}
-#endif // _MSC_VER >= 1700
-
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/convert.cc b/src/main/jni/libyuv/source/convert.cc
deleted file mode 100644
index c31ecf263..000000000
--- a/src/main/jni/libyuv/source/convert.cc
+++ /dev/null
@@ -1,1543 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/scale.h" // For ScalePlane()
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
-// Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int src_y_width, int src_y_height,
- int src_uv_width, int src_uv_height) {
- const int dst_y_width = Abs(src_y_width);
- const int dst_y_height = Abs(src_y_height);
- const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
- const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
- if (src_y_width == 0 || src_y_height == 0 ||
- src_uv_width == 0 || src_uv_height == 0) {
- return -1;
- }
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
- dst_y, dst_stride_y, dst_y_width, dst_y_height,
- kFilterBilinear);
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
- dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
- kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
- dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
- kFilterBilinear);
- return 0;
-}
-
-// Copy I420 with optional flipping
-// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
-// is does row coalescing.
-LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- // Copy UV planes.
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
- return 0;
-}
-
-// 422 chroma is 1/2 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- const int src_uv_width = SUBSAMPLE(width, 1, 1);
- return I4xxToI420(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- src_uv_width, height);
-}
-
-// 444 chroma is 1x width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return I4xxToI420(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- width, height);
-}
-
-// 411 chroma is 1/4 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- const int src_uv_width = SUBSAMPLE(width, 3, 2);
- return I4xxToI420(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- src_uv_width, height);
-}
-
-// I400 is greyscale typically used in MJPG
-LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
- SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
- return 0;
-}
-
-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
- uint8* dst, int dst_stride,
- int width, int height) {
- int y;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) &&
- IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride, width);
- src += src_stride_0 + src_stride_1;
- dst += dst_stride * 2;
- }
- if (height & 1) {
- CopyRow(src, dst, width);
- }
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to
-// this as well as the two Y planes.
-static int X420ToI420(const uint8* src_y,
- int src_stride_y0, int src_stride_y1,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
- SplitUVRow_C;
- if (!src_y || !src_uv ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- // Coalesce rows.
- if (src_stride_y0 == width &&
- src_stride_y1 == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
- }
- // Coalesce rows.
- if (src_stride_uv == halfwidth * 2 &&
- dst_stride_u == halfwidth &&
- dst_stride_v == halfwidth) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_uv = dst_stride_u = dst_stride_v = 0;
- }
-#if defined(HAS_SPLITUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
- SplitUVRow = SplitUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- SplitUVRow = SplitUVRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
- SplitUVRow = SplitUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- SplitUVRow = SplitUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
- SplitUVRow = SplitUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
- SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
- if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
- if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
- IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
- IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
- SplitUVRow = SplitUVRow_MIPS_DSPR2;
- }
- }
- }
-#endif
-
- if (dst_y) {
- if (src_stride_y0 == src_stride_y1) {
- CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
- } else {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- }
- }
-
- for (y = 0; y < halfheight; ++y) {
- // Copy a row of UV.
- SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- src_uv += src_stride_uv;
- }
- return 0;
-}
-
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y,
- src_uv, src_stride_uv,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
-LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_vu, int src_stride_vu,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y,
- src_vu, src_stride_vu,
- dst_y, dst_stride_y,
- dst_v, dst_stride_v,
- dst_u, dst_stride_u,
- width, height);
-}
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- int halfheight;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
- void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) = YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
- YUY2ToYRow_C;
- if (!src_y || !src_yuy2 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
- YUY2ToYRow = YUY2ToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
- YUY2ToYRow = YUY2ToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUV422Row = YUY2ToUV422Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
-
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- CopyRow(src_y, dst_y, width);
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- }
- return 0;
-}
-
-// Convert YUY2 to I420.
-LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2,
- uint8* dst_y, int pix) = YUY2ToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
- }
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
- YUY2ToYRow = YUY2ToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToUVRow = YUY2ToUVRow_AVX2;
- YUY2ToYRow = YUY2ToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
- }
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUVRow = YUY2ToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
- src_yuy2 += src_stride_yuy2 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- }
- return 0;
-}
-
-// Convert UYVY to I420.
-LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int pix) = UYVYToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
-#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- UYVYToUVRow = UYVYToUVRow_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- UYVYToUVRow = UYVYToUVRow_Any_AVX2;
- UYVYToYRow = UYVYToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- UYVYToUVRow = UYVYToUVRow_AVX2;
- UYVYToYRow = UYVYToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width >= 16) {
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- }
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_NEON;
- UYVYToUVRow = UYVYToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
- UYVYToYRow(src_uyvy, dst_y, width);
- UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
- src_uyvy += src_stride_uyvy * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
- UYVYToYRow(src_uyvy, dst_y, width);
- }
- return 0;
-}
-
-// Convert ARGB to I420.
-LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- if (!src_argb ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- }
- return 0;
-}
-
-// Convert BGRA to I420.
-LIBYUV_API
-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
- void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
- BGRAToYRow_C;
- if (!src_bgra ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_bgra = src_bgra + (height - 1) * src_stride_bgra;
- src_stride_bgra = -src_stride_bgra;
- }
-#if defined(HAS_BGRATOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
- BGRAToYRow = BGRAToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
- BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
- BGRAToUVRow = BGRAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- BGRAToYRow = BGRAToYRow_SSSE3;
- }
- }
- }
- }
-#elif defined(HAS_BGRATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- BGRAToYRow = BGRAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- BGRAToYRow = BGRAToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- BGRAToUVRow = BGRAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
- BGRAToYRow(src_bgra, dst_y, width);
- BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
- src_bgra += src_stride_bgra * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
- BGRAToYRow(src_bgra, dst_y, width);
- }
- return 0;
-}
-
-// Convert ABGR to I420.
-LIBYUV_API
-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
- void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
- ABGRToYRow_C;
- if (!src_abgr ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_abgr = src_abgr + (height - 1) * src_stride_abgr;
- src_stride_abgr = -src_stride_abgr;
- }
-#if defined(HAS_ABGRTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
- ABGRToYRow = ABGRToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
- ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ABGRToYRow = ABGRToYRow_SSSE3;
- }
- }
- }
- }
-#elif defined(HAS_ABGRTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ABGRToYRow = ABGRToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ABGRToUVRow = ABGRToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
- ABGRToYRow(src_abgr, dst_y, width);
- ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
- src_abgr += src_stride_abgr * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
- ABGRToYRow(src_abgr, dst_y, width);
- }
- return 0;
-}
-
-// Convert RGBA to I420.
-LIBYUV_API
-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
- void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
- RGBAToYRow_C;
- if (!src_rgba ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgba = src_rgba + (height - 1) * src_stride_rgba;
- src_stride_rgba = -src_stride_rgba;
- }
-#if defined(HAS_RGBATOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
- RGBAToYRow = RGBAToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
- RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- RGBAToYRow = RGBAToYRow_SSSE3;
- }
- }
- }
- }
-#elif defined(HAS_RGBATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- RGBAToYRow = RGBAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGBAToYRow = RGBAToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- RGBAToUVRow = RGBAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
- RGBAToYRow(src_rgba, dst_y, width);
- RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
- src_rgba += src_stride_rgba * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
- RGBAToYRow(src_rgba, dst_y, width);
- }
- return 0;
-}
-
-// Convert RGB24 to I420.
-LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
-#if defined(HAS_RGB24TOYROW_NEON)
- void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
- void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
- RGB24ToYRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
- RGB24ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
-#endif
- if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
-
-#if defined(HAS_RGB24TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- RGB24ToYRow = RGB24ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYRow = RGB24ToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGB24TOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#endif // HAS_ARGBTOUVROW_SSSE3
-
- {
-#if !defined(HAS_RGB24TOYROW_NEON)
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
- RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !defined(HAS_RGB24TOYROW_NEON)
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert RAW to I420.
-LIBYUV_API
-int RAWToI420(const uint8* src_raw, int src_stride_raw,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
-#if defined(HAS_RAWTOYROW_NEON)
- void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
- void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
- RAWToYRow_C;
-#else
- void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
- RAWToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
-#endif
- if (!src_raw || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
-
-#if defined(HAS_RAWTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- RAWToYRow = RAWToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RAWToYRow = RAWToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RAWTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- RAWToUVRow = RAWToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#endif // HAS_ARGBTOUVROW_SSSE3
-
- {
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-
- for (y = 0; y < height - 1; y += 2) {
- #if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
- RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
- #else
- RAWToARGBRow(src_raw, row, width);
- RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
- #endif
- src_raw += src_stride_raw * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- #if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
- #else
- RAWToARGBRow(src_raw, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- #endif
- }
- #if !defined(HAS_RAWTOYROW_NEON)
- free_aligned_buffer_64(row);
- #endif
- }
- return 0;
-}
-
-// Convert RGB565 to I420.
-LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
-#if defined(HAS_RGB565TOYROW_NEON)
- void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
- void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
- RGB565ToYRow_C;
-#else
- void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
- RGB565ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
-#endif
- if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
- src_stride_rgb565 = -src_stride_rgb565;
- }
-
-#if defined(HAS_RGB565TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- RGB565ToYRow = RGB565ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToYRow = RGB565ToYRow_NEON;
- }
- if (width >= 16) {
- RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_NEON;
- }
- }
- }
-#else // HAS_RGB565TOYROW_NEON
-
-#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#endif // HAS_ARGBTOUVROW_SSSE3
-#endif // HAS_RGB565TOYROW_NEON
-
- {
-#if !defined(HAS_RGB565TOYROW_NEON)
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
- RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb565 += src_stride_rgb565 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !defined(HAS_RGB565TOYROW_NEON)
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert ARGB1555 to I420.
-LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
-#if defined(HAS_ARGB1555TOYROW_NEON)
- void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
- void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
- ARGB1555ToYRow_C;
-#else
- void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
- ARGB1555ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
-#endif
- if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
- src_stride_argb1555 = -src_stride_argb1555;
- }
-
-#if defined(HAS_ARGB1555TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToYRow = ARGB1555ToYRow_NEON;
- }
- if (width >= 16) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
- }
- }
- }
-#else // HAS_ARGB1555TOYROW_NEON
-
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#endif // HAS_ARGBTOUVROW_SSSE3
-#endif // HAS_ARGB1555TOYROW_NEON
-
- {
-#if !defined(HAS_ARGB1555TOYROW_NEON)
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-#endif
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
- ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
- width);
-#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb1555 += src_stride_argb1555 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
-#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !defined(HAS_ARGB1555TOYROW_NEON)
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert ARGB4444 to I420.
-LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
-#if defined(HAS_ARGB4444TOYROW_NEON)
- void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
- void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
- ARGB4444ToYRow_C;
-#else
- void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
- ARGB4444ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
-#endif
- if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
- src_stride_argb4444 = -src_stride_argb4444;
- }
-
-#if defined(HAS_ARGB4444TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToYRow = ARGB4444ToYRow_NEON;
- }
- if (width >= 16) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
- }
- }
- }
-#else // HAS_ARGB4444TOYROW_NEON
-
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#endif // HAS_ARGBTOUVROW_SSSE3
-#endif // HAS_ARGB4444TOYROW_NEON
-
- {
-#if !defined(HAS_ARGB4444TOYROW_NEON)
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
- ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
- width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb4444 += src_stride_argb4444 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !defined(HAS_ARGB4444TOYROW_NEON)
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/convert_argb.cc b/src/main/jni/libyuv/source/convert_argb.cc
deleted file mode 100644
index ac0bc3d15..000000000
--- a/src/main/jni/libyuv/source/convert_argb.cc
+++ /dev/null
@@ -1,938 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_argb.h"
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/rotate_argb.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB with optional flipping
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- if (!src_argb || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-
- CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
- width * 4, height);
- return 0;
-}
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*I444ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I444ToARGBRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u == width &&
- src_stride_v == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I444TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I444ToARGBRow = I444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*I411ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I411ToARGBRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 4 == width &&
- src_stride_v * 4 == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I411TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I411ToARGBRow = I411ToARGBRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_I411TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I411ToARGBRow = I411ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I400 to ARGB.
-LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*YToARGBRow)(const uint8* y_buf,
- uint8* rgb_buf,
- int width) = YToARGBRow_C;
- if (!src_y || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_argb = 0;
- }
-#if defined(HAS_YTOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- YToARGBRow = YToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- YToARGBRow = YToARGBRow_SSE2;
- }
- }
-#elif defined(HAS_YTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- YToARGBRow = YToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- YToARGBRow = YToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- YToARGBRow(src_y, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- }
- return 0;
-}
-
-// Convert I400 to ARGB.
-LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
- I400ToARGBRow_C;
- if (!src_y || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_argb = 0;
- }
-#if defined(HAS_I400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
- I400ToARGBRow = I400ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- }
- }
- }
-#elif defined(HAS_I400TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I400ToARGBRow = I400ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_NEON;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
- src_y += src_stride_y;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-
-// Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-
-// Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {
- 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
-
-// Convert BGRA to ARGB.
-LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskBGRAToARGB),
- width, height);
-}
-
-// Convert ARGB to BGRA (same as BGRAToARGB).
-LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskBGRAToARGB),
- width, height);
-}
-
-// Convert ABGR to ARGB.
-LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskABGRToARGB),
- width, height);
-}
-
-// Convert ARGB to ABGR to (same as ABGRToARGB).
-LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskABGRToARGB),
- width, height);
-}
-
-// Convert RGBA to ARGB.
-LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_rgba, src_stride_rgba,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskRGBAToARGB),
- width, height);
-}
-
-// Convert RGB24 to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
- RGB24ToARGBRow_C;
- if (!src_rgb24 || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
- // Coalesce rows.
- if (src_stride_rgb24 == width * 3 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_rgb24 = dst_stride_argb = 0;
- }
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#elif defined(HAS_RGB24TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToARGBRow = RGB24ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RGB24ToARGBRow(src_rgb24, dst_argb, width);
- src_rgb24 += src_stride_rgb24;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert RAW to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
- RAWToARGBRow_C;
- if (!src_raw || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
- // Coalesce rows.
- if (src_stride_raw == width * 3 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_raw = dst_stride_argb = 0;
- }
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
- }
-#elif defined(HAS_RAWTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- RAWToARGBRow = RAWToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RAWToARGBRow = RAWToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RAWToARGBRow(src_raw, dst_argb, width);
- src_raw += src_stride_raw;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert RGB565 to ARGB.
-LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
- RGB565ToARGBRow_C;
- if (!src_rgb565 || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
- src_stride_rgb565 = -src_stride_rgb565;
- }
- // Coalesce rows.
- if (src_stride_rgb565 == width * 2 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_rgb565 = dst_stride_argb = 0;
- }
-#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
- }
-#elif defined(HAS_RGB565TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RGB565ToARGBRow(src_rgb565, dst_argb, width);
- src_rgb565 += src_stride_rgb565;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert ARGB1555 to ARGB.
-LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
- int pix) = ARGB1555ToARGBRow_C;
- if (!src_argb1555 || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
- src_stride_argb1555 = -src_stride_argb1555;
- }
- // Coalesce rows.
- if (src_stride_argb1555 == width * 2 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb1555 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
- }
-#elif defined(HAS_ARGB1555TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
- src_argb1555 += src_stride_argb1555;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert ARGB4444 to ARGB.
-LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
- int pix) = ARGB4444ToARGBRow_C;
- if (!src_argb4444 || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
- src_stride_argb4444 = -src_stride_argb4444;
- }
- // Coalesce rows.
- if (src_stride_argb4444 == width * 2 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb4444 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
- }
- }
-#elif defined(HAS_ARGB4444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
- src_argb4444 += src_stride_argb4444;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert NV12 to ARGB.
-LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*NV12ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV12ToARGBRow_C;
- if (!src_y || !src_uv || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToARGBRow(src_y, src_uv, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
-
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*NV21ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV21ToARGBRow_C;
- if (!src_y || !src_uv || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV21TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_NV21TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV21ToARGBRow(src_y, src_uv, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
-
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*NV12ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
- NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, width);
- dst_argb += dst_stride_argb * 2;
- src_m420 += src_stride_m420 * 3;
- }
- if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
- }
- return 0;
-}
-
-// Convert YUY2 to ARGB.
-LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
- YUY2ToARGBRow_C;
- if (!src_yuy2 || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_yuy2 == width * 2 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_yuy2 = dst_stride_argb = 0;
- }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
- // Posix is 16, Windows is 8.
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_YUY2TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToARGBRow = YUY2ToARGBRow_NEON;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- YUY2ToARGBRow(src_yuy2, dst_argb, width);
- src_yuy2 += src_stride_yuy2;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert UYVY to ARGB.
-LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
- UYVYToARGBRow_C;
- if (!src_uyvy || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_uyvy == width * 2 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_uyvy = dst_stride_argb = 0;
- }
-#if defined(HAS_UYVYTOARGBROW_SSSE3)
- // Posix is 16, Windows is 8.
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_UYVYTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- UYVYToARGBRow = UYVYToARGBRow_NEON;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- UYVYToARGBRow(src_uyvy, dst_argb, width);
- src_uyvy += src_stride_uyvy;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/convert_from.cc b/src/main/jni/libyuv/source/convert_from.cc
deleted file mode 100644
index c1a2f62f0..000000000
--- a/src/main/jni/libyuv/source/convert_from.cc
+++ /dev/null
@@ -1,1210 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_from.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/convert.h" // For I420Copy
-#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/scale.h" // For ScalePlane()
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
-// I420 To any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int src_y_width, int src_y_height,
- int dst_uv_width, int dst_uv_height) {
- const int dst_y_width = Abs(src_y_width);
- const int dst_y_height = Abs(src_y_height);
- const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
- const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
- if (src_y_width == 0 || src_y_height == 0 ||
- dst_uv_width <= 0 || dst_uv_height <= 0) {
- return -1;
- }
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
- dst_y, dst_stride_y, dst_y_width, dst_y_height,
- kFilterBilinear);
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
- dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
- kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
- dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
- kFilterBilinear);
- return 0;
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 422 chroma is 1/2 width, 1x height
-LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- const int dst_uv_width = (Abs(width) + 1) >> 1;
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 444 chroma is 1x width, 1x height
-LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- const int dst_uv_width = Abs(width);
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 411 chroma is 1/4 width, 1x height
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- const int dst_uv_width = (Abs(width) + 3) >> 2;
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- dst_uv_width, dst_uv_height);
-}
-
-// Copy to I400. Source can be I420,422,444,400,NV12,NV21
-LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- if (!src_y || !dst_y ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- return 0;
-}
-
-LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height) {
- int y;
- void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_yuy2, int width) =
- I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
- dst_stride_yuy2 = -dst_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_yuy2 == width * 2) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
- }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_SSE2;
- }
- }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_yuy2 += dst_stride_yuy2;
- }
- return 0;
-}
-
-LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height) {
- int y;
- void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_yuy2, int width) =
- I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
- dst_stride_yuy2 = -dst_stride_yuy2;
- }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_SSE2;
- }
- }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
- dst_yuy2 + dst_stride_yuy2, width);
- src_y += src_stride_y * 2;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_yuy2 += dst_stride_yuy2 * 2;
- }
- if (height & 1) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- }
- return 0;
-}
-
-LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height) {
- int y;
- void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_uyvy, int width) =
- I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
- dst_stride_uyvy = -dst_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_uyvy == width * 2) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
- }
-#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_SSE2;
- }
- }
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uyvy += dst_stride_uyvy;
- }
- return 0;
-}
-
-LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height) {
- int y;
- void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_uyvy, int width) =
- I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
- dst_stride_uyvy = -dst_stride_uyvy;
- }
-#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_SSE2;
- }
- }
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
- dst_uyvy + dst_stride_uyvy, width);
- src_y += src_stride_y * 2;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uyvy += dst_stride_uyvy * 2;
- }
- if (height & 1) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- }
- return 0;
-}
-
-LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
- int y;
- void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) = MergeUVRow_C;
- // Coalesce rows.
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
- dst_stride_y = -dst_stride_y;
- dst_stride_uv = -dst_stride_uv;
- }
- if (src_stride_y == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_y = 0;
- }
- // Coalesce rows.
- if (src_stride_u == halfwidth &&
- src_stride_v == halfwidth &&
- dst_stride_uv == halfwidth * 2) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_u = src_stride_v = dst_stride_uv = 0;
- }
-#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
- MergeUVRow_ = MergeUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
- MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- MergeUVRow_ = MergeUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
- MergeUVRow_ = MergeUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_NEON;
- }
- }
-#endif
-
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- for (y = 0; y < halfheight; ++y) {
- // Merge a row of U and V into a row of UV.
- MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uv += dst_stride_uv;
- }
- return 0;
-}
-
-LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height) {
- return I420ToNV12(src_y, src_stride_y,
- src_v, src_stride_v,
- src_u, src_stride_u,
- dst_y, src_stride_y,
- dst_vu, dst_stride_vu,
- width, height);
-}
-
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height) {
- int y;
- void (*I422ToBGRARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToBGRARow_C;
- if (!src_y || !src_u || !src_v || !dst_bgra ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
- dst_stride_bgra = -dst_stride_bgra;
- }
-#if defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
- }
- }
-#elif defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToBGRARow = I422ToBGRARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_NEON;
- }
- }
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
- I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
- dst_bgra += dst_stride_bgra;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- int y;
- void (*I422ToABGRRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToABGRRow_C;
- if (!src_y || !src_u || !src_v || !dst_abgr ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
- dst_stride_abgr = -dst_stride_abgr;
- }
-#if defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_I422TOABGRROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToABGRRow = I422ToABGRRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
- dst_abgr += dst_stride_abgr;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height) {
- int y;
- void (*I422ToRGBARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
- }
-#elif defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height) {
- int y;
- void (*I422ToRGB24Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
-#elif defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_raw, int dst_stride_raw,
- int width, int height) {
- int y;
- void (*I422ToRAWRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRAWRow_C;
- if (!src_y || !src_u || !src_v || !dst_raw ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_raw = dst_raw + (height - 1) * dst_stride_raw;
- dst_stride_raw = -dst_stride_raw;
- }
-#if defined(HAS_I422TORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRAWRow = I422ToRAWRow_SSSE3;
- }
- }
-#elif defined(HAS_I422TORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToRAWRow = I422ToRAWRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRAWRow = I422ToRAWRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
- dst_raw += dst_stride_raw;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height) {
- int y;
- void (*I422ToARGB1555Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
- dst_stride_argb1555 = -dst_stride_argb1555;
- }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
- }
- }
-#elif defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
- dst_argb1555 += dst_stride_argb1555;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height) {
- int y;
- void (*I422ToARGB4444Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
- dst_stride_argb4444 = -dst_stride_argb4444;
- }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
- }
- }
-#elif defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
- dst_argb4444 += dst_stride_argb4444;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#elif defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to specified format
-LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
- const uint8* u, int u_stride,
- const uint8* v, int v_stride,
- uint8* dst_sample, int dst_sample_stride,
- int width, int height,
- uint32 fourcc) {
- uint32 format = CanonicalFourCC(fourcc);
- int r = 0;
- if (!y || !u|| !v || !dst_sample ||
- width <= 0 || height == 0) {
- return -1;
- }
- switch (format) {
- // Single plane formats
- case FOURCC_YUY2:
- r = I420ToYUY2(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_UYVY:
- r = I420ToUYVY(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_RGBP:
- r = I420ToRGB565(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_RGBO:
- r = I420ToARGB1555(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_R444:
- r = I420ToARGB4444(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_24BG:
- r = I420ToRGB24(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3,
- width, height);
- break;
- case FOURCC_RAW:
- r = I420ToRAW(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3,
- width, height);
- break;
- case FOURCC_ARGB:
- r = I420ToARGB(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
- break;
- case FOURCC_BGRA:
- r = I420ToBGRA(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
- break;
- case FOURCC_ABGR:
- r = I420ToABGR(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
- break;
- case FOURCC_RGBA:
- r = I420ToRGBA(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
- break;
- case FOURCC_BGGR:
- r = I420ToBayerBGGR(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GBRG:
- r = I420ToBayerGBRG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GRBG:
- r = I420ToBayerGRBG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_RGGB:
- r = I420ToBayerRGGB(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_I400:
- r = I400Copy(y, y_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_NV12: {
- uint8* dst_uv = dst_sample + width * height;
- r = I420ToNV12(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- dst_uv,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- }
- case FOURCC_NV21: {
- uint8* dst_vu = dst_sample + width * height;
- r = I420ToNV21(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- dst_vu,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- }
- // TODO(fbarchard): Add M420 and Q420.
- // Triplanar formats
- // TODO(fbarchard): halfstride instead of halfwidth
- case FOURCC_I420:
- case FOURCC_YU12:
- case FOURCC_YV12: {
- int halfwidth = (width + 1) / 2;
- int halfheight = (height + 1) / 2;
- uint8* dst_u;
- uint8* dst_v;
- if (format == FOURCC_YV12) {
- dst_v = dst_sample + width * height;
- dst_u = dst_v + halfwidth * halfheight;
- } else {
- dst_u = dst_sample + width * height;
- dst_v = dst_u + halfwidth * halfheight;
- }
- r = I420Copy(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, halfwidth,
- dst_v, halfwidth,
- width, height);
- break;
- }
- case FOURCC_I422:
- case FOURCC_YV16: {
- int halfwidth = (width + 1) / 2;
- uint8* dst_u;
- uint8* dst_v;
- if (format == FOURCC_YV16) {
- dst_v = dst_sample + width * height;
- dst_u = dst_v + halfwidth * height;
- } else {
- dst_u = dst_sample + width * height;
- dst_v = dst_u + halfwidth * height;
- }
- r = I420ToI422(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, halfwidth,
- dst_v, halfwidth,
- width, height);
- break;
- }
- case FOURCC_I444:
- case FOURCC_YV24: {
- uint8* dst_u;
- uint8* dst_v;
- if (format == FOURCC_YV24) {
- dst_v = dst_sample + width * height;
- dst_u = dst_v + width * height;
- } else {
- dst_u = dst_sample + width * height;
- dst_v = dst_u + width * height;
- }
- r = I420ToI444(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, width,
- dst_v, width,
- width, height);
- break;
- }
- case FOURCC_I411: {
- int quarterwidth = (width + 3) / 4;
- uint8* dst_u = dst_sample + width * height;
- uint8* dst_v = dst_u + quarterwidth * height;
- r = I420ToI411(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, quarterwidth,
- dst_v, quarterwidth,
- width, height);
- break;
- }
-
- // Formats not supported - MJPG, biplanar, some rgb formats.
- default:
- return -1; // unknown fourcc - return failure code.
- }
- return r;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/convert_from_argb.cc b/src/main/jni/libyuv/source/convert_from_argb.cc
deleted file mode 100644
index de461ddb0..000000000
--- a/src/main/jni/libyuv/source/convert_from_argb.cc
+++ /dev/null
@@ -1,1133 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_from_argb.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// ARGB little endian (bgra in memory) to I444
-LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) = ARGBToUV444Row_C;
- if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width &&
- dst_stride_u == width &&
- dst_stride_v == width) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
- }
-#if defined(HAS_ARGBTOUV444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOUV444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToUV444Row = ARGBToUV444Row_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToUV444Row(src_argb, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- src_argb += src_stride_argb;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-// ARGB little endian (bgra in memory) to I422
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) = ARGBToUV422Row_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
- }
-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
-#endif
-
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToUV422Row(src_argb, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- src_argb += src_stride_argb;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-// ARGB little endian (bgra in memory) to I411
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) = ARGBToUV411Row_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width &&
- dst_stride_u * 4 == width &&
- dst_stride_v * 4 == width) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUV411ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
- ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUV411Row = ARGBToUV411Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToUV411Row(src_argb, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- src_argb += src_stride_argb;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
- int y;
- int halfwidth = (width + 1) >> 1;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) = MergeUVRow_C;
- if (!src_argb ||
- !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
- MergeUVRow_ = MergeUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
- MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- MergeUVRow_ = MergeUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
- MergeUVRow_ = MergeUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_NEON;
- }
- }
-#endif
- {
- // Allocate a rows of uv.
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
- uint8* row_v = row_u + ((halfwidth + 15) & ~15);
-
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- }
- free_aligned_buffer_64(row_u);
- }
- return 0;
-}
-
-// Same as NV12 but U and V swapped.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
- int y;
- int halfwidth = (width + 1) >> 1;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) = MergeUVRow_C;
- if (!src_argb ||
- !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
- MergeUVRow_ = MergeUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
- MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- MergeUVRow_ = MergeUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
- MergeUVRow_ = MergeUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_NEON;
- }
- }
-#endif
- {
- // Allocate a rows of uv.
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
- uint8* row_v = row_u + ((halfwidth + 15) & ~15);
-
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- }
- free_aligned_buffer_64(row_u);
- }
- return 0;
-}
-
-// Convert ARGB to YUY2.
-LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height) {
- int y;
- void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) = ARGBToUV422Row_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
-
- if (!src_argb || !dst_yuy2 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
- dst_stride_yuy2 = -dst_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_yuy2 == width * 2) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_yuy2 = 0;
- }
-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-
-#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_SSE2;
- }
- }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_NEON;
- }
- }
-#endif
-
- {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-
- for (y = 0; y < height; ++y) {
- ARGBToUV422Row(src_argb, row_u, row_v, width);
- ARGBToYRow(src_argb, row_y, width);
- I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
- src_argb += src_stride_argb;
- dst_yuy2 += dst_stride_yuy2;
- }
-
- free_aligned_buffer_64(row_y);
- }
- return 0;
-}
-
-// Convert ARGB to UYVY.
-LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height) {
- int y;
- void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) = ARGBToUV422Row_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
-
- if (!src_argb || !dst_uyvy ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
- dst_stride_uyvy = -dst_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_uyvy == width * 2) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_uyvy = 0;
- }
-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-
-#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_SSE2;
- }
- }
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_NEON;
- }
- }
-#endif
-
- {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-
- for (y = 0; y < height; ++y) {
- ARGBToUV422Row(src_argb, row_u, row_v, width);
- ARGBToYRow(src_argb, row_y, width);
- I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
- src_argb += src_stride_argb;
- dst_uyvy += dst_stride_uyvy;
- }
-
- free_aligned_buffer_64(row_y);
- }
- return 0;
-}
-
-// Convert ARGB to I400.
-LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- int y;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- if (!src_argb || !dst_y || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_y = 0;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToYRow(src_argb, dst_y, width);
- src_argb += src_stride_argb;
- dst_y += dst_stride_y;
- }
- return 0;
-}
-
-// Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {
- 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
-
-// Convert ARGB to RGBA.
-LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height) {
- return ARGBShuffle(src_argb, src_stride_argb,
- dst_rgba, dst_stride_rgba,
- (const uint8*)(&kShuffleMaskARGBToRGBA),
- width, height);
-}
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height) {
- int y;
- void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRGB24Row_C;
- if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_rgb24 == width * 3) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_rgb24 = 0;
- }
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB24Row = ARGBToRGB24Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToRGB24Row(src_argb, dst_rgb24, width);
- src_argb += src_stride_argb;
- dst_rgb24 += dst_stride_rgb24;
- }
- return 0;
-}
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
- uint8* dst_raw, int dst_stride_raw,
- int width, int height) {
- int y;
- void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRAWRow_C;
- if (!src_argb || !dst_raw || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_raw == width * 3) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_raw = 0;
- }
-#if defined(HAS_ARGBTORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToRAWRow = ARGBToRAWRow_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRAWRow = ARGBToRAWRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToRAWRow(src_argb, dst_raw, width);
- src_argb += src_stride_argb;
- dst_raw += dst_stride_raw;
- }
- return 0;
-}
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
- int y;
- void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRGB565Row_C;
- if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_rgb565 == width * 2) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_rgb565 = 0;
- }
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
- }
- }
-#elif defined(HAS_ARGBTORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565Row = ARGBToRGB565Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToRGB565Row(src_argb, dst_rgb565, width);
- src_argb += src_stride_argb;
- dst_rgb565 += dst_stride_rgb565;
- }
- return 0;
-}
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height) {
- int y;
- void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToARGB1555Row_C;
- if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb1555 == width * 2) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb1555 = 0;
- }
-#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
- }
- }
-#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToARGB1555Row(src_argb, dst_argb1555, width);
- src_argb += src_stride_argb;
- dst_argb1555 += dst_stride_argb1555;
- }
- return 0;
-}
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height) {
- int y;
- void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToARGB4444Row_C;
- if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb4444 == width * 2) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb4444 = 0;
- }
-#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
- }
- }
-#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToARGB4444Row(src_argb, dst_argb4444, width);
- src_argb += src_stride_argb;
- dst_argb4444 += dst_stride_argb4444;
- }
- return 0;
-}
-
-// Convert ARGB to J420. (JPeg full range I420).
-LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
- ARGBToYJRow_C;
- if (!src_argb ||
- !dst_yj || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
- if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToYJRow = ARGBToYJRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
- ARGBToYJRow(src_argb, dst_yj, width);
- ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
- src_argb += src_stride_argb * 2;
- dst_yj += dst_stride_yj * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
- ARGBToYJRow(src_argb, dst_yj, width);
- }
- return 0;
-}
-
-// Convert ARGB to J400.
-LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- int width, int height) {
- int y;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
- ARGBToYJRow_C;
- if (!src_argb || !dst_yj || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_yj == width) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_yj = 0;
- }
-#if defined(HAS_ARGBTOYJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToYJRow = ARGBToYJRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToYJRow(src_argb, dst_yj, width);
- src_argb += src_stride_argb;
- dst_yj += dst_stride_yj;
- }
- return 0;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/convert_jpeg.cc b/src/main/jni/libyuv/source/convert_jpeg.cc
deleted file mode 100644
index bcb980f7f..000000000
--- a/src/main/jni/libyuv/source/convert_jpeg.cc
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert.h"
-
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#ifdef HAVE_JPEG
-struct I420Buffers {
- uint8* y;
- int y_stride;
- uint8* u;
- int u_stride;
- uint8* v;
- int v_stride;
- int w;
- int h;
-};
-
-static void JpegCopyI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = (I420Buffers*)(opaque);
- I420Copy(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI422ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = (I420Buffers*)(opaque);
- I422ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI444ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = (I420Buffers*)(opaque);
- I444ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = (I420Buffers*)(opaque);
- I411ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI400ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = (I420Buffers*)(opaque);
- I400ToI420(data[0], strides[0],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-// Query size of MJPG in pixels.
-LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
- int* width, int* height) {
- MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret) {
- *width = mjpeg_decoder.GetWidth();
- *height = mjpeg_decoder.GetHeight();
- }
- mjpeg_decoder.UnloadFrame();
- return ret ? 0 : -1; // -1 for runtime failure.
-}
-
-// MJPG (Motion JPeg) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToI420(const uint8* sample,
- size_t sample_size,
- uint8* y, int y_stride,
- uint8* u, int u_stride,
- uint8* v, int v_stride,
- int w, int h,
- int dw, int dh) {
- if (sample_size == kUnknownDataSize) {
- // ERROR: MJPEG frame size unknown
- return -1;
- }
-
- // TODO(fbarchard): Port MJpeg to C.
- MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret && (mjpeg_decoder.GetWidth() != w ||
- mjpeg_decoder.GetHeight() != h)) {
- // ERROR: MJPEG frame has unexpected dimensions
- mjpeg_decoder.UnloadFrame();
- return 1; // runtime failure
- }
- if (ret) {
- I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
- // YUV420
- if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 2 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
- // YUV422
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
- // YUV444
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
- // YUV411
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
- // YUV400
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceGrayscale &&
- mjpeg_decoder.GetNumComponents() == 1 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
- } else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice. 411 is supported by libjpeg
- // ERROR: Unable to convert MJPEG frame because format is not supported
- mjpeg_decoder.UnloadFrame();
- return 1;
- }
- }
- return ret ? 0 : 1;
-}
-
-#ifdef HAVE_JPEG
-struct ARGBBuffers {
- uint8* argb;
- int argb_stride;
- int w;
- int h;
-};
-
-static void JpegI420ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I420ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI422ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I422ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI444ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I444ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI411ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I411ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI400ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I400ToARGB(data[0], strides[0],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-// MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample,
- size_t sample_size,
- uint8* argb, int argb_stride,
- int w, int h,
- int dw, int dh) {
- if (sample_size == kUnknownDataSize) {
- // ERROR: MJPEG frame size unknown
- return -1;
- }
-
- // TODO(fbarchard): Port MJpeg to C.
- MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret && (mjpeg_decoder.GetWidth() != w ||
- mjpeg_decoder.GetHeight() != h)) {
- // ERROR: MJPEG frame has unexpected dimensions
- mjpeg_decoder.UnloadFrame();
- return 1; // runtime failure
- }
- if (ret) {
- ARGBBuffers bufs = { argb, argb_stride, dw, dh };
- // YUV420
- if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 2 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
- // YUV422
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
- // YUV444
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
- // YUV411
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
- // YUV400
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceGrayscale &&
- mjpeg_decoder.GetNumComponents() == 1 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
- } else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice. 411 is supported by libjpeg
- // ERROR: Unable to convert MJPEG frame because format is not supported
- mjpeg_decoder.UnloadFrame();
- return 1;
- }
- }
- return ret ? 0 : 1;
-}
-#endif
-
-#endif
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/convert_to_argb.cc b/src/main/jni/libyuv/source/convert_to_argb.cc
deleted file mode 100644
index 1b228a7b4..000000000
--- a/src/main/jni/libyuv/source/convert_to_argb.cc
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_argb.h"
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/rotate_argb.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-// With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
- uint8* crop_argb, int argb_stride,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int crop_width, int crop_height,
- enum RotationMode rotation,
- uint32 fourcc) {
- uint32 format = CanonicalFourCC(fourcc);
- int aligned_src_width = (src_width + 1) & ~1;
- const uint8* src;
- const uint8* src_uv;
- int abs_src_height = (src_height < 0) ? -src_height : src_height;
- int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
- int r = 0;
-
- // One pass rotation is available for some formats. For the rest, convert
- // to I420 (with optional vertical flipping) into a temporary I420 buffer,
- // and then rotate the I420 to the final destination buffer.
- // For in-place conversion, if destination crop_argb is same as source sample,
- // also enable temporary buffer.
- LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
- crop_argb == sample;
- uint8* tmp_argb = crop_argb;
- int tmp_argb_stride = argb_stride;
- uint8* rotate_buffer = NULL;
- int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
- if (crop_argb == NULL || sample == NULL ||
- src_width <= 0 || crop_width <= 0 ||
- src_height == 0 || crop_height == 0) {
- return -1;
- }
- if (src_height < 0) {
- inv_crop_height = -inv_crop_height;
- }
-
- if (need_buf) {
- int argb_size = crop_width * abs_crop_height * 4;
- rotate_buffer = (uint8*)malloc(argb_size);
- if (!rotate_buffer) {
- return 1; // Out of memory runtime error.
- }
- crop_argb = rotate_buffer;
- argb_stride = crop_width;
- }
-
- switch (format) {
- // Single plane formats
- case FOURCC_YUY2:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToARGB(src, aligned_src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_UYVY:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToARGB(src, aligned_src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_24BG:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToARGB(src, src_width * 3,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RAW:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToARGB(src, src_width * 3,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_ARGB:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_BGRA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_ABGR:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGBA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGBP:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToARGB(src, src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGBO:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToARGB(src, src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_R444:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToARGB(src, src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_I400:
- src = sample + src_width * crop_y + crop_x;
- r = I400ToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- // Biplanar formats
- case FOURCC_NV12:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- r = NV12ToARGB(src, src_width,
- src_uv, aligned_src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_NV21:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- // Call NV12 but with u and v parameters swapped.
- r = NV21ToARGB(src, src_width,
- src_uv, aligned_src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-// case FOURCC_Q420:
-// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-// src_width + crop_x * 2;
-// r = Q420ToARGB(src, src_width * 3,
-// src_uv, src_width * 3,
-// crop_argb, argb_stride,
-// crop_width, inv_crop_height);
-// break;
- // Triplanar formats
- case FOURCC_I420:
- case FOURCC_YU12:
- case FOURCC_YV12: {
- const uint8* src_y = sample + (src_width * crop_y + crop_x);
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- int halfheight = (abs_src_height + 1) / 2;
- if (format == FOURCC_YV12) {
- src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- }
- r = I420ToARGB(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- }
- case FOURCC_I422:
- case FOURCC_YV16: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- if (format == FOURCC_YV16) {
- src_v = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- }
- r = I422ToARGB(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- }
- case FOURCC_I444:
- case FOURCC_YV24: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- if (format == FOURCC_YV24) {
- src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- } else {
- src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- }
- r = I444ToARGB(src_y, src_width,
- src_u, src_width,
- src_v, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- }
- case FOURCC_I411: {
- int quarterwidth = (src_width + 3) / 4;
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u = sample + src_width * abs_src_height +
- quarterwidth * crop_y + crop_x / 4;
- const uint8* src_v = sample + src_width * abs_src_height +
- quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
- r = I411ToARGB(src_y, src_width,
- src_u, quarterwidth,
- src_v, quarterwidth,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- }
-#ifdef HAVE_JPEG
- case FOURCC_MJPG:
- r = MJPGToARGB(sample, sample_size,
- crop_argb, argb_stride,
- src_width, abs_src_height, crop_width, inv_crop_height);
- break;
-#endif
- default:
- r = -1; // unknown fourcc - return failure code.
- }
-
- if (need_buf) {
- if (!r) {
- r = ARGBRotate(crop_argb, argb_stride,
- tmp_argb, tmp_argb_stride,
- crop_width, abs_crop_height, rotation);
- }
- free(rotate_buffer);
- }
-
- return r;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/convert_to_i420.cc b/src/main/jni/libyuv/source/convert_to_i420.cc
deleted file mode 100644
index 7b194fff7..000000000
--- a/src/main/jni/libyuv/source/convert_to_i420.cc
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "libyuv/convert.h"
-
-#include "libyuv/format_conversion.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-// With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToI420(const uint8* sample,
- size_t sample_size,
- uint8* y, int y_stride,
- uint8* u, int u_stride,
- uint8* v, int v_stride,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int crop_width, int crop_height,
- enum RotationMode rotation,
- uint32 fourcc) {
- uint32 format = CanonicalFourCC(fourcc);
- int aligned_src_width = (src_width + 1) & ~1;
- const uint8* src;
- const uint8* src_uv;
- int abs_src_height = (src_height < 0) ? -src_height : src_height;
- int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
- int r = 0;
- LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
- format != FOURCC_NV12 && format != FOURCC_NV21 &&
- format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
- uint8* tmp_y = y;
- uint8* tmp_u = u;
- uint8* tmp_v = v;
- int tmp_y_stride = y_stride;
- int tmp_u_stride = u_stride;
- int tmp_v_stride = v_stride;
- uint8* rotate_buffer = NULL;
- int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
- if (!y || !u || !v || !sample ||
- src_width <= 0 || crop_width <= 0 ||
- src_height == 0 || crop_height == 0) {
- return -1;
- }
- if (src_height < 0) {
- inv_crop_height = -inv_crop_height;
- }
-
- // One pass rotation is available for some formats. For the rest, convert
- // to I420 (with optional vertical flipping) into a temporary I420 buffer,
- // and then rotate the I420 to the final destination buffer.
- // For in-place conversion, if destination y is same as source sample,
- // also enable temporary buffer.
- if (need_buf) {
- int y_size = crop_width * abs_crop_height;
- int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
- rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
- if (!rotate_buffer) {
- return 1; // Out of memory runtime error.
- }
- y = rotate_buffer;
- u = y + y_size;
- v = u + uv_size;
- y_stride = crop_width;
- u_stride = v_stride = ((crop_width + 1) / 2);
- }
-
- switch (format) {
- // Single plane formats
- case FOURCC_YUY2:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToI420(src, aligned_src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_UYVY:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToI420(src, aligned_src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGBP:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGBO:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_R444:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_24BG:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToI420(src, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RAW:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToI420(src, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_ARGB:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_BGRA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_ABGR:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGBA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_I400:
- src = sample + src_width * crop_y + crop_x;
- r = I400ToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- // Biplanar formats
- case FOURCC_NV12:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- r = NV12ToI420Rotate(src, src_width,
- src_uv, aligned_src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height, rotation);
- break;
- case FOURCC_NV21:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- // Call NV12 but with u and v parameters swapped.
- r = NV12ToI420Rotate(src, src_width,
- src_uv, aligned_src_width,
- y, y_stride,
- v, v_stride,
- u, u_stride,
- crop_width, inv_crop_height, rotation);
- break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_Q420:
- src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
- src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
- src_width + crop_x * 2;
- r = Q420ToI420(src, src_width * 3,
- src_uv, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- // Triplanar formats
- case FOURCC_I420:
- case FOURCC_YU12:
- case FOURCC_YV12: {
- const uint8* src_y = sample + (src_width * crop_y + crop_x);
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- int halfheight = (abs_src_height + 1) / 2;
- if (format == FOURCC_YV12) {
- src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- }
- r = I420Rotate(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height, rotation);
- break;
- }
- case FOURCC_I422:
- case FOURCC_YV16: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- if (format == FOURCC_YV16) {
- src_v = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- }
- r = I422ToI420(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- }
- case FOURCC_I444:
- case FOURCC_YV24: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- if (format == FOURCC_YV24) {
- src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- } else {
- src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- }
- r = I444ToI420(src_y, src_width,
- src_u, src_width,
- src_v, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- }
- case FOURCC_I411: {
- int quarterwidth = (src_width + 3) / 4;
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u = sample + src_width * abs_src_height +
- quarterwidth * crop_y + crop_x / 4;
- const uint8* src_v = sample + src_width * abs_src_height +
- quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
- r = I411ToI420(src_y, src_width,
- src_u, quarterwidth,
- src_v, quarterwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- }
-#ifdef HAVE_JPEG
- case FOURCC_MJPG:
- r = MJPGToI420(sample, sample_size,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- src_width, abs_src_height, crop_width, inv_crop_height);
- break;
-#endif
- default:
- r = -1; // unknown fourcc - return failure code.
- }
-
- if (need_buf) {
- if (!r) {
- r = I420Rotate(y, y_stride,
- u, u_stride,
- v, v_stride,
- tmp_y, tmp_y_stride,
- tmp_u, tmp_u_stride,
- tmp_v, tmp_v_stride,
- crop_width, abs_crop_height, rotation);
- }
- free(rotate_buffer);
- }
-
- return r;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/cpu_id.cc b/src/main/jni/libyuv/source/cpu_id.cc
deleted file mode 100644
index deb4c4465..000000000
--- a/src/main/jni/libyuv/source/cpu_id.cc
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/cpu_id.h"
-
-#if defined(_MSC_VER) && !defined(__clang__)
-#include <intrin.h> // For __cpuidex()
-#endif
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
- !defined(__native_client__) && \
- defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
-#include <immintrin.h> // For _xgetbv()
-#endif
-
-#if !defined(__native_client__)
-#include <stdlib.h> // For getenv()
-#endif
-
-// For ArmCpuCaps() but unittested on all platforms
-#include <stdio.h>
-#include <string.h>
-
-#include "libyuv/basic_types.h" // For CPU_X86
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// For functions that use the stack and have runtime checks for overflow,
-// use SAFEBUFFERS to avoid additional check.
-#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
-#define SAFEBUFFERS __declspec(safebuffers)
-#else
-#define SAFEBUFFERS
-#endif
-
-// Low level cpuid for X86. Returns zeros on other CPUs.
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
- (defined(_M_IX86) || defined(_M_X64) || \
- defined(__i386__) || defined(__x86_64__))
-LIBYUV_API
-void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER) && !defined(__clang__)
-#if (_MSC_FULL_VER >= 160040219)
- __cpuidex((int*)(cpu_info), info_eax, info_ecx);
-#elif defined(_M_IX86)
- __asm {
- mov eax, info_eax
- mov ecx, info_ecx
- mov edi, cpu_info
- cpuid
- mov [edi], eax
- mov [edi + 4], ebx
- mov [edi + 8], ecx
- mov [edi + 12], edx
- }
-#else
- if (info_ecx == 0) {
- __cpuid((int*)(cpu_info), info_eax);
- } else {
- cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
- }
-#endif
-#else // defined(_MSC_VER)
- uint32 info_ebx, info_edx;
- asm volatile ( // NOLINT
-#if defined( __i386__) && defined(__PIC__)
- // Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
- "cpuid \n"
- "xchg %%edi, %%ebx \n"
- : "=D" (info_ebx),
-#else
- "cpuid \n"
- : "=b" (info_ebx),
-#endif // defined( __i386__) && defined(__PIC__)
- "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
- cpu_info[0] = info_eax;
- cpu_info[1] = info_ebx;
- cpu_info[2] = info_ecx;
- cpu_info[3] = info_edx;
-#endif // defined(_MSC_VER)
-}
-
-#if !defined(__native_client__)
-#define HAS_XGETBV
-// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-int TestOsSaveYmm() {
- uint32 xcr0 = 0u;
-#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
- xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
-#elif defined(_M_IX86) && defined(_MSC_VER)
- __asm {
- xor ecx, ecx // xcr 0
- _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
- mov xcr0, eax
- }
-#elif defined(__i386__) || defined(__x86_64__)
- asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
-#endif // defined(_MSC_VER)
- return((xcr0 & 6) == 6); // Is ymm saved?
-}
-#endif // !defined(__native_client__)
-#else
-LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
- cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
-}
-#endif
-
-// based on libvpx arm_cpudetect.c
-// For Arm, but public to allow testing on any CPU
-LIBYUV_API SAFEBUFFERS
-int ArmCpuCaps(const char* cpuinfo_name) {
- char cpuinfo_line[512];
- FILE* f = fopen(cpuinfo_name, "r");
- if (!f) {
- // Assume Neon if /proc/cpuinfo is unavailable.
- // This will occur for Chrome sandbox for Pepper or Render process.
- return kCpuHasNEON;
- }
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
- if (memcmp(cpuinfo_line, "Features", 8) == 0) {
- char* p = strstr(cpuinfo_line, " neon");
- if (p && (p[5] == ' ' || p[5] == '\n')) {
- fclose(f);
- return kCpuHasNEON;
- }
- }
- }
- fclose(f);
- return 0;
-}
-
-#if defined(__mips__) && defined(__linux__)
-static int MipsCpuCaps(const char* search_string) {
- char cpuinfo_line[512];
- const char* file_name = "/proc/cpuinfo";
- FILE* f = fopen(file_name, "r");
- if (!f) {
- // Assume DSP if /proc/cpuinfo is unavailable.
- // This will occur for Chrome sandbox for Pepper or Render process.
- return kCpuHasMIPS_DSP;
- }
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
- if (strstr(cpuinfo_line, search_string) != NULL) {
- fclose(f);
- return kCpuHasMIPS_DSP;
- }
- }
- fclose(f);
- return 0;
-}
-#endif
-
-// CPU detect function for SIMD instruction sets.
-LIBYUV_API
-int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
-
-// Test environment variable for disabling CPU features. Any non-zero value
-// to disable. Zero ignored to make it easy to set the variable on/off.
-#if !defined(__native_client__) && !defined(_M_ARM)
-
-static LIBYUV_BOOL TestEnv(const char* name) {
- const char* var = getenv(name);
- if (var) {
- if (var[0] != '0') {
- return LIBYUV_TRUE;
- }
- }
- return LIBYUV_FALSE;
-}
-#else // nacl does not support getenv().
-static LIBYUV_BOOL TestEnv(const char*) {
- return LIBYUV_FALSE;
-}
-#endif
-
-LIBYUV_API SAFEBUFFERS
-int InitCpuFlags(void) {
-#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
-
- uint32 cpu_info0[4] = { 0, 0, 0, 0 };
- uint32 cpu_info1[4] = { 0, 0, 0, 0 };
- uint32 cpu_info7[4] = { 0, 0, 0, 0 };
- CpuId(0, 0, cpu_info0);
- CpuId(1, 0, cpu_info1);
- if (cpu_info0[0] >= 7) {
- CpuId(7, 0, cpu_info7);
- }
- cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
- ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
- ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
- ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
- ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
- ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
- kCpuHasX86;
-
-#ifdef HAS_XGETBV
- if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
- TestOsSaveYmm()) { // Saves YMM.
- cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
- kCpuHasAVX;
- }
-#endif
- // Environment variable overrides for testing.
- if (TestEnv("LIBYUV_DISABLE_X86")) {
- cpu_info_ &= ~kCpuHasX86;
- }
- if (TestEnv("LIBYUV_DISABLE_SSE2")) {
- cpu_info_ &= ~kCpuHasSSE2;
- }
- if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
- cpu_info_ &= ~kCpuHasSSSE3;
- }
- if (TestEnv("LIBYUV_DISABLE_SSE41")) {
- cpu_info_ &= ~kCpuHasSSE41;
- }
- if (TestEnv("LIBYUV_DISABLE_SSE42")) {
- cpu_info_ &= ~kCpuHasSSE42;
- }
- if (TestEnv("LIBYUV_DISABLE_AVX")) {
- cpu_info_ &= ~kCpuHasAVX;
- }
- if (TestEnv("LIBYUV_DISABLE_AVX2")) {
- cpu_info_ &= ~kCpuHasAVX2;
- }
- if (TestEnv("LIBYUV_DISABLE_ERMS")) {
- cpu_info_ &= ~kCpuHasERMS;
- }
- if (TestEnv("LIBYUV_DISABLE_FMA3")) {
- cpu_info_ &= ~kCpuHasFMA3;
- }
-#elif defined(__mips__) && defined(__linux__)
- // Linux mips parse text file for dsp detect.
- cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
-#if defined(__mips_dspr2)
- cpu_info_ |= kCpuHasMIPS_DSPR2;
-#endif
- cpu_info_ |= kCpuHasMIPS;
-
- if (getenv("LIBYUV_DISABLE_MIPS")) {
- cpu_info_ &= ~kCpuHasMIPS;
- }
- if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
- cpu_info_ &= ~kCpuHasMIPS_DSP;
- }
- if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
- cpu_info_ &= ~kCpuHasMIPS_DSPR2;
- }
-#elif defined(__arm__) || defined(__aarch64__)
-// gcc -mfpu=neon defines __ARM_NEON__
-// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
-// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
-#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
- cpu_info_ = kCpuHasNEON;
-// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
-// flag in it.
-// So for aarch64, neon enabling is hard coded here.
-#elif defined(__aarch64__)
- cpu_info_ = kCpuHasNEON;
-#else
- // Linux arm parse text file for neon detect.
- cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
-#endif
- cpu_info_ |= kCpuHasARM;
- if (TestEnv("LIBYUV_DISABLE_NEON")) {
- cpu_info_ &= ~kCpuHasNEON;
- }
-#endif // __arm__
- if (TestEnv("LIBYUV_DISABLE_ASM")) {
- cpu_info_ = 0;
- }
- return cpu_info_;
-}
-
-LIBYUV_API
-void MaskCpuFlags(int enable_flags) {
- cpu_info_ = InitCpuFlags() & enable_flags;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/format_conversion.cc b/src/main/jni/libyuv/source/format_conversion.cc
deleted file mode 100644
index 3c1737153..000000000
--- a/src/main/jni/libyuv/source/format_conversion.cc
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/format_conversion.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// generate a selector mask useful for pshufb
-static uint32 GenerateSelector(int select0, int select1) {
- return (uint32)(select0) |
- (uint32)((select1 + 4) << 8) |
- (uint32)((select0 + 8) << 16) |
- (uint32)((select1 + 12) << 24);
-}
-
-static int MakeSelectors(const int blue_index,
- const int green_index,
- const int red_index,
- uint32 dst_fourcc_bayer,
- uint32* index_map) {
- // Now build a lookup table containing the indices for the four pixels in each
- // 2x2 Bayer grid.
- switch (dst_fourcc_bayer) {
- case FOURCC_BGGR:
- index_map[0] = GenerateSelector(blue_index, green_index);
- index_map[1] = GenerateSelector(green_index, red_index);
- break;
- case FOURCC_GBRG:
- index_map[0] = GenerateSelector(green_index, blue_index);
- index_map[1] = GenerateSelector(red_index, green_index);
- break;
- case FOURCC_RGGB:
- index_map[0] = GenerateSelector(red_index, green_index);
- index_map[1] = GenerateSelector(green_index, blue_index);
- break;
- case FOURCC_GRBG:
- index_map[0] = GenerateSelector(green_index, red_index);
- index_map[1] = GenerateSelector(blue_index, green_index);
- break;
- default:
- return -1; // Bad FourCC
- }
- return 0;
-}
-
-// Converts 32 bit ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- int y;
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
- }
- }
-#endif
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
-
- for (y = 0; y < height; ++y) {
- ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
- src_argb += src_stride_argb;
- dst_bayer += dst_stride_bayer;
- }
- return 0;
-}
-
-#define AVG(a, b) (((a) + (b)) >> 1)
-
-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 r = src_bayer1[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- r = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[0];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 b = src_bayer1[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- b = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer0[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 b = src_bayer0[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[7] = 255U;
- b = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer1[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 r = src_bayer0[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- r = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[0];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- }
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- uint32 src_fourcc_bayer) {
- int y;
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- for (y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- dst_argb + dst_stride_argb, width);
- src_bayer += src_stride_bayer * 2;
- dst_argb += dst_stride_argb * 2;
- }
- if (height & 1) {
- BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
- }
- return 0;
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- uint32 src_fourcc_bayer) {
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
-
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- int halfheight;
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- {
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
- int y;
- for (y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
- src_bayer += src_stride_bayer * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
- free_aligned_buffer_64(row);
- }
- return 0;
-}
-
-// Convert I420 to Bayer.
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- // Negative height means invert the image.
- if (height < 0) {
- int halfheight;
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
- I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
- }
-#endif
-
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
- }
- }
-#endif
-
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
- {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- int y;
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row, width);
- ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
- dst_bayer += dst_stride_bayer;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row);
- }
- return 0;
-}
-
-#define MAKEBAYERFOURCC(BAYER) \
-LIBYUV_API \
-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_y, int dst_stride_y, \
- uint8* dst_u, int dst_stride_u, \
- uint8* dst_v, int dst_stride_v, \
- int width, int height) { \
- return BayerToI420(src_bayer, src_stride_bayer, \
- dst_y, dst_stride_y, \
- dst_u, dst_stride_u, \
- dst_v, dst_stride_v, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
- const uint8* src_u, int src_stride_u, \
- const uint8* src_v, int src_stride_v, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return I420ToBayer(src_y, src_stride_y, \
- src_u, src_stride_u, \
- src_v, src_stride_v, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return ARGBToBayer(src_argb, src_stride_argb, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_argb, int dst_stride_argb, \
- int width, int height) { \
- return BayerToARGB(src_bayer, src_stride_bayer, \
- dst_argb, dst_stride_argb, \
- width, height, \
- FOURCC_##BAYER); \
-}
-
-MAKEBAYERFOURCC(BGGR)
-MAKEBAYERFOURCC(GBRG)
-MAKEBAYERFOURCC(GRBG)
-MAKEBAYERFOURCC(RGGB)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/mjpeg_decoder.cc b/src/main/jni/libyuv/source/mjpeg_decoder.cc
deleted file mode 100644
index 36028c3cc..000000000
--- a/src/main/jni/libyuv/source/mjpeg_decoder.cc
+++ /dev/null
@@ -1,566 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/mjpeg_decoder.h"
-
-#ifdef HAVE_JPEG
-#include <assert.h>
-
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-// Must be included before jpeglib.
-#include <setjmp.h>
-#define HAVE_SETJMP
-#endif
-struct FILE; // For jpeglib.h.
-
-// C++ build requires extern C for jpeg internals.
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <jpeglib.h>
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#include "libyuv/planar_functions.h" // For CopyPlane().
-
-namespace libyuv {
-
-#ifdef HAVE_SETJMP
-struct SetJmpErrorMgr {
- jpeg_error_mgr base; // Must be at the top
- jmp_buf setjmp_buffer;
-};
-#endif
-
-const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
-const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
-const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
-const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
-const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
-const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
-
-// Methods that are passed to jpeglib.
-boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
-void init_source(jpeg_decompress_struct* cinfo);
-void skip_input_data(jpeg_decompress_struct* cinfo,
- long num_bytes); // NOLINT
-void term_source(jpeg_decompress_struct* cinfo);
-void ErrorHandler(jpeg_common_struct* cinfo);
-
-MJpegDecoder::MJpegDecoder()
- : has_scanline_padding_(LIBYUV_FALSE),
- num_outbufs_(0),
- scanlines_(NULL),
- scanlines_sizes_(NULL),
- databuf_(NULL),
- databuf_strides_(NULL) {
- decompress_struct_ = new jpeg_decompress_struct;
- source_mgr_ = new jpeg_source_mgr;
-#ifdef HAVE_SETJMP
- error_mgr_ = new SetJmpErrorMgr;
- decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
- // Override standard exit()-based error handler.
- error_mgr_->base.error_exit = &ErrorHandler;
-#endif
- decompress_struct_->client_data = NULL;
- source_mgr_->init_source = &init_source;
- source_mgr_->fill_input_buffer = &fill_input_buffer;
- source_mgr_->skip_input_data = &skip_input_data;
- source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
- source_mgr_->term_source = &term_source;
- jpeg_create_decompress(decompress_struct_);
- decompress_struct_->src = source_mgr_;
- buf_vec_.buffers = &buf_;
- buf_vec_.len = 1;
-}
-
-MJpegDecoder::~MJpegDecoder() {
- jpeg_destroy_decompress(decompress_struct_);
- delete decompress_struct_;
- delete source_mgr_;
-#ifdef HAVE_SETJMP
- delete error_mgr_;
-#endif
- DestroyOutputBuffers();
-}
-
-LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
- if (!ValidateJpeg(src, src_len)) {
- return LIBYUV_FALSE;
- }
-
- buf_.data = src;
- buf_.len = static_cast<int>(src_len);
- buf_vec_.pos = 0;
- decompress_struct_->client_data = &buf_vec_;
-#ifdef HAVE_SETJMP
- if (setjmp(error_mgr_->setjmp_buffer)) {
- // We called jpeg_read_header, it experienced an error, and we called
- // longjmp() and rewound the stack to here. Return error.
- return LIBYUV_FALSE;
- }
-#endif
- if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
- // ERROR: Bad MJPEG header
- return LIBYUV_FALSE;
- }
- AllocOutputBuffers(GetNumComponents());
- for (int i = 0; i < num_outbufs_; ++i) {
- int scanlines_size = GetComponentScanlinesPerImcuRow(i);
- if (scanlines_sizes_[i] != scanlines_size) {
- if (scanlines_[i]) {
- delete scanlines_[i];
- }
- scanlines_[i] = new uint8* [scanlines_size];
- scanlines_sizes_[i] = scanlines_size;
- }
-
- // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
- // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
- // the preceding scanlines, the padding is not needed/wanted because the
- // following addresses will already be valid (they are the initial bytes of
- // the next scanline) and will be overwritten when jpeglib writes out that
- // next scanline.
- int databuf_stride = GetComponentStride(i);
- int databuf_size = scanlines_size * databuf_stride;
- if (databuf_strides_[i] != databuf_stride) {
- if (databuf_[i]) {
- delete databuf_[i];
- }
- databuf_[i] = new uint8[databuf_size];
- databuf_strides_[i] = databuf_stride;
- }
-
- if (GetComponentStride(i) != GetComponentWidth(i)) {
- has_scanline_padding_ = LIBYUV_TRUE;
- }
- }
- return LIBYUV_TRUE;
-}
-
-static int DivideAndRoundUp(int numerator, int denominator) {
- return (numerator + denominator - 1) / denominator;
-}
-
-static int DivideAndRoundDown(int numerator, int denominator) {
- return numerator / denominator;
-}
-
-// Returns width of the last loaded frame.
-int MJpegDecoder::GetWidth() {
- return decompress_struct_->image_width;
-}
-
-// Returns height of the last loaded frame.
-int MJpegDecoder::GetHeight() {
- return decompress_struct_->image_height;
-}
-
-// Returns format of the last loaded frame. The return value is one of the
-// kColorSpace* constants.
-int MJpegDecoder::GetColorSpace() {
- return decompress_struct_->jpeg_color_space;
-}
-
-// Number of color components in the color space.
-int MJpegDecoder::GetNumComponents() {
- return decompress_struct_->num_components;
-}
-
-// Sample factors of the n-th component.
-int MJpegDecoder::GetHorizSampFactor(int component) {
- return decompress_struct_->comp_info[component].h_samp_factor;
-}
-
-int MJpegDecoder::GetVertSampFactor(int component) {
- return decompress_struct_->comp_info[component].v_samp_factor;
-}
-
-int MJpegDecoder::GetHorizSubSampFactor(int component) {
- return decompress_struct_->max_h_samp_factor /
- GetHorizSampFactor(component);
-}
-
-int MJpegDecoder::GetVertSubSampFactor(int component) {
- return decompress_struct_->max_v_samp_factor /
- GetVertSampFactor(component);
-}
-
-int MJpegDecoder::GetImageScanlinesPerImcuRow() {
- return decompress_struct_->max_v_samp_factor * DCTSIZE;
-}
-
-int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
- int vs = GetVertSubSampFactor(component);
- return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
-}
-
-int MJpegDecoder::GetComponentWidth(int component) {
- int hs = GetHorizSubSampFactor(component);
- return DivideAndRoundUp(GetWidth(), hs);
-}
-
-int MJpegDecoder::GetComponentHeight(int component) {
- int vs = GetVertSubSampFactor(component);
- return DivideAndRoundUp(GetHeight(), vs);
-}
-
-// Get width in bytes padded out to a multiple of DCTSIZE
-int MJpegDecoder::GetComponentStride(int component) {
- return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
-}
-
-int MJpegDecoder::GetComponentSize(int component) {
- return GetComponentWidth(component) * GetComponentHeight(component);
-}
-
-LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
-#ifdef HAVE_SETJMP
- if (setjmp(error_mgr_->setjmp_buffer)) {
- // We called jpeg_abort_decompress, it experienced an error, and we called
- // longjmp() and rewound the stack to here. Return error.
- return LIBYUV_FALSE;
- }
-#endif
- jpeg_abort_decompress(decompress_struct_);
- return LIBYUV_TRUE;
-}
-
-// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
- uint8** planes, int dst_width, int dst_height) {
- if (dst_width != GetWidth() ||
- dst_height > GetHeight()) {
- // ERROR: Bad dimensions
- return LIBYUV_FALSE;
- }
-#ifdef HAVE_SETJMP
- if (setjmp(error_mgr_->setjmp_buffer)) {
- // We called into jpeglib, it experienced an error sometime during this
- // function call, and we called longjmp() and rewound the stack to here.
- // Return error.
- return LIBYUV_FALSE;
- }
-#endif
- if (!StartDecode()) {
- return LIBYUV_FALSE;
- }
- SetScanlinePointers(databuf_);
- int lines_left = dst_height;
- // Compute amount of lines to skip to implement vertical crop.
- // TODO(fbarchard): Ensure skip is a multiple of maximum component
- // subsample. ie 2
- int skip = (GetHeight() - dst_height) / 2;
- if (skip > 0) {
- // There is no API to skip lines in the output data, so we read them
- // into the temp buffer.
- while (skip >= GetImageScanlinesPerImcuRow()) {
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- skip -= GetImageScanlinesPerImcuRow();
- }
- if (skip > 0) {
- // Have a partial iMCU row left over to skip. Must read it and then
- // copy the parts we want into the destination.
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- for (int i = 0; i < num_outbufs_; ++i) {
- // TODO(fbarchard): Compute skip to avoid this
- assert(skip % GetVertSubSampFactor(i) == 0);
- int rows_to_skip =
- DivideAndRoundDown(skip, GetVertSubSampFactor(i));
- int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
- rows_to_skip;
- int data_to_skip = rows_to_skip * GetComponentStride(i);
- CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
- planes[i], GetComponentWidth(i),
- GetComponentWidth(i), scanlines_to_copy);
- planes[i] += scanlines_to_copy * GetComponentWidth(i);
- }
- lines_left -= (GetImageScanlinesPerImcuRow() - skip);
- }
- }
-
- // Read full MCUs but cropped horizontally
- for (; lines_left > GetImageScanlinesPerImcuRow();
- lines_left -= GetImageScanlinesPerImcuRow()) {
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- for (int i = 0; i < num_outbufs_; ++i) {
- int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
- CopyPlane(databuf_[i], GetComponentStride(i),
- planes[i], GetComponentWidth(i),
- GetComponentWidth(i), scanlines_to_copy);
- planes[i] += scanlines_to_copy * GetComponentWidth(i);
- }
- }
-
- if (lines_left > 0) {
- // Have a partial iMCU row left over to decode.
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- for (int i = 0; i < num_outbufs_; ++i) {
- int scanlines_to_copy =
- DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
- CopyPlane(databuf_[i], GetComponentStride(i),
- planes[i], GetComponentWidth(i),
- GetComponentWidth(i), scanlines_to_copy);
- planes[i] += scanlines_to_copy * GetComponentWidth(i);
- }
- }
- return FinishDecode();
-}
-
-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
- int dst_width, int dst_height) {
- if (dst_width != GetWidth() ||
- dst_height > GetHeight()) {
- // ERROR: Bad dimensions
- return LIBYUV_FALSE;
- }
-#ifdef HAVE_SETJMP
- if (setjmp(error_mgr_->setjmp_buffer)) {
- // We called into jpeglib, it experienced an error sometime during this
- // function call, and we called longjmp() and rewound the stack to here.
- // Return error.
- return LIBYUV_FALSE;
- }
-#endif
- if (!StartDecode()) {
- return LIBYUV_FALSE;
- }
- SetScanlinePointers(databuf_);
- int lines_left = dst_height;
- // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
- int skip = (GetHeight() - dst_height) / 2;
- if (skip > 0) {
- while (skip >= GetImageScanlinesPerImcuRow()) {
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- skip -= GetImageScanlinesPerImcuRow();
- }
- if (skip > 0) {
- // Have a partial iMCU row left over to skip.
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- for (int i = 0; i < num_outbufs_; ++i) {
- // TODO(fbarchard): Compute skip to avoid this
- assert(skip % GetVertSubSampFactor(i) == 0);
- int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
- int data_to_skip = rows_to_skip * GetComponentStride(i);
- // Change our own data buffer pointers so we can pass them to the
- // callback.
- databuf_[i] += data_to_skip;
- }
- int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
- (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
- // Now change them back.
- for (int i = 0; i < num_outbufs_; ++i) {
- int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
- int data_to_skip = rows_to_skip * GetComponentStride(i);
- databuf_[i] -= data_to_skip;
- }
- lines_left -= scanlines_to_copy;
- }
- }
- // Read full MCUs until we get to the crop point.
- for (; lines_left >= GetImageScanlinesPerImcuRow();
- lines_left -= GetImageScanlinesPerImcuRow()) {
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
- }
- if (lines_left > 0) {
- // Have a partial iMCU row left over to decode.
- if (!DecodeImcuRow()) {
- FinishDecode();
- return LIBYUV_FALSE;
- }
- (*fn)(opaque, databuf_, databuf_strides_, lines_left);
- }
- return FinishDecode();
-}
-
-void init_source(j_decompress_ptr cinfo) {
- fill_input_buffer(cinfo);
-}
-
-boolean fill_input_buffer(j_decompress_ptr cinfo) {
- BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
- if (buf_vec->pos >= buf_vec->len) {
- assert(0 && "No more data");
- // ERROR: No more data
- return FALSE;
- }
- cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
- cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
- ++buf_vec->pos;
- return TRUE;
-}
-
-void skip_input_data(j_decompress_ptr cinfo,
- long num_bytes) { // NOLINT
- cinfo->src->next_input_byte += num_bytes;
-}
-
-void term_source(j_decompress_ptr cinfo) {
- // Nothing to do.
-}
-
-#ifdef HAVE_SETJMP
-void ErrorHandler(j_common_ptr cinfo) {
- // This is called when a jpeglib command experiences an error. Unfortunately
- // jpeglib's error handling model is not very flexible, because it expects the
- // error handler to not return--i.e., it wants the program to terminate. To
- // recover from errors we use setjmp() as shown in their example. setjmp() is
- // C's implementation for the "call with current continuation" functionality
- // seen in some functional programming languages.
- // A formatted message can be output, but is unsafe for release.
-#ifdef DEBUG
- char buf[JMSG_LENGTH_MAX];
- (*cinfo->err->format_message)(cinfo, buf);
- // ERROR: Error in jpeglib: buf
-#endif
-
- SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
- // This rewinds the call stack to the point of the corresponding setjmp()
- // and causes it to return (for a second time) with value 1.
- longjmp(mgr->setjmp_buffer, 1);
-}
-#endif
-
-void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
- if (num_outbufs != num_outbufs_) {
- // We could perhaps optimize this case to resize the output buffers without
- // necessarily having to delete and recreate each one, but it's not worth
- // it.
- DestroyOutputBuffers();
-
- scanlines_ = new uint8** [num_outbufs];
- scanlines_sizes_ = new int[num_outbufs];
- databuf_ = new uint8* [num_outbufs];
- databuf_strides_ = new int[num_outbufs];
-
- for (int i = 0; i < num_outbufs; ++i) {
- scanlines_[i] = NULL;
- scanlines_sizes_[i] = 0;
- databuf_[i] = NULL;
- databuf_strides_[i] = 0;
- }
-
- num_outbufs_ = num_outbufs;
- }
-}
-
-void MJpegDecoder::DestroyOutputBuffers() {
- for (int i = 0; i < num_outbufs_; ++i) {
- delete [] scanlines_[i];
- delete [] databuf_[i];
- }
- delete [] scanlines_;
- delete [] databuf_;
- delete [] scanlines_sizes_;
- delete [] databuf_strides_;
- scanlines_ = NULL;
- databuf_ = NULL;
- scanlines_sizes_ = NULL;
- databuf_strides_ = NULL;
- num_outbufs_ = 0;
-}
-
-// JDCT_IFAST and do_block_smoothing improve performance substantially.
-LIBYUV_BOOL MJpegDecoder::StartDecode() {
- decompress_struct_->raw_data_out = TRUE;
- decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
- decompress_struct_->dither_mode = JDITHER_NONE;
- // Not applicable to 'raw':
- decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
- // Only for buffered mode:
- decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
- // Blocky but fast:
- decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
-
- if (!jpeg_start_decompress(decompress_struct_)) {
- // ERROR: Couldn't start JPEG decompressor";
- return LIBYUV_FALSE;
- }
- return LIBYUV_TRUE;
-}
-
-LIBYUV_BOOL MJpegDecoder::FinishDecode() {
- // jpeglib considers it an error if we finish without decoding the whole
- // image, so we call "abort" rather than "finish".
- jpeg_abort_decompress(decompress_struct_);
- return LIBYUV_TRUE;
-}
-
-void MJpegDecoder::SetScanlinePointers(uint8** data) {
- for (int i = 0; i < num_outbufs_; ++i) {
- uint8* data_i = data[i];
- for (int j = 0; j < scanlines_sizes_[i]; ++j) {
- scanlines_[i][j] = data_i;
- data_i += GetComponentStride(i);
- }
- }
-}
-
-inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
- return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
- jpeg_read_raw_data(decompress_struct_,
- scanlines_,
- GetImageScanlinesPerImcuRow());
-}
-
-// The helper function which recognizes the jpeg sub-sampling type.
-JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
- int* subsample_x, int* subsample_y, int number_of_components) {
- if (number_of_components == 3) { // Color images.
- if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 2 && subsample_y[1] == 2 &&
- subsample_x[2] == 2 && subsample_y[2] == 2) {
- return kJpegYuv420;
- } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 2 && subsample_y[1] == 1 &&
- subsample_x[2] == 2 && subsample_y[2] == 1) {
- return kJpegYuv422;
- } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 1 && subsample_y[1] == 1 &&
- subsample_x[2] == 1 && subsample_y[2] == 1) {
- return kJpegYuv444;
- }
- } else if (number_of_components == 1) { // Grey-scale images.
- if (subsample_x[0] == 1 && subsample_y[0] == 1) {
- return kJpegYuv400;
- }
- }
- return kJpegUnknown;
-}
-
-} // namespace libyuv
-#endif // HAVE_JPEG
-
diff --git a/src/main/jni/libyuv/source/mjpeg_validate.cc b/src/main/jni/libyuv/source/mjpeg_validate.cc
deleted file mode 100644
index 23d22d099..000000000
--- a/src/main/jni/libyuv/source/mjpeg_validate.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/mjpeg_decoder.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Helper function to validate the jpeg appears intact.
-// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
- size_t i;
- if (sample_size < 64) {
- // ERROR: Invalid jpeg size: sample_size
- return LIBYUV_FALSE;
- }
- if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
- // ERROR: Invalid jpeg initial start code
- return LIBYUV_FALSE;
- }
- for (i = sample_size - 2; i > 1;) {
- if (sample[i] != 0xd9) {
- if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
- return LIBYUV_TRUE; // Success: Valid jpeg.
- }
- --i;
- }
- --i;
- }
- // ERROR: Invalid jpeg end code not found. Size sample_size
- return LIBYUV_FALSE;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
diff --git a/src/main/jni/libyuv/source/planar_functions.cc b/src/main/jni/libyuv/source/planar_functions.cc
deleted file mode 100644
index 3857008ca..000000000
--- a/src/main/jni/libyuv/source/planar_functions.cc
+++ /dev/null
@@ -1,2291 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/planar_functions.h"
-
-#include <string.h> // for memset()
-
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy a plane of data
-LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- int y;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
- // Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_y = 0;
- }
- // Nothing to do.
- if (src_y == dst_y && src_stride_y == dst_stride_y) {
- return;
- }
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height; ++y) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
-LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
- uint16* dst_y, int dst_stride_y,
- int width, int height) {
- int y;
- void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
- // Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_y = 0;
- }
-#if defined(HAS_COPYROW_16_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_16_X86;
- }
-#endif
-#if defined(HAS_COPYROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_16_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_16_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_16_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_16_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_16_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_16_MIPS;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height; ++y) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
-// Copy I422.
-LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int halfwidth = (width + 1) >> 1;
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
- return 0;
-}
-
-// Copy I444.
-LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
-}
-
-// Copy I400.
-LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- if (!src_y || !dst_y || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- return 0;
-}
-
-// Convert I420 to I400.
-LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- if (!src_y || !dst_y || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- return 0;
-}
-
-// Mirror a plane of data.
-void MirrorPlane(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- int y;
- void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
- }
-#endif
-#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSE2;
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
-#endif
-
- // Mirror plane
- for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
-}
-
-// Convert YUY2 to I422.
-LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*YUY2ToUV422Row)(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) =
- YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
- YUY2ToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_yuy2 == width * 2 &&
- dst_stride_y == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
- width *= height;
- height = 1;
- src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
- }
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
- YUY2ToYRow = YUY2ToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
- YUY2ToYRow = YUY2ToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUV422Row = YUY2ToUV422Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-// Convert UYVY to I422.
-LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*UYVYToUV422Row)(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) =
- UYVYToUV422Row_C;
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int pix) = UYVYToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_uyvy == width * 2 &&
- dst_stride_y == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
- width *= height;
- height = 1;
- src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
- }
-#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
- UYVYToYRow = UYVYToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- UYVYToUV422Row = UYVYToUV422Row_AVX2;
- UYVYToYRow = UYVYToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width >= 16) {
- UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
- }
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_NEON;
- UYVYToUV422Row = UYVYToUV422Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
- UYVYToYRow(src_uyvy, dst_y, width);
- src_uyvy += src_stride_uyvy;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-// Mirror I400 with optional flipping
-LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- if (!src_y || !dst_y ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
-
- MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- return 0;
-}
-
-// Mirror I420 with optional flipping
-LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
- return 0;
-}
-
-// ARGB mirror.
-LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
- ARGBMirrorRow_C;
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
- }
-#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
- ARGBMirrorRow = ARGBMirrorRow_AVX2;
- }
-#endif
-#if defined(HAS_ARGBMIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
- ARGBMirrorRow = ARGBMirrorRow_NEON;
- }
-#endif
-
- // Mirror plane
- for (y = 0; y < height; ++y) {
- ARGBMirrorRow(src_argb, dst_argb, width);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Get a blender that optimized for the CPU, alignment and pixel count.
-// As there are 6 blenders to choose from, the caller should try to use
-// the same blend function for all pixels if possible.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend() {
- void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width) = ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBBlendRow = ARGBBlendRow_SSSE3;
- return ARGBBlendRow;
- }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBBlendRow = ARGBBlendRow_SSE2;
- }
-#endif
-#if defined(HAS_ARGBBLENDROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBBlendRow = ARGBBlendRow_NEON;
- }
-#endif
- return ARGBBlendRow;
-}
-
-// Alpha Blend 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width) = GetARGBBlend();
- if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
- }
-
- for (y = 0; y < height; ++y) {
- ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
- src_argb0 += src_stride_argb0;
- src_argb1 += src_stride_argb1;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Multiply 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
- int width) = ARGBMultiplyRow_C;
- if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBMULTIPLYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
- ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBMULTIPLYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
- ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBMULTIPLYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBMultiplyRow = ARGBMultiplyRow_NEON;
- }
- }
-#endif
-
- // Multiply plane
- for (y = 0; y < height; ++y) {
- ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
- src_argb0 += src_stride_argb0;
- src_argb1 += src_stride_argb1;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Add 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
- int width) = ARGBAddRow_C;
- if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBAddRow = ARGBAddRow_SSE2;
- }
-#endif
-#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
- ARGBAddRow = ARGBAddRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBAddRow = ARGBAddRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBADDROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
- ARGBAddRow = ARGBAddRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBAddRow = ARGBAddRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBADDROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBAddRow = ARGBAddRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBAddRow = ARGBAddRow_NEON;
- }
- }
-#endif
-
- // Add plane
- for (y = 0; y < height; ++y) {
- ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
- src_argb0 += src_stride_argb0;
- src_argb1 += src_stride_argb1;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Subtract 2 ARGB images and store to destination.
-LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
- int width) = ARGBSubtractRow_C;
- if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBSUBTRACTROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
- ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBSubtractRow = ARGBSubtractRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBSUBTRACTROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
- ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBSubtractRow = ARGBSubtractRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBSUBTRACTROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBSubtractRow = ARGBSubtractRow_NEON;
- }
- }
-#endif
-
- // Subtract plane
- for (y = 0; y < height; ++y) {
- ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
- src_argb0 += src_stride_argb0;
- src_argb1 += src_stride_argb1;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height) {
- int y;
- void (*I422ToBGRARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToBGRARow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_bgra ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
- dst_stride_bgra = -dst_stride_bgra;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_bgra == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
- }
-#if defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToBGRARow = I422ToBGRARow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToBGRARow = I422ToBGRARow_NEON;
- }
- }
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
- }
- }
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
- I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
- dst_bgra += dst_stride_bgra;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- int y;
- void (*I422ToABGRRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToABGRRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_abgr ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
- dst_stride_abgr = -dst_stride_abgr;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_abgr == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
- }
-#if defined(HAS_I422TOABGRROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToABGRRow = I422ToABGRRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToABGRRow = I422ToABGRRow_NEON;
- }
- }
-#elif defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
- dst_abgr += dst_stride_abgr;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height) {
- int y;
- void (*I422ToRGBARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_rgba ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_rgba == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
- }
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#elif defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
- int y;
- void (*NV12ToRGB565Row)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
- }
- }
-#elif defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
-
-// Convert NV21 to RGB565.
-LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_vu, int src_stride_vu,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
- int y;
- void (*NV21ToRGB565Row)(const uint8* y_buf,
- const uint8* src_vu,
- uint8* rgb_buf,
- int width) = NV21ToRGB565Row_C;
- if (!src_y || !src_vu || !dst_rgb565 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV21TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
- }
- }
-#elif defined(HAS_NV21TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV21ToRGB565Row = NV21ToRGB565Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
- }
- }
- return 0;
-}
-
-LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
- int width, int height,
- uint32 value) {
- int y;
- uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
- void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
- // Coalesce rows.
- if (dst_stride_y == width) {
- width *= height;
- height = 1;
- dst_stride_y = 0;
- }
-#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- SetRow = SetRow_NEON;
- }
-#endif
-#if defined(HAS_SETROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- SetRow = SetRow_X86;
- }
-#endif
-
- // Set plane
- for (y = 0; y < height; ++y) {
- SetRow(dst_y, v32, width);
- dst_y += dst_stride_y;
- }
-}
-
-// Draw a rectangle into I420
-LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int x, int y,
- int width, int height,
- int value_y, int value_u, int value_v) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- uint8* start_y = dst_y + y * dst_stride_y + x;
- uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
- uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
- if (!dst_y || !dst_u || !dst_v ||
- width <= 0 || height <= 0 ||
- x < 0 || y < 0 ||
- value_y < 0 || value_y > 255 ||
- value_u < 0 || value_u > 255 ||
- value_v < 0 || value_v > 255) {
- return -1;
- }
-
- SetPlane(start_y, dst_stride_y, width, height, value_y);
- SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
- SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
- return 0;
-}
-
-// Draw a rectangle into ARGB
-LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
- int dst_x, int dst_y,
- int width, int height,
- uint32 value) {
- if (!dst_argb ||
- width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
- return -1;
- }
- dst_argb += dst_y * dst_stride_argb + dst_x * 4;
- // Coalesce rows.
- if (dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- dst_stride_argb = 0;
- }
-#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
- return 0;
- }
-#endif
-#if defined(HAS_SETROW_X86)
- if (TestCpuFlag(kCpuHasX86)) {
- ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
- return 0;
- }
-#endif
- ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
- return 0;
-}
-
-// Convert unattentuated ARGB to preattenuated ARGB.
-// An unattenutated ARGB alpha blend uses the formula
-// p = a * f + (1 - a) * b
-// where
-// p is output pixel
-// f is foreground pixel
-// b is background pixel
-// a is alpha value from foreground pixel
-// An preattenutated ARGB alpha blend uses the formula
-// p = f + (1 - a) * b
-// where
-// f is foreground pixel premultiplied by alpha
-
-LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBAttenuateRow_C;
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 4)) {
- ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBAttenuateRow(src_argb, dst_argb, width);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert preattentuated ARGB to unattenuated ARGB.
-LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBUnattenuateRow_C;
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
- ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
- ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
- }
- }
-#endif
-// TODO(fbarchard): Neon version.
-
- for (y = 0; y < height; ++y) {
- ARGBUnattenuateRow(src_argb, dst_argb, width);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert ARGB to Grayed ARGB.
-LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBGrayRow_C;
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBGrayRow = ARGBGrayRow_SSSE3;
- }
-#elif defined(HAS_ARGBGRAYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- ARGBGrayRow = ARGBGrayRow_NEON;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBGrayRow(src_argb, dst_argb, width);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Make a rectangle of ARGB gray scale.
-LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
- int dst_x, int dst_y,
- int width, int height) {
- int y;
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBGrayRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
- return -1;
- }
- // Coalesce rows.
- if (dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBGrayRow = ARGBGrayRow_SSSE3;
- }
-#elif defined(HAS_ARGBGRAYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- ARGBGrayRow = ARGBGrayRow_NEON;
- }
-#endif
- for (y = 0; y < height; ++y) {
- ARGBGrayRow(dst, dst, width);
- dst += dst_stride_argb;
- }
- return 0;
-}
-
-// Make a rectangle of ARGB Sepia tone.
-LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
- int dst_x, int dst_y, int width, int height) {
- int y;
- void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
- return -1;
- }
- // Coalesce rows.
- if (dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBSEPIAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBSepiaRow = ARGBSepiaRow_SSSE3;
- }
-#elif defined(HAS_ARGBSEPIAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- ARGBSepiaRow = ARGBSepiaRow_NEON;
- }
-#endif
- for (y = 0; y < height; ++y) {
- ARGBSepiaRow(dst, width);
- dst += dst_stride_argb;
- }
- return 0;
-}
-
-// Apply a 4x4 matrix to each ARGB pixel.
-// Note: Normally for shading, but can be used to swizzle or invert.
-LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- const int8* matrix_argb,
- int width, int height) {
- int y;
- void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
- if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
- }
-#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
- }
-#endif
- for (y = 0; y < height; ++y) {
- ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Apply a 4x3 matrix to each ARGB pixel.
-// Deprecated.
-LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
- const int8* matrix_rgb,
- int dst_x, int dst_y, int width, int height) {
- SIMD_ALIGNED(int8 matrix_argb[16]);
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
- return -1;
- }
-
- // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
- matrix_argb[0] = matrix_rgb[0] / 2;
- matrix_argb[1] = matrix_rgb[1] / 2;
- matrix_argb[2] = matrix_rgb[2] / 2;
- matrix_argb[3] = matrix_rgb[3] / 2;
- matrix_argb[4] = matrix_rgb[4] / 2;
- matrix_argb[5] = matrix_rgb[5] / 2;
- matrix_argb[6] = matrix_rgb[6] / 2;
- matrix_argb[7] = matrix_rgb[7] / 2;
- matrix_argb[8] = matrix_rgb[8] / 2;
- matrix_argb[9] = matrix_rgb[9] / 2;
- matrix_argb[10] = matrix_rgb[10] / 2;
- matrix_argb[11] = matrix_rgb[11] / 2;
- matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
- matrix_argb[15] = 64; // 1.0
-
- return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
- dst, dst_stride_argb,
- &matrix_argb[0], width, height);
-}
-
-// Apply a color table each ARGB pixel.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
- const uint8* table_argb,
- int dst_x, int dst_y, int width, int height) {
- int y;
- void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
- int width) = ARGBColorTableRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
- return -1;
- }
- // Coalesce rows.
- if (dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBCOLORTABLEROW_X86)
- if (TestCpuFlag(kCpuHasX86)) {
- ARGBColorTableRow = ARGBColorTableRow_X86;
- }
-#endif
- for (y = 0; y < height; ++y) {
- ARGBColorTableRow(dst, table_argb, width);
- dst += dst_stride_argb;
- }
- return 0;
-}
-
-// Apply a color table each ARGB pixel but preserve destination alpha.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
- const uint8* table_argb,
- int dst_x, int dst_y, int width, int height) {
- int y;
- void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
- int width) = RGBColorTableRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
- return -1;
- }
- // Coalesce rows.
- if (dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- dst_stride_argb = 0;
- }
-#if defined(HAS_RGBCOLORTABLEROW_X86)
- if (TestCpuFlag(kCpuHasX86)) {
- RGBColorTableRow = RGBColorTableRow_X86;
- }
-#endif
- for (y = 0; y < height; ++y) {
- RGBColorTableRow(dst, table_argb, width);
- dst += dst_stride_argb;
- }
- return 0;
-}
-
-// ARGBQuantize is used to posterize art.
-// e.g. rgb / qvalue * qvalue + qvalue / 2
-// But the low levels implement efficiently with 3 parameters, and could be
-// used for other high level operations.
-// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
-// where scale is 1 / interval_size as a fixed point value.
-// The divide is replaces with a multiply by reciprocal fixed point multiply.
-// Caveat - although SSE2 saturates, the C function does not and should be used
-// with care if doing anything but quantization.
-LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
- int scale, int interval_size, int interval_offset,
- int dst_x, int dst_y, int width, int height) {
- int y;
- void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) = ARGBQuantizeRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
- interval_size < 1 || interval_size > 255) {
- return -1;
- }
- // Coalesce rows.
- if (dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBQUANTIZEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
- }
-#elif defined(HAS_ARGBQUANTIZEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- ARGBQuantizeRow = ARGBQuantizeRow_NEON;
- }
-#endif
- for (y = 0; y < height; ++y) {
- ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
- dst += dst_stride_argb;
- }
- return 0;
-}
-
-// Computes table of cumulative sum for image where the value is the sum
-// of all values above and to the left of the entry. Used by ARGBBlur.
-LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
- int32* dst_cumsum, int dst_stride32_cumsum,
- int width, int height) {
- int y;
- void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
- int32* previous_cumsum = dst_cumsum;
- if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
- return -1;
- }
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
- }
-#endif
- memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
- for (y = 0; y < height; ++y) {
- ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
- previous_cumsum = dst_cumsum;
- dst_cumsum += dst_stride32_cumsum;
- src_argb += src_stride_argb;
- }
- return 0;
-}
-
-// Blur ARGB image.
-// Caller should allocate CumulativeSum table of width * height * 16 bytes
-// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
-// as the buffer is treated as circular.
-LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int32* dst_cumsum, int dst_stride32_cumsum,
- int width, int height, int radius) {
- int y;
- void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
- const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
- void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
- int32* cumsum_bot_row;
- int32* max_cumsum_bot_row;
- int32* cumsum_top_row;
-
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- if (radius > height) {
- radius = height;
- }
- if (radius > (width / 2 - 1)) {
- radius = width / 2 - 1;
- }
- if (radius <= 0) {
- return -1;
- }
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
- CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
- }
-#endif
- // Compute enough CumulativeSum for first row to be blurred. After this
- // one row of CumulativeSum is updated at a time.
- ARGBComputeCumulativeSum(src_argb, src_stride_argb,
- dst_cumsum, dst_stride32_cumsum,
- width, radius);
-
- src_argb = src_argb + radius * src_stride_argb;
- cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
-
- max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
- cumsum_top_row = &dst_cumsum[0];
-
- for (y = 0; y < height; ++y) {
- int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
- int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
- int area = radius * (bot_y - top_y);
- int boxwidth = radius * 4;
- int x;
- int n;
-
- // Increment cumsum_top_row pointer with circular buffer wrap around.
- if (top_y) {
- cumsum_top_row += dst_stride32_cumsum;
- if (cumsum_top_row >= max_cumsum_bot_row) {
- cumsum_top_row = dst_cumsum;
- }
- }
- // Increment cumsum_bot_row pointer with circular buffer wrap around and
- // then fill in a row of CumulativeSum.
- if ((y + radius) < height) {
- const int32* prev_cumsum_bot_row = cumsum_bot_row;
- cumsum_bot_row += dst_stride32_cumsum;
- if (cumsum_bot_row >= max_cumsum_bot_row) {
- cumsum_bot_row = dst_cumsum;
- }
- ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
- width);
- src_argb += src_stride_argb;
- }
-
- // Left clipped.
- for (x = 0; x < radius + 1; ++x) {
- CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], 1);
- area += (bot_y - top_y);
- boxwidth += 4;
- }
-
- // Middle unclipped.
- n = (width - 1) - radius - x + 1;
- CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], n);
-
- // Right clipped.
- for (x += n; x <= width - 1; ++x) {
- area -= (bot_y - top_y);
- boxwidth -= 4;
- CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
- cumsum_bot_row + (x - radius - 1) * 4,
- boxwidth, area, &dst_argb[x * 4], 1);
- }
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Multiply ARGB image by a specified ARGB value.
-LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, uint32 value) {
- int y;
- void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
- int width, uint32 value) = ARGBShadeRow_C;
- if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBSHADEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBShadeRow = ARGBShadeRow_SSE2;
- }
-#elif defined(HAS_ARGBSHADEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- ARGBShadeRow = ARGBShadeRow_NEON;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBShadeRow(src_argb, dst_argb, width, value);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Interpolate 2 ARGB images by specified amount (0 to 255).
-LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int interpolation) {
- int y;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) = InterpolateRow_C;
- if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
- }
-#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
- InterpolateRow = InterpolateRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
- IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
- IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
- width * 4, interpolation);
- src_argb0 += src_stride_argb0;
- src_argb1 += src_stride_argb1;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Shuffle ARGB channel order. e.g. BGRA to ARGB.
-LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- const uint8* shuffler, int width, int height) {
- int y;
- void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
- const uint8* shuffler, int pix) = ARGBShuffleRow_C;
- if (!src_bgra || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_bgra = src_bgra + (height - 1) * src_stride_bgra;
- src_stride_bgra = -src_stride_bgra;
- }
- // Coalesce rows.
- if (src_stride_bgra == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_bgra = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBSHUFFLEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
- ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBShuffleRow = ARGBShuffleRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBShuffleRow = ARGBShuffleRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_ARGBSHUFFLEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
- ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGBShuffleRow = ARGBShuffleRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBSHUFFLEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
- ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
- ARGBShuffleRow = ARGBShuffleRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
- src_bgra += src_stride_bgra;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- void (*SobelRow)(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst, int width)) {
- int y;
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerGGRow_C;
- void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) = SobelYRow_C;
- void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobely, int width) =
- SobelXRow_C;
- const int kEdge = 16; // Extra pixels at start of row for extrude/align.
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // ARGBToBayer used to select G channel from ARGB.
-#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOBAYERGGROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerGGRow_NEON;
- }
- }
-#endif
-#if defined(HAS_SOBELYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- SobelYRow = SobelYRow_SSE2;
- }
-#endif
-#if defined(HAS_SOBELYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SobelYRow = SobelYRow_NEON;
- }
-#endif
-#if defined(HAS_SOBELXROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- SobelXRow = SobelXRow_SSE2;
- }
-#endif
-#if defined(HAS_SOBELXROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SobelXRow = SobelXRow_NEON;
- }
-#endif
- {
- // 3 rows with edges before/after.
- const int kRowSize = (width + kEdge + 15) & ~15;
- align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
- uint8* row_sobelx = rows;
- uint8* row_sobely = rows + kRowSize;
- uint8* row_y = rows + kRowSize * 2;
-
- // Convert first row.
- uint8* row_y0 = row_y + kEdge;
- uint8* row_y1 = row_y0 + kRowSize;
- uint8* row_y2 = row_y1 + kRowSize;
- ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
- row_y0[-1] = row_y0[0];
- memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
- ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
- row_y1[-1] = row_y1[0];
- memset(row_y1 + width, row_y1[width - 1], 16);
- memset(row_y2 + width, 0, 16);
-
- for (y = 0; y < height; ++y) {
- // Convert next row of ARGB to Y.
- if (y < (height - 1)) {
- src_argb += src_stride_argb;
- }
- ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
- row_y2[-1] = row_y2[0];
- row_y2[width] = row_y2[width - 1];
-
- SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
- SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
- SobelRow(row_sobelx, row_sobely, dst_argb, width);
-
- // Cycle thru circular queue of 3 row_y buffers.
- {
- uint8* row_yt = row_y0;
- row_y0 = row_y1;
- row_y1 = row_y2;
- row_y2 = row_yt;
- }
-
- dst_argb += dst_stride_argb;
- }
- free_aligned_buffer_64(rows);
- }
- return 0;
-}
-
-// Sobel ARGB effect.
-LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) = SobelRow_C;
-#if defined(HAS_SOBELROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- SobelRow = SobelRow_SSE2;
- }
-#endif
-#if defined(HAS_SOBELROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- SobelRow = SobelRow_NEON;
- }
-#endif
- return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
- width, height, SobelRow);
-}
-
-// Sobel ARGB effect with planar output.
-LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_, int width) = SobelToPlaneRow_C;
-#if defined(HAS_SOBELTOPLANEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- SobelToPlaneRow = SobelToPlaneRow_SSE2;
- }
-#endif
-#if defined(HAS_SOBELTOPLANEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- SobelToPlaneRow = SobelToPlaneRow_NEON;
- }
-#endif
- return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
- width, height, SobelToPlaneRow);
-}
-
-// SobelXY ARGB effect.
-// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
-LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) = SobelXYRow_C;
-#if defined(HAS_SOBELXYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- SobelXYRow = SobelXYRow_SSE2;
- }
-#endif
-#if defined(HAS_SOBELXYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- SobelXYRow = SobelXYRow_NEON;
- }
-#endif
- return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
- width, height, SobelXYRow);
-}
-
-// Apply a 4x4 polynomial to each ARGB pixel.
-LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- const float* poly,
- int width, int height) {
- int y;
- void (*ARGBPolynomialRow)(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) = ARGBPolynomialRow_C;
- if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
- ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
- }
-#endif
-#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
- IS_ALIGNED(width, 2)) {
- ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBPolynomialRow(src_argb, dst_argb, poly, width);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Apply a lumacolortable to each ARGB pixel.
-LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- const uint8* luma,
- int width, int height) {
- int y;
- void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
- int width, const uint8* luma, const uint32 lumacoeff) =
- ARGBLumaColorTableRow_C;
- if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
- ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Copy Alpha from one ARGB image to another.
-LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
- ARGBCopyAlphaRow_C;
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
- }
-#endif
-#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBCopyAlphaRow(src_argb, dst_argb, width);
- src_argb += src_stride_argb;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Copy a planar Y channel to the alpha channel of a destination ARGB image.
-LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
- ARGBCopyYToAlphaRow_C;
- if (!src_y || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
- }
-#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGBCopyYToAlphaRow(src_y, dst_argb, width);
- src_y += src_stride_y;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/rotate.cc b/src/main/jni/libyuv/source/rotate.cc
deleted file mode 100644
index fe0e72b13..000000000
--- a/src/main/jni/libyuv/source/rotate.cc
+++ /dev/null
@@ -1,1315 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate.h"
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/convert.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#if defined(__APPLE__) && defined(__i386__)
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".private_extern _" #name " \n" \
- ".align 4,0x90 \n" \
-"_" #name ": \n"
-#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".align 4,0x90 \n" \
-"_" #name ": \n"
-#else
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".align 4,0x90 \n" \
-#name ": \n"
-#endif
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_MIRRORROW_UV_NEON
-void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
-#define HAS_TRANSPOSE_WX8_NEON
-void TransposeWx8_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-#define HAS_TRANSPOSE_UVWX8_NEON
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width);
-#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__aarch64__) || defined(LIBYUV_NEON))
-// #define HAS_MIRRORROW_NEON
-// void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-// #define HAS_MIRRORROW_UV_NEON
-// void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
-// #define HAS_TRANSPOSE_WX8_NEON
-// void TransposeWx8_NEON(const uint8* src, int src_stride,
-// uint8* dst, int dst_stride, int width);
-// #define HAS_TRANSPOSE_UVWX8_NEON
-// void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-// uint8* dst_a, int dst_stride_a,
-// uint8* dst_b, int dst_stride_b,
-// int width);
-#endif // defined(__ARM_NEON__)
-
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
- defined(__mips__) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-
-void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width);
-#endif // defined(__mips__)
-
-#if !defined(LIBYUV_DISABLE_X86) && \
- defined(_M_IX86) && defined(_MSC_VER)
-#define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked) __declspec(align(16))
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- __asm {
- push edi
- push esi
- push ebp
- mov eax, [esp + 12 + 4] // src
- mov edi, [esp + 12 + 8] // src_stride
- mov edx, [esp + 12 + 12] // dst
- mov esi, [esp + 12 + 16] // dst_stride
- mov ecx, [esp + 12 + 20] // width
-
- // Read in the data from the source pointer.
- // First round of bit swap.
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea ebp, [eax + 8]
- movq xmm1, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm0, xmm1
- movq xmm2, qword ptr [eax]
- movdqa xmm1, xmm0
- palignr xmm1, xmm1, 8
- movq xmm3, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm2, xmm3
- movdqa xmm3, xmm2
- movq xmm4, qword ptr [eax]
- palignr xmm3, xmm3, 8
- movq xmm5, qword ptr [eax + edi]
- punpcklbw xmm4, xmm5
- lea eax, [eax + 2 * edi]
- movdqa xmm5, xmm4
- movq xmm6, qword ptr [eax]
- palignr xmm5, xmm5, 8
- movq xmm7, qword ptr [eax + edi]
- punpcklbw xmm6, xmm7
- mov eax, ebp
- movdqa xmm7, xmm6
- palignr xmm7, xmm7, 8
- // Second round of bit swap.
- punpcklwd xmm0, xmm2
- punpcklwd xmm1, xmm3
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- palignr xmm2, xmm2, 8
- palignr xmm3, xmm3, 8
- punpcklwd xmm4, xmm6
- punpcklwd xmm5, xmm7
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- palignr xmm6, xmm6, 8
- palignr xmm7, xmm7, 8
- // Third round of bit swap.
- // Write to the destination pointer.
- punpckldq xmm0, xmm4
- movq qword ptr [edx], xmm0
- movdqa xmm4, xmm0
- palignr xmm4, xmm4, 8
- movq qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- punpckldq xmm2, xmm6
- movdqa xmm6, xmm2
- palignr xmm6, xmm6, 8
- movq qword ptr [edx], xmm2
- punpckldq xmm1, xmm5
- movq qword ptr [edx + esi], xmm6
- lea edx, [edx + 2 * esi]
- movdqa xmm5, xmm1
- movq qword ptr [edx], xmm1
- palignr xmm5, xmm5, 8
- punpckldq xmm3, xmm7
- movq qword ptr [edx + esi], xmm5
- lea edx, [edx + 2 * esi]
- movq qword ptr [edx], xmm3
- movdqa xmm7, xmm3
- palignr xmm7, xmm7, 8
- sub ecx, 8
- movq qword ptr [edx + esi], xmm7
- lea edx, [edx + 2 * esi]
- jg convertloop
-
- pop ebp
- pop esi
- pop edi
- ret
- }
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked) __declspec(align(16))
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int w) {
- __asm {
- push ebx
- push esi
- push edi
- push ebp
- mov eax, [esp + 16 + 4] // src
- mov edi, [esp + 16 + 8] // src_stride
- mov edx, [esp + 16 + 12] // dst_a
- mov esi, [esp + 16 + 16] // dst_stride_a
- mov ebx, [esp + 16 + 20] // dst_b
- mov ebp, [esp + 16 + 24] // dst_stride_b
- mov ecx, esp
- sub esp, 4 + 16
- and esp, ~15
- mov [esp + 16], ecx
- mov ecx, [ecx + 16 + 28] // w
-
- align 4
- convertloop:
- // Read in the data from the source pointer.
- // First round of bit swap.
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm0 // use xmm7 as temp register.
- punpcklbw xmm0, xmm1
- punpckhbw xmm7, xmm1
- movdqa xmm1, xmm7
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm2
- punpcklbw xmm2, xmm3
- punpckhbw xmm7, xmm3
- movdqa xmm3, xmm7
- movdqa xmm4, [eax]
- movdqa xmm5, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm4
- punpcklbw xmm4, xmm5
- punpckhbw xmm7, xmm5
- movdqa xmm5, xmm7
- movdqa xmm6, [eax]
- movdqa xmm7, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa [esp], xmm5 // backup xmm5
- neg edi
- movdqa xmm5, xmm6 // use xmm5 as temp register.
- punpcklbw xmm6, xmm7
- punpckhbw xmm5, xmm7
- movdqa xmm7, xmm5
- lea eax, [eax + 8 * edi + 16]
- neg edi
- // Second round of bit swap.
- movdqa xmm5, xmm0
- punpcklwd xmm0, xmm2
- punpckhwd xmm5, xmm2
- movdqa xmm2, xmm5
- movdqa xmm5, xmm1
- punpcklwd xmm1, xmm3
- punpckhwd xmm5, xmm3
- movdqa xmm3, xmm5
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm6
- punpckhwd xmm5, xmm6
- movdqa xmm6, xmm5
- movdqa xmm5, [esp] // restore xmm5
- movdqa [esp], xmm6 // backup xmm6
- movdqa xmm6, xmm5 // use xmm6 as temp register.
- punpcklwd xmm5, xmm7
- punpckhwd xmm6, xmm7
- movdqa xmm7, xmm6
- // Third round of bit swap.
- // Write to the destination pointer.
- movdqa xmm6, xmm0
- punpckldq xmm0, xmm4
- punpckhdq xmm6, xmm4
- movdqa xmm4, xmm6
- movdqa xmm6, [esp] // restore xmm6
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [ebx], xmm0
- movlpd qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm4
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm2 // use xmm0 as the temp register.
- punpckldq xmm2, xmm6
- movlpd qword ptr [edx], xmm2
- movhpd qword ptr [ebx], xmm2
- punpckhdq xmm0, xmm6
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm1 // use xmm0 as the temp register.
- punpckldq xmm1, xmm5
- movlpd qword ptr [edx], xmm1
- movhpd qword ptr [ebx], xmm1
- punpckhdq xmm0, xmm5
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm3 // use xmm0 as the temp register.
- punpckldq xmm3, xmm7
- movlpd qword ptr [edx], xmm3
- movhpd qword ptr [ebx], xmm3
- punpckhdq xmm0, xmm7
- sub ecx, 8
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- jg convertloop
-
- mov esp, [esp + 16]
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-#elif !defined(LIBYUV_DISABLE_X86) && \
- (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
-#define HAS_TRANSPOSE_WX8_SSSE3
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- ".p2align 2 \n"
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc"
- #if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- #endif
- );
-}
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
-#define HAS_TRANSPOSE_UVWX8_SSE2
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int w);
- asm (
- DECLARE_FUNCTION(TransposeUVWx8_SSE2)
- "push %ebx \n"
- "push %esi \n"
- "push %edi \n"
- "push %ebp \n"
- "mov 0x14(%esp),%eax \n"
- "mov 0x18(%esp),%edi \n"
- "mov 0x1c(%esp),%edx \n"
- "mov 0x20(%esp),%esi \n"
- "mov 0x24(%esp),%ebx \n"
- "mov 0x28(%esp),%ebp \n"
- "mov %esp,%ecx \n"
- "sub $0x14,%esp \n"
- "and $0xfffffff0,%esp \n"
- "mov %ecx,0x10(%esp) \n"
- "mov 0x2c(%ecx),%ecx \n"
-
-"1: \n"
- "movdqa (%eax),%xmm0 \n"
- "movdqa (%eax,%edi,1),%xmm1 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm0,%xmm7 \n"
- "punpcklbw %xmm1,%xmm0 \n"
- "punpckhbw %xmm1,%xmm7 \n"
- "movdqa %xmm7,%xmm1 \n"
- "movdqa (%eax),%xmm2 \n"
- "movdqa (%eax,%edi,1),%xmm3 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm2,%xmm7 \n"
- "punpcklbw %xmm3,%xmm2 \n"
- "punpckhbw %xmm3,%xmm7 \n"
- "movdqa %xmm7,%xmm3 \n"
- "movdqa (%eax),%xmm4 \n"
- "movdqa (%eax,%edi,1),%xmm5 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm4,%xmm7 \n"
- "punpcklbw %xmm5,%xmm4 \n"
- "punpckhbw %xmm5,%xmm7 \n"
- "movdqa %xmm7,%xmm5 \n"
- "movdqa (%eax),%xmm6 \n"
- "movdqa (%eax,%edi,1),%xmm7 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm5,(%esp) \n"
- "neg %edi \n"
- "movdqa %xmm6,%xmm5 \n"
- "punpcklbw %xmm7,%xmm6 \n"
- "punpckhbw %xmm7,%xmm5 \n"
- "movdqa %xmm5,%xmm7 \n"
- "lea 0x10(%eax,%edi,8),%eax \n"
- "neg %edi \n"
- "movdqa %xmm0,%xmm5 \n"
- "punpcklwd %xmm2,%xmm0 \n"
- "punpckhwd %xmm2,%xmm5 \n"
- "movdqa %xmm5,%xmm2 \n"
- "movdqa %xmm1,%xmm5 \n"
- "punpcklwd %xmm3,%xmm1 \n"
- "punpckhwd %xmm3,%xmm5 \n"
- "movdqa %xmm5,%xmm3 \n"
- "movdqa %xmm4,%xmm5 \n"
- "punpcklwd %xmm6,%xmm4 \n"
- "punpckhwd %xmm6,%xmm5 \n"
- "movdqa %xmm5,%xmm6 \n"
- "movdqa (%esp),%xmm5 \n"
- "movdqa %xmm6,(%esp) \n"
- "movdqa %xmm5,%xmm6 \n"
- "punpcklwd %xmm7,%xmm5 \n"
- "punpckhwd %xmm7,%xmm6 \n"
- "movdqa %xmm6,%xmm7 \n"
- "movdqa %xmm0,%xmm6 \n"
- "punpckldq %xmm4,%xmm0 \n"
- "punpckhdq %xmm4,%xmm6 \n"
- "movdqa %xmm6,%xmm4 \n"
- "movdqa (%esp),%xmm6 \n"
- "movlpd %xmm0,(%edx) \n"
- "movhpd %xmm0,(%ebx) \n"
- "movlpd %xmm4,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm4,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "movdqa %xmm2,%xmm0 \n"
- "punpckldq %xmm6,%xmm2 \n"
- "movlpd %xmm2,(%edx) \n"
- "movhpd %xmm2,(%ebx) \n"
- "punpckhdq %xmm6,%xmm0 \n"
- "movlpd %xmm0,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm0,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "movdqa %xmm1,%xmm0 \n"
- "punpckldq %xmm5,%xmm1 \n"
- "movlpd %xmm1,(%edx) \n"
- "movhpd %xmm1,(%ebx) \n"
- "punpckhdq %xmm5,%xmm0 \n"
- "movlpd %xmm0,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm0,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "movdqa %xmm3,%xmm0 \n"
- "punpckldq %xmm7,%xmm3 \n"
- "movlpd %xmm3,(%edx) \n"
- "movhpd %xmm3,(%ebx) \n"
- "punpckhdq %xmm7,%xmm0 \n"
- "sub $0x8,%ecx \n"
- "movlpd %xmm0,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm0,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "jg 1b \n"
- "mov 0x10(%esp),%esp \n"
- "pop %ebp \n"
- "pop %edi \n"
- "pop %esi \n"
- "pop %ebx \n"
-#if defined(__native_client__)
- "pop %ecx \n"
- "and $0xffffffe0,%ecx \n"
- "jmp *%ecx \n"
-#else
- "ret \n"
-#endif
-);
-#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
- defined(__x86_64__)
-// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
-#define HAS_TRANSPOSE_WX8_FAST_SSSE3
-static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- ".p2align 2 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqa (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqa (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqa (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqa (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqa (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
-);
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int w) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- ".p2align 2 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
- // Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(w) // %3
- : "r"((intptr_t)(src_stride)), // %4
- "r"((intptr_t)(dst_stride_a)), // %5
- "r"((intptr_t)(dst_stride_b)) // %6
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9"
-);
-}
-#endif
-#endif
-
-static void TransposeWx8_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst[0] = src[0 * src_stride];
- dst[1] = src[1 * src_stride];
- dst[2] = src[2 * src_stride];
- dst[3] = src[3 * src_stride];
- dst[4] = src[4 * src_stride];
- dst[5] = src[5 * src_stride];
- dst[6] = src[6 * src_stride];
- dst[7] = src[7 * src_stride];
- ++src;
- dst += dst_stride;
- }
-}
-
-static void TransposeWxH_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- int i;
- for (i = 0; i < width; ++i) {
- int j;
- for (j = 0; j < height; ++j) {
- dst[i * dst_stride + j] = src[j * src_stride + i];
- }
- }
-}
-
-LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- int i = height;
- void (*TransposeWx8)(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) = TransposeWx8_C;
-#if defined(HAS_TRANSPOSE_WX8_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- TransposeWx8 = TransposeWx8_NEON;
- }
-#endif
-#if defined(HAS_TRANSPOSE_WX8_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
- TransposeWx8 = TransposeWx8_SSSE3;
- }
-#endif
-#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
- TransposeWx8 = TransposeWx8_FAST_SSSE3;
- }
-#endif
-#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
- if (IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
- TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
- } else {
- TransposeWx8 = TransposeWx8_MIPS_DSPR2;
- }
- }
-#endif
-
- // Work across the source in 8x8 tiles
- while (i >= 8) {
- TransposeWx8(src, src_stride, dst, dst_stride, width);
- src += 8 * src_stride; // Go down 8 rows.
- dst += 8; // Move over 8 columns.
- i -= 8;
- }
-
- TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
-}
-
-LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- // Rotate by 90 is a transpose with the source read
- // from bottom to top. So set the source pointer to the end
- // of the buffer and flip the sign of the source stride.
- src += src_stride * (height - 1);
- src_stride = -src_stride;
- TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
-LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- // Rotate by 270 is a transpose with the destination written
- // from bottom to top. So set the destination pointer to the end
- // of the buffer and flip the sign of the destination stride.
- dst += dst_stride * (width - 1);
- dst_stride = -dst_stride;
- TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
-LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- // Swap first and last row and mirror the content. Uses a temporary row.
- align_buffer_64(row, width);
- const uint8* src_bot = src + src_stride * (height - 1);
- uint8* dst_bot = dst + dst_stride * (height - 1);
- int half_height = (height + 1) >> 1;
- int y;
- void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
- }
-#endif
-#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSE2;
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
-#endif
-#if defined(HAS_MIRRORROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
- MirrorRow = MirrorRow_MIPS_DSPR2;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
- // Odd height will harmlessly mirror the middle row twice.
- for (y = 0; y < half_height; ++y) {
- MirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
- MirrorRow(src_bot, dst, width); // Mirror last row into first row
- dst += dst_stride;
- CopyRow(row, dst_bot, width); // Copy first mirrored row into last
- src_bot -= src_stride;
- dst_bot -= dst_stride;
- }
- free_aligned_buffer_64(row);
-}
-
-static void TransposeUVWx8_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst_a[0] = src[0 * src_stride + 0];
- dst_b[0] = src[0 * src_stride + 1];
- dst_a[1] = src[1 * src_stride + 0];
- dst_b[1] = src[1 * src_stride + 1];
- dst_a[2] = src[2 * src_stride + 0];
- dst_b[2] = src[2 * src_stride + 1];
- dst_a[3] = src[3 * src_stride + 0];
- dst_b[3] = src[3 * src_stride + 1];
- dst_a[4] = src[4 * src_stride + 0];
- dst_b[4] = src[4 * src_stride + 1];
- dst_a[5] = src[5 * src_stride + 0];
- dst_b[5] = src[5 * src_stride + 1];
- dst_a[6] = src[6 * src_stride + 0];
- dst_b[6] = src[6 * src_stride + 1];
- dst_a[7] = src[7 * src_stride + 0];
- dst_b[7] = src[7 * src_stride + 1];
- src += 2;
- dst_a += dst_stride_a;
- dst_b += dst_stride_b;
- }
-}
-
-static void TransposeUVWxH_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
- int i;
- for (i = 0; i < width * 2; i += 2) {
- int j;
- for (j = 0; j < height; ++j) {
- dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
- dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
- }
- }
-}
-
-LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
- int i = height;
- void (*TransposeUVWx8)(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width) = TransposeUVWx8_C;
-#if defined(HAS_TRANSPOSE_UVWX8_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- TransposeUVWx8 = TransposeUVWx8_NEON;
- }
-#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
- TransposeUVWx8 = TransposeUVWx8_SSE2;
- }
-#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
- TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
- }
-#endif
-
- // Work through the source in 8x8 tiles.
- while (i >= 8) {
- TransposeUVWx8(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width);
- src += 8 * src_stride; // Go down 8 rows.
- dst_a += 8; // Move over 8 columns.
- dst_b += 8; // Move over 8 columns.
- i -= 8;
- }
-
- TransposeUVWxH_C(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, i);
-}
-
-LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
- src += src_stride * (height - 1);
- src_stride = -src_stride;
-
- TransposeUV(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, height);
-}
-
-LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
- dst_a += dst_stride_a * (width - 1);
- dst_b += dst_stride_b * (width - 1);
- dst_stride_a = -dst_stride_a;
- dst_stride_b = -dst_stride_b;
-
- TransposeUV(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, height);
-}
-
-// Rotate 180 is a horizontal and vertical flip.
-LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
- int i;
- void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
- MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorRowUV = MirrorUVRow_NEON;
- }
-#elif defined(HAS_MIRRORROW_UV_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
- MirrorRowUV = MirrorUVRow_SSSE3;
- }
-#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
- MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
- }
-#endif
-
- dst_a += dst_stride_a * (height - 1);
- dst_b += dst_stride_b * (height - 1);
-
- for (i = 0; i < height; ++i) {
- MirrorRowUV(src, dst_a, dst_b, width);
- src += src_stride;
- dst_a -= dst_stride_a;
- dst_b -= dst_stride_b;
- }
-}
-
-LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height,
- enum RotationMode mode) {
- if (!src || width <= 0 || height == 0 || !dst) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src = src + (height - 1) * src_stride;
- src_stride = -src_stride;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- CopyPlane(src, src_stride,
- dst, dst_stride,
- width, height);
- return 0;
- case kRotate90:
- RotatePlane90(src, src_stride,
- dst, dst_stride,
- width, height);
- return 0;
- case kRotate270:
- RotatePlane270(src, src_stride,
- dst, dst_stride,
- width, height);
- return 0;
- case kRotate180:
- RotatePlane180(src, src_stride,
- dst, dst_stride,
- width, height);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- enum RotationMode mode) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
- !dst_y || !dst_u || !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- return I420Copy(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
- case kRotate90:
- RotatePlane90(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotatePlane90(src_u, src_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight);
- RotatePlane90(src_v, src_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- case kRotate270:
- RotatePlane270(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotatePlane270(src_u, src_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight);
- RotatePlane270(src_v, src_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- case kRotate180:
- RotatePlane180(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotatePlane180(src_u, src_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight);
- RotatePlane180(src_v, src_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- enum RotationMode mode) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_uv || width <= 0 || height == 0 ||
- !dst_y || !dst_u || !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_uv = src_uv + (halfheight - 1) * src_stride_uv;
- src_stride_y = -src_stride_y;
- src_stride_uv = -src_stride_uv;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- return NV12ToI420(src_y, src_stride_y,
- src_uv, src_stride_uv,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
- case kRotate90:
- RotatePlane90(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotateUV90(src_uv, src_stride_uv,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- case kRotate270:
- RotatePlane270(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotateUV270(src_uv, src_stride_uv,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- case kRotate180:
- RotatePlane180(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotateUV180(src_uv, src_stride_uv,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/rotate_argb.cc b/src/main/jni/libyuv/source/rotate_argb.cc
deleted file mode 100644
index ab0f9ce07..000000000
--- a/src/main/jni/libyuv/source/rotate_argb.cc
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate.h"
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/convert.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// ARGBScale has a function to copy pixels to a row, striding each source
-// pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || \
- (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
- int src_stepx,
- uint8* dst_ptr, int dst_width);
-#endif
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
- int src_stepx,
- uint8* dst_ptr, int dst_width);
-#endif
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
- int src_stepx,
- uint8* dst_ptr, int dst_width);
-
-static void ARGBTranspose(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- int i;
- int src_pixel_step = src_stride >> 2;
- void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
- int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
-#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
- }
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(src, 4)) {
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
- }
-#endif
-
- for (i = 0; i < width; ++i) { // column of source to row of dest.
- ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
- dst += dst_stride;
- src += 4;
- }
-}
-
-void ARGBRotate90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- // Rotate by 90 is a ARGBTranspose with the source read
- // from bottom to top. So set the source pointer to the end
- // of the buffer and flip the sign of the source stride.
- src += src_stride * (height - 1);
- src_stride = -src_stride;
- ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
-}
-
-void ARGBRotate270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- // Rotate by 270 is a ARGBTranspose with the destination written
- // from bottom to top. So set the destination pointer to the end
- // of the buffer and flip the sign of the destination stride.
- dst += dst_stride * (width - 1);
- dst_stride = -dst_stride;
- ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
-}
-
-void ARGBRotate180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- // Swap first and last row and mirror the content. Uses a temporary row.
- align_buffer_64(row, width * 4);
- const uint8* src_bot = src + src_stride * (height - 1);
- uint8* dst_bot = dst + dst_stride * (height - 1);
- int half_height = (height + 1) >> 1;
- int y;
- void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
- ARGBMirrorRow_C;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
- }
-#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
- ARGBMirrorRow = ARGBMirrorRow_AVX2;
- }
-#endif
-#if defined(HAS_ARGBMIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
- ARGBMirrorRow = ARGBMirrorRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
- // Odd height will harmlessly mirror the middle row twice.
- for (y = 0; y < half_height; ++y) {
- ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
- ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
- CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
- src += src_stride;
- dst += dst_stride;
- src_bot -= src_stride;
- dst_bot -= dst_stride;
- }
- free_aligned_buffer_64(row);
-}
-
-LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- enum RotationMode mode) {
- if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- return ARGBCopy(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
- width, height);
- case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
- width, height);
- return 0;
- case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
- width, height);
- return 0;
- case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
- width, height);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/rotate_mips.cc b/src/main/jni/libyuv/source/rotate_mips.cc
deleted file mode 100644
index 70770fd06..000000000
--- a/src/main/jni/libyuv/source/rotate_mips.cc
+++ /dev/null
@@ -1,485 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
-//dst + dst_stride word aligned
- "1: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "sw $s0, 0(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "sw $s1, 4(%[dst]) \n"
- "bnez %[width], 1b \n"
- " addu %[dst], %[dst], %[dst_stride] \n"
- "b 2f \n"
-//dst + dst_stride unaligned
- "11: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "swr $s0, 0(%[dst]) \n"
- "swl $s0, 3(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "swr $s1, 4(%[dst]) \n"
- "swl $s1, 7(%[dst]) \n"
- "bnez %[width], 11b \n"
- "addu %[dst], %[dst], %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- :[src] "+r" (src),
- [dst] "+r" (dst),
- [width] "+r" (width)
- :[src_stride] "r" (src_stride),
- [dst_stride] "r" (dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1"
- );
-}
-
-void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) {
- __asm__ __volatile__ (
- ".set noat \n"
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
-
- "srl $AT, %[width], 0x2 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
-//dst + dst_stride word aligned
- "1: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "sw $s4, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $s6, 0($s0) \n"
- "sw $t8, 4($s0) \n"
- "sw $s5, 0($s1) \n"
- "sw $t1, 4($s1) \n"
- "sw $s7, 0($s2) \n"
- "sw $t9, 4($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 1b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "b 2f \n"
-//dst + dst_stride unaligned
- "11: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "swr $s4, 0(%[dst]) \n"
- "swl $s4, 3(%[dst]) \n"
- "swr $t0, 4(%[dst]) \n"
- "swl $t0, 7(%[dst]) \n"
- "swr $s6, 0($s0) \n"
- "swl $s6, 3($s0) \n"
- "swr $t8, 4($s0) \n"
- "swl $t8, 7($s0) \n"
- "swr $s5, 0($s1) \n"
- "swl $s5, 3($s1) \n"
- "swr $t1, 4($s1) \n"
- "swl $t1, 7($s1) \n"
- "swr $s7, 0($s2) \n"
- "swl $s7, 3($s2) \n"
- "swr $t9, 4($s2) \n"
- "swl $t9, 7($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 11b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- ".set at \n"
- :[src] "+r" (src),
- [dst] "+r" (dst),
- [width] "+r" (width)
- :[src_stride] "r" (src_stride),
- [dst_stride] "r" (dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
- );
-}
-
-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "subu $t7, $t9, %[src_stride] \n"
- "srl $t1, %[width], 1 \n"
-
-// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
- "andi $t0, %[dst_a], 0x3 \n"
- "andi $t8, %[dst_b], 0x3 \n"
- "or $t0, $t0, $t8 \n"
- "andi $t8, %[dst_stride_a], 0x3 \n"
- "andi $s5, %[dst_stride_b], 0x3 \n"
- "or $t8, $t8, $s5 \n"
- "or $t0, $t0, $t8 \n"
- "bnez $t0, 11f \n"
- " nop \n"
-// dst + dst_stride word aligned (both, a & b dst addresses)
- "1: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "sw $s3, 0($s5) \n"
- "sw $s4, 0($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "sw $s3, 0(%[dst_a]) \n"
- "sw $s4, 0(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
- "sw $s3, 4($s5) \n"
- "sw $s4, 4($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "sw $s3, 4(%[dst_a]) \n"
- "sw $s4, 4(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 1b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
- "b 2f \n"
- " nop \n"
-
-// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
- "11: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "swr $s3, 0($s5) \n"
- "swl $s3, 3($s5) \n"
- "swr $s4, 0($s6) \n"
- "swl $s4, 3($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "swr $s3, 0(%[dst_a]) \n"
- "swl $s3, 3(%[dst_a]) \n"
- "swr $s4, 0(%[dst_b]) \n"
- "swl $s4, 3(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
-
- "swr $s3, 4($s5) \n"
- "swl $s3, 7($s5) \n"
- "swr $s4, 4($s6) \n"
- "swl $s4, 7($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "swr $s3, 4(%[dst_a]) \n"
- "swl $s3, 7(%[dst_a]) \n"
- "swr $s4, 4(%[dst_b]) \n"
- "swl $s4, 7(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 11b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
-
- "2: \n"
- ".set pop \n"
- : [src] "+r" (src),
- [dst_a] "+r" (dst_a),
- [dst_b] "+r" (dst_b),
- [width] "+r" (width),
- [src_stride] "+r" (src_stride)
- : [dst_stride_a] "r" (dst_stride_a),
- [dst_stride_b] "r" (dst_stride_b)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3",
- "s4", "s5", "s6"
- );
-}
-
-#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/rotate_neon.cc b/src/main/jni/libyuv/source/rotate_neon.cc
deleted file mode 100644
index d354e11fa..000000000
--- a/src/main/jni/libyuv/source/rotate_neon.cc
+++ /dev/null
@@ -1,533 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-static uvec8 kVTbl4x4Transpose =
- { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-
-void TransposeWx8_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) {
- const uint8* src_temp = NULL;
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %5, #8 \n"
-
- // handle 8x8 blocks. this should be the majority of the plane
- ".p2align 2 \n"
- "1: \n"
- "mov %0, %1 \n"
-
- MEMACCESS(0)
- "vld1.8 {d0}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d1}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d3}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d4}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d5}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d6}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.8 {d1}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d3}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d5}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d7}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
-
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %5, #8 \n"
- "beq 4f \n"
-
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %5, #2 \n"
- "blt 3f \n"
-
- "cmp %5, #4 \n"
- "blt 2f \n"
-
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.32 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d3[1]}, [%0] \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(6)
- "vld1.8 {q3}, [%6] \n"
-
- "vtbl.8 d4, {d0, d1}, d6 \n"
- "vtbl.8 d5, {d0, d1}, d7 \n"
- "vtbl.8 d0, {d2, d3}, d6 \n"
- "vtbl.8 d1, {d2, d3}, d7 \n"
-
- // TODO(frkoenig): Rework shuffle above to
- // write out with 4 instead of 8 writes.
- MEMACCESS(0)
- "vst1.32 {d4[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d4[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d5[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d5[1]}, [%0] \n"
-
- "add %0, %3, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d0[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d0[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d1[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d1[1]}, [%0] \n"
-
- "add %1, #4 \n" // src += 4
- "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
- "subs %5, #4 \n" // w -= 4
- "beq 4f \n"
-
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %5, #2 \n"
- "blt 3f \n"
-
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.16 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[3]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[3]}, [%0] \n"
-
- "vtrn.8 d0, d1 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.64 {d1}, [%0] \n"
-
- "add %1, #2 \n" // src += 2
- "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
- "subs %5, #2 \n" // w -= 2
- "beq 4f \n"
-
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "vld1.8 {d0[0]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[1]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[2]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[3]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[4]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[5]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[6]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[7]}, [%1] \n"
-
- MEMACCESS(3)
- "vst1.64 {d0}, [%3] \n"
-
- "4: \n"
-
- : "+r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(src_stride), // %2
- "+r"(dst), // %3
- "+r"(dst_stride), // %4
- "+r"(width) // %5
- : "r"(&kVTbl4x4Transpose) // %6
- : "memory", "cc", "q0", "q1", "q2", "q3"
- );
-}
-
-static uvec8 kVTbl4x4TransposeDi =
- { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
-
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width) {
- const uint8* src_temp = NULL;
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %7, #8 \n"
-
- // handle 8x8 blocks. this should be the majority of the plane
- ".p2align 2 \n"
- "1: \n"
- "mov %0, %1 \n"
-
- MEMACCESS(0)
- "vld2.8 {d0, d1}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d2, d3}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d4, d5}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d6, d7}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d16, d17}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d18, d19}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d20, d21}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d6}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d18}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d16}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d22}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- MEMACCESS(0)
- "vst1.8 {d3}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d1}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d7}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d5}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d19}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d17}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d23}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
-
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %7, #8 \n"
- "beq 4f \n"
-
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %7, #2 \n"
- "blt 3f \n"
-
- "cmp %7, #4 \n"
- "blt 2f \n"
-
- // TODO(frkoenig): Clean this up
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.64 {d0}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d1}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d2}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d3}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d4}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d5}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d6}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d7}, [%0] \n"
-
- MEMACCESS(8)
- "vld1.8 {q15}, [%8] \n"
-
- "vtrn.8 q0, q1 \n"
- "vtrn.8 q2, q3 \n"
-
- "vtbl.8 d16, {d0, d1}, d30 \n"
- "vtbl.8 d17, {d0, d1}, d31 \n"
- "vtbl.8 d18, {d2, d3}, d30 \n"
- "vtbl.8 d19, {d2, d3}, d31 \n"
- "vtbl.8 d20, {d4, d5}, d30 \n"
- "vtbl.8 d21, {d4, d5}, d31 \n"
- "vtbl.8 d22, {d6, d7}, d30 \n"
- "vtbl.8 d23, {d6, d7}, d31 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.32 {d16[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d16[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d17[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d17[1]}, [%0], %4 \n"
-
- "add %0, %3, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d20[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d20[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d21[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d21[1]}, [%0] \n"
-
- "mov %0, %5 \n"
-
- MEMACCESS(0)
- "vst1.32 {d18[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d18[1]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d19[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d19[1]}, [%0], %6 \n"
-
- "add %0, %5, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d22[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d22[1]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d23[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d23[1]}, [%0] \n"
-
- "add %1, #4*2 \n" // src += 4 * 2
- "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
- "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
- "subs %7, #4 \n" // w -= 4
- "beq 4f \n"
-
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %7, #2 \n"
- "blt 3f \n"
-
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[3], d3[3]}, [%0] \n"
-
- "vtrn.8 d0, d1 \n"
- "vtrn.8 d2, d3 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.64 {d2}, [%0] \n"
-
- "mov %0, %5 \n"
-
- MEMACCESS(0)
- "vst1.64 {d1}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.64 {d3}, [%0] \n"
-
- "add %1, #2*2 \n" // src += 2 * 2
- "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
- "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
- "subs %7, #2 \n" // w -= 2
- "beq 4f \n"
-
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[7], d1[7]}, [%1] \n"
-
- MEMACCESS(3)
- "vst1.64 {d0}, [%3] \n"
- MEMACCESS(5)
- "vst1.64 {d1}, [%5] \n"
-
- "4: \n"
-
- : "+r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(src_stride), // %2
- "+r"(dst_a), // %3
- "+r"(dst_stride_a), // %4
- "+r"(dst_b), // %5
- "+r"(dst_stride_b), // %6
- "+r"(width) // %7
- : "r"(&kVTbl4x4TransposeDi) // %8
- : "memory", "cc",
- "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
- );
-}
-#endif
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/rotate_neon64.cc b/src/main/jni/libyuv/source/rotate_neon64.cc
deleted file mode 100644
index b080a2c6a..000000000
--- a/src/main/jni/libyuv/source/rotate_neon64.cc
+++ /dev/null
@@ -1,540 +0,0 @@
-/*
- * Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-//this ifdef should be removed if TransposeWx8_NEON's aarch64 has
-//been done
-#ifdef HAS_TRANSPOSE_WX8_NEON
-static uvec8 kVTbl4x4Transpose =
- { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-
-void TransposeWx8_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) {
- const uint8* src_temp = NULL;
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %5, #8 \n"
-
- // handle 8x8 blocks. this should be the majority of the plane
- ".p2align 2 \n"
- "1: \n"
- "mov %0, %1 \n"
-
- MEMACCESS(0)
- "vld1.8 {d0}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d1}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d3}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d4}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d5}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d6}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.8 {d1}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d3}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d5}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d7}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
-
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %5, #8 \n"
- "beq 4f \n"
-
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %5, #2 \n"
- "blt 3f \n"
-
- "cmp %5, #4 \n"
- "blt 2f \n"
-
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.32 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d3[1]}, [%0] \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(6)
- "vld1.8 {q3}, [%6] \n"
-
- "vtbl.8 d4, {d0, d1}, d6 \n"
- "vtbl.8 d5, {d0, d1}, d7 \n"
- "vtbl.8 d0, {d2, d3}, d6 \n"
- "vtbl.8 d1, {d2, d3}, d7 \n"
-
- // TODO(frkoenig): Rework shuffle above to
- // write out with 4 instead of 8 writes.
- MEMACCESS(0)
- "vst1.32 {d4[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d4[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d5[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d5[1]}, [%0] \n"
-
- "add %0, %3, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d0[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d0[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d1[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d1[1]}, [%0] \n"
-
- "add %1, #4 \n" // src += 4
- "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
- "subs %5, #4 \n" // w -= 4
- "beq 4f \n"
-
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %5, #2 \n"
- "blt 3f \n"
-
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.16 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[3]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[3]}, [%0] \n"
-
- "vtrn.8 d0, d1 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.64 {d1}, [%0] \n"
-
- "add %1, #2 \n" // src += 2
- "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
- "subs %5, #2 \n" // w -= 2
- "beq 4f \n"
-
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "vld1.8 {d0[0]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[1]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[2]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[3]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[4]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[5]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[6]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[7]}, [%1] \n"
-
- MEMACCESS(3)
- "vst1.64 {d0}, [%3] \n"
-
- "4: \n"
-
- : "+r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(src_stride), // %2
- "+r"(dst), // %3
- "+r"(dst_stride), // %4
- "+r"(width) // %5
- : "r"(&kVTbl4x4Transpose) // %6
- : "memory", "cc", "q0", "q1", "q2", "q3"
- );
-}
-#endif //HAS_TRANSPOSE_WX8_NEON
-
-//this ifdef should be removed if TransposeUVWx8_NEON's aarch64 has
-//been done
-#ifdef HAS_TRANSPOSE_UVWX8_NEON
-static uvec8 kVTbl4x4TransposeDi =
- { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
-
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width) {
- const uint8* src_temp = NULL;
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %7, #8 \n"
-
- // handle 8x8 blocks. this should be the majority of the plane
- ".p2align 2 \n"
- "1: \n"
- "mov %0, %1 \n"
-
- MEMACCESS(0)
- "vld2.8 {d0, d1}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d2, d3}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d4, d5}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d6, d7}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d16, d17}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d18, d19}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d20, d21}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d6}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d18}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d16}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d22}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- MEMACCESS(0)
- "vst1.8 {d3}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d1}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d7}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d5}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d19}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d17}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d23}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
-
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %7, #8 \n"
- "beq 4f \n"
-
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %7, #2 \n"
- "blt 3f \n"
-
- "cmp %7, #4 \n"
- "blt 2f \n"
-
- // TODO(frkoenig): Clean this up
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.64 {d0}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d1}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d2}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d3}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d4}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d5}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d6}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d7}, [%0] \n"
-
- MEMACCESS(8)
- "vld1.8 {q15}, [%8] \n"
-
- "vtrn.8 q0, q1 \n"
- "vtrn.8 q2, q3 \n"
-
- "vtbl.8 d16, {d0, d1}, d30 \n"
- "vtbl.8 d17, {d0, d1}, d31 \n"
- "vtbl.8 d18, {d2, d3}, d30 \n"
- "vtbl.8 d19, {d2, d3}, d31 \n"
- "vtbl.8 d20, {d4, d5}, d30 \n"
- "vtbl.8 d21, {d4, d5}, d31 \n"
- "vtbl.8 d22, {d6, d7}, d30 \n"
- "vtbl.8 d23, {d6, d7}, d31 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.32 {d16[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d16[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d17[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d17[1]}, [%0], %4 \n"
-
- "add %0, %3, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d20[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d20[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d21[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d21[1]}, [%0] \n"
-
- "mov %0, %5 \n"
-
- MEMACCESS(0)
- "vst1.32 {d18[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d18[1]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d19[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d19[1]}, [%0], %6 \n"
-
- "add %0, %5, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d22[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d22[1]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d23[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d23[1]}, [%0] \n"
-
- "add %1, #4*2 \n" // src += 4 * 2
- "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
- "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
- "subs %7, #4 \n" // w -= 4
- "beq 4f \n"
-
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %7, #2 \n"
- "blt 3f \n"
-
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[3], d3[3]}, [%0] \n"
-
- "vtrn.8 d0, d1 \n"
- "vtrn.8 d2, d3 \n"
-
- "mov %0, %3 \n"
-
- MEMACCESS(0)
- "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.64 {d2}, [%0] \n"
-
- "mov %0, %5 \n"
-
- MEMACCESS(0)
- "vst1.64 {d1}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.64 {d3}, [%0] \n"
-
- "add %1, #2*2 \n" // src += 2 * 2
- "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
- "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
- "subs %7, #2 \n" // w -= 2
- "beq 4f \n"
-
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[7], d1[7]}, [%1] \n"
-
- MEMACCESS(3)
- "vst1.64 {d0}, [%3] \n"
- MEMACCESS(5)
- "vst1.64 {d1}, [%5] \n"
-
- "4: \n"
-
- : "+r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(src_stride), // %2
- "+r"(dst_a), // %3
- "+r"(dst_stride_a), // %4
- "+r"(dst_b), // %5
- "+r"(dst_stride_b), // %6
- "+r"(width) // %7
- : "r"(&kVTbl4x4TransposeDi) // %8
- : "memory", "cc",
- "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
- );
-}
-#endif // HAS_TRANSPOSE_UVWX8_NEON
-#endif // __aarch64__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_any.cc b/src/main/jni/libyuv/source/row_any.cc
deleted file mode 100644
index aaa0378d7..000000000
--- a/src/main/jni/libyuv/source/row_any.cc
+++ /dev/null
@@ -1,602 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
-// TODO(fbarchard): Consider 'any' functions handling odd alignment.
-// YUV to RGB does multiple of 8 with SIMD and remainder with C.
-#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* u_buf, \
- const uint8* v_buf, \
- uint8* rgb_buf, \
- int width) { \
- int n = width & ~MASK; \
- I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
- I420TORGB_C(y_buf + n, \
- u_buf + (n >> UV_SHIFT), \
- v_buf + (n >> UV_SHIFT), \
- rgb_buf + n * BPP, width & MASK); \
- }
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
- 1, 4, 7)
-#endif // HAS_I422TOARGBROW_SSSE3
-#ifdef HAS_I444TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
- 0, 4, 7)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
- 2, 4, 7)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
- 1, 4, 7)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
- 1, 4, 7)
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
- 1, 4, 7)
-// I422ToRGB565Row_SSSE3 is unaligned.
-YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
- 1, 2, 7)
-YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
- 1, 2, 7)
-YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
- 1, 2, 7)
-// I422ToRGB24Row_SSSE3 is unaligned.
-YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
-YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
-YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
-#endif // HAS_I444TOARGBROW_SSSE3
-#ifdef HAS_I422TOARGBROW_AVX2
-YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
-#endif // HAS_I422TOARGBROW_AVX2
-#ifdef HAS_I422TOARGBROW_NEON
-YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
-YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
-YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7)
-YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7)
-YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7)
-YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7)
-YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7)
-YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7)
-YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
- 1, 2, 7)
-YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
- 1, 2, 7)
-YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-#endif // HAS_I422TOARGBROW_NEON
-#ifdef HAS_I422TOYUY2ROW_NEON
-YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-#endif // HAS_I422TOYUY2ROW_NEON
-#ifdef HAS_I422TOUYVYROW_NEON
-YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
-#endif // HAS_I422TOUYVYROW_NEON
-#undef YANY
-
-// Wrappers to handle odd width
-#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* uv_buf, \
- uint8* rgb_buf, \
- int width) { \
- int n = width & ~7; \
- NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
- NV12TORGB_C(y_buf + n, \
- uv_buf + (n >> UV_SHIFT), \
- rgb_buf + n * BPP, width & 7); \
- }
-
-#ifdef HAS_NV12TOARGBROW_SSSE3
-NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
- 0, 4)
-NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
- 0, 4)
-#endif // HAS_NV12TOARGBROW_SSSE3
-#ifdef HAS_NV12TOARGBROW_NEON
-NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
-NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
-#endif // HAS_NV12TOARGBROW_NEON
-#ifdef HAS_NV12TORGB565ROW_SSSE3
-NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
- 0, 2)
-NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
- 0, 2)
-#endif // HAS_NV12TORGB565ROW_SSSE3
-#ifdef HAS_NV12TORGB565ROW_NEON
-NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
-NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
-#endif // HAS_NV12TORGB565ROW_NEON
-#undef NVANY
-
-#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
- void NAMEANY(const uint8* src, \
- uint8* dst, \
- int width) { \
- int n = width & ~MASK; \
- ARGBTORGB_SIMD(src, dst, n); \
- ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \
- }
-
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
- 15, 4, 3)
-RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
- 15, 4, 3)
-RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
- 3, 4, 2)
-RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
- 3, 4, 2)
-RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
- 3, 4, 2)
-#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
- 7, 1, 4)
-#endif
-#if defined(HAS_YTOARGBROW_SSE2)
-RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
- 7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
- 15, 2, 4)
-RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
- 15, 2, 4)
-// These require alignment on ARGB, so C is used for remainder.
-RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
- 15, 3, 4)
-RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
- 15, 3, 4)
-RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
- 7, 2, 4)
-RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
- 7, 2, 4)
-RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
- 7, 2, 4)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
-RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
-RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
- 7, 4, 2)
-RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
- 7, 4, 2)
-RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
- 7, 4, 2)
-RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
- 7, 1, 4)
-RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
- 7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
- 7, 2, 4)
-RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
- 7, 2, 4)
-#endif
-#undef RGBANY
-
-// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
-#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
- void NAMEANY(const uint8* src, \
- uint8* dst, uint32 selector, \
- int width) { \
- int n = width & ~MASK; \
- ARGBTORGB_SIMD(src, dst, selector, n); \
- ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \
- }
-
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
- 7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERROW_NEON)
-BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
- 7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
-BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
- 7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERGGROW_NEON)
-BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
- 7, 4, 1)
-#endif
-
-#undef BAYERANY
-
-// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
-#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \
- void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
- ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \
- ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \
- dst_y + (width - NUM) * BPP, NUM); \
- }
-
-#ifdef HAS_ARGBTOYROW_AVX2
-YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
-YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
-YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
-YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
-#endif
-#ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
-#endif
-#ifdef HAS_BGRATOYROW_SSSE3
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
-#endif
-#ifdef HAS_ARGBTOYJROW_SSSE3
-YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
-#endif
-#ifdef HAS_ARGBTOYROW_NEON
-YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
-#endif
-#ifdef HAS_ARGBTOYJROW_NEON
-YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
-#endif
-#ifdef HAS_BGRATOYROW_NEON
-YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
-#endif
-#ifdef HAS_ABGRTOYROW_NEON
-YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
-#endif
-#ifdef HAS_RGBATOYROW_NEON
-YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
-#endif
-#ifdef HAS_RGB24TOYROW_NEON
-YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
-#endif
-#ifdef HAS_RAWTOYROW_NEON
-YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
-#endif
-#ifdef HAS_RGB565TOYROW_NEON
-YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
-#endif
-#ifdef HAS_ARGB1555TOYROW_NEON
-YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
-#endif
-#ifdef HAS_ARGB4444TOYROW_NEON
-YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
-#endif
-#ifdef HAS_YUY2TOYROW_NEON
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
-#endif
-#ifdef HAS_UYVYTOYROW_NEON
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
-#endif
-#ifdef HAS_RGB24TOARGBROW_NEON
-YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
-#endif
-#ifdef HAS_RAWTOARGBROW_NEON
-YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
-#endif
-#ifdef HAS_RGB565TOARGBROW_NEON
-YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_NEON
-YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_NEON
-YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
-#endif
-#undef YANY
-
-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
- int n = width & ~MASK; \
- ARGBTOY_SIMD(src_argb, dst_y, n); \
- ARGBTOY_C(src_argb + n * SBPP, \
- dst_y + n * BPP, width & MASK); \
- }
-
-// Attenuate is destructive so last16 method can not be used due to overlap.
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
- 4, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
- 4, 4, 3)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C,
- 4, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C,
- 4, 4, 7)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
- 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_NEON
-YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
- 4, 4, 7)
-#endif
-#undef YANY
-
-// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
-#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
- void NAMEANY(const uint8* src_argb, int src_stride_argb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- int n = width & ~MASK; \
- ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
- ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
- dst_u + (n >> 1), \
- dst_v + (n >> 1), \
- width & MASK); \
- }
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
-UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
-UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
-#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
-UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
- 4, 15)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_NEON
-UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVJROW_NEON
-UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
-#endif
-#ifdef HAS_BGRATOUVROW_NEON
-UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
-#endif
-#ifdef HAS_ABGRTOUVROW_NEON
-UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
-#endif
-#ifdef HAS_RGBATOUVROW_NEON
-UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_NEON
-UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_NEON
-UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_NEON
-UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_NEON
-UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVROW_NEON
-UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_NEON
-UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
-#endif
-#ifdef HAS_UYVYTOUVROW_NEON
-UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
-#endif
-#undef UVANY
-
-#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \
- void NAMEANY(const uint8* src_uv, \
- uint8* dst_u, uint8* dst_v, int width) { \
- int n = width & ~MASK; \
- ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
- ANYTOUV_C(src_uv + n * BPP, \
- dst_u + (n >> SHIFT), \
- dst_v + (n >> SHIFT), \
- width & MASK); \
- }
-
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
- ARGBToUV444Row_C, 4, 15, 0)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_AVX2
-UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
- YUY2ToUV422Row_C, 2, 31, 1)
-UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
- UYVYToUV422Row_C, 2, 31, 1)
-#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
- ARGBToUV422Row_C, 4, 15, 1)
-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
- YUY2ToUV422Row_C, 2, 15, 1)
-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
- UYVYToUV422Row_C, 2, 15, 1)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_NEON
-UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
- ARGBToUV444Row_C, 4, 7, 0)
-UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
- ARGBToUV422Row_C, 4, 15, 1)
-UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
- ARGBToUV411Row_C, 4, 31, 2)
-UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
- YUY2ToUV422Row_C, 2, 15, 1)
-UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
- UYVYToUV422Row_C, 2, 15, 1)
-#endif
-#undef UV422ANY
-
-#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
- void NAMEANY(const uint8* src_uv, \
- uint8* dst_u, uint8* dst_v, int width) { \
- int n = width & ~MASK; \
- ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
- ANYTOUV_C(src_uv + n * 2, \
- dst_u + n, \
- dst_v + n, \
- width & MASK); \
- }
-
-#ifdef HAS_SPLITUVROW_SSE2
-SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
-#endif
-#ifdef HAS_SPLITUVROW_AVX2
-SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
-#endif
-#ifdef HAS_SPLITUVROW_NEON
-SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
-#endif
-#ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
- SplitUVRow_C, 15)
-#endif
-#undef SPLITUVROWANY
-
-#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
- void NAMEANY(const uint8* src_u, const uint8* src_v, \
- uint8* dst_uv, int width) { \
- int n = width & ~MASK; \
- ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
- ANYTOUV_C(src_u + n, \
- src_v + n, \
- dst_uv + n * 2, \
- width & MASK); \
- }
-
-#ifdef HAS_MERGEUVROW_SSE2
-MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
-#endif
-#ifdef HAS_MERGEUVROW_AVX2
-MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
-#endif
-#ifdef HAS_MERGEUVROW_NEON
-MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
-#endif
-#undef MERGEUVROW_ANY
-
-#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK) \
- void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
- uint8* dst_argb, int width) { \
- int n = width & ~MASK; \
- ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
- ARGBMATH_C(src_argb0 + n * 4, \
- src_argb1 + n * 4, \
- dst_argb + n * 4, \
- width & MASK); \
- }
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C,
- 3)
-#endif
-#ifdef HAS_ARGBADDROW_SSE2
-MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
- 3)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
- 7)
-#endif
-#ifdef HAS_ARGBADDROW_AVX2
-MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
- 7)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
- 7)
-#endif
-#ifdef HAS_ARGBADDROW_NEON
-MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_NEON
-MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
- 7)
-#endif
-#undef MATHROW_ANY
-
-// Shuffle may want to work in place, so last16 method can not be used.
-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_argb, uint8* dst_argb, \
- const uint8* shuffler, int width) { \
- int n = width & ~MASK; \
- ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
- ARGBTOY_C(src_argb + n * SBPP, \
- dst_argb + n * BPP, shuffler, width & MASK); \
- }
-
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
- ARGBShuffleRow_C, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
- ARGBShuffleRow_C, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
- ARGBShuffleRow_C, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_NEON
-YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
- ARGBShuffleRow_C, 4, 4, 3)
-#endif
-#undef YANY
-
-// Interpolate may want to work in place, so last16 method can not be used.
-#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \
- void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
- ptrdiff_t src_stride_ptr, int width, \
- int source_y_fraction) { \
- int n = width & ~MASK; \
- TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \
- n, source_y_fraction); \
- TERP_C(dst_ptr + n * BPP, \
- src_ptr + n * SBPP, src_stride_ptr, \
- width & MASK, source_y_fraction); \
- }
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
- InterpolateRow_C, 1, 1, 32)
-#endif
-#ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
- InterpolateRow_C, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
- InterpolateRow_C, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_NEON
-NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
- InterpolateRow_C, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
-NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
- InterpolateRow_C, 1, 1, 3)
-#endif
-#undef NANY
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_common.cc b/src/main/jni/libyuv/source/row_common.cc
deleted file mode 100644
index fa2b752a2..000000000
--- a/src/main/jni/libyuv/source/row_common.cc
+++ /dev/null
@@ -1,2286 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include <string.h> // For memcpy and memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// llvm x86 is poor at ternary operator, so use branchless min/max.
-
-#define USE_BRANCHLESS 1
-#if USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
- return ((-(v) >> 31) & (v));
-}
-
-static __inline int32 clamp255(int32 v) {
- return (((255 - (v)) >> 31) | (v)) & 255;
-}
-
-static __inline uint32 Clamp(int32 val) {
- int v = clamp0(val);
- return (uint32)(clamp255(v));
-}
-
-static __inline uint32 Abs(int32 v) {
- int m = v >> 31;
- return (v + m) ^ m;
-}
-#else // USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
- return (v < 0) ? 0 : v;
-}
-
-static __inline int32 clamp255(int32 v) {
- return (v > 255) ? 255 : v;
-}
-
-static __inline uint32 Clamp(int32 val) {
- int v = clamp0(val);
- return (uint32)(clamp255(v));
-}
-
-static __inline uint32 Abs(int32 v) {
- return (v < 0) ? -v : v;
-}
-#endif // USE_BRANCHLESS
-
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *(uint32*)(p) = v
-#else
-static inline void WRITEWORD(uint8* p, uint32 v) {
- p[0] = (uint8)(v & 255);
- p[1] = (uint8)((v >> 8) & 255);
- p[2] = (uint8)((v >> 16) & 255);
- p[3] = (uint8)((v >> 24) & 255);
-}
-#endif
-
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_rgb24[0];
- uint8 g = src_rgb24[1];
- uint8 r = src_rgb24[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
- dst_argb += 4;
- src_rgb24 += 3;
- }
-}
-
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
- dst_argb += 4;
- src_raw += 3;
- }
-}
-
-void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_rgb565[0] & 0x1f;
- uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r = src_rgb565[1] >> 3;
- dst_argb[0] = (b << 3) | (b >> 2);
- dst_argb[1] = (g << 2) | (g >> 4);
- dst_argb[2] = (r << 3) | (r >> 2);
- dst_argb[3] = 255u;
- dst_argb += 4;
- src_rgb565 += 2;
- }
-}
-
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_argb1555[0] & 0x1f;
- uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r = (src_argb1555[1] & 0x7c) >> 2;
- uint8 a = src_argb1555[1] >> 7;
- dst_argb[0] = (b << 3) | (b >> 2);
- dst_argb[1] = (g << 3) | (g >> 2);
- dst_argb[2] = (r << 3) | (r >> 2);
- dst_argb[3] = -a;
- dst_argb += 4;
- src_argb1555 += 2;
- }
-}
-
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_argb4444[0] & 0x0f;
- uint8 g = src_argb4444[0] >> 4;
- uint8 r = src_argb4444[1] & 0x0f;
- uint8 a = src_argb4444[1] >> 4;
- dst_argb[0] = (b << 4) | b;
- dst_argb[1] = (g << 4) | g;
- dst_argb[2] = (r << 4) | r;
- dst_argb[3] = (a << 4) | a;
- dst_argb += 4;
- src_argb4444 += 2;
- }
-}
-
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_argb[0];
- uint8 g = src_argb[1];
- uint8 r = src_argb[2];
- dst_rgb[0] = b;
- dst_rgb[1] = g;
- dst_rgb[2] = r;
- dst_rgb += 3;
- src_argb += 4;
- }
-}
-
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_argb[0];
- uint8 g = src_argb[1];
- uint8 r = src_argb[2];
- dst_rgb[0] = r;
- dst_rgb[1] = g;
- dst_rgb[2] = b;
- dst_rgb += 3;
- src_argb += 4;
- }
-}
-
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 2;
- uint8 r0 = src_argb[2] >> 3;
- uint8 b1 = src_argb[4] >> 3;
- uint8 g1 = src_argb[5] >> 2;
- uint8 r1 = src_argb[6] >> 3;
- WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27));
- dst_rgb += 4;
- src_argb += 8;
- }
- if (width & 1) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 2;
- uint8 r0 = src_argb[2] >> 3;
- *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
- }
-}
-
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 3;
- uint8 r0 = src_argb[2] >> 3;
- uint8 a0 = src_argb[3] >> 7;
- uint8 b1 = src_argb[4] >> 3;
- uint8 g1 = src_argb[5] >> 3;
- uint8 r1 = src_argb[6] >> 3;
- uint8 a1 = src_argb[7] >> 7;
- *(uint32*)(dst_rgb) =
- b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
- dst_rgb += 4;
- src_argb += 8;
- }
- if (width & 1) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 3;
- uint8 r0 = src_argb[2] >> 3;
- uint8 a0 = src_argb[3] >> 7;
- *(uint16*)(dst_rgb) =
- b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
- }
-}
-
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb[0] >> 4;
- uint8 g0 = src_argb[1] >> 4;
- uint8 r0 = src_argb[2] >> 4;
- uint8 a0 = src_argb[3] >> 4;
- uint8 b1 = src_argb[4] >> 4;
- uint8 g1 = src_argb[5] >> 4;
- uint8 r1 = src_argb[6] >> 4;
- uint8 a1 = src_argb[7] >> 4;
- *(uint32*)(dst_rgb) =
- b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
- (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
- dst_rgb += 4;
- src_argb += 8;
- }
- if (width & 1) {
- uint8 b0 = src_argb[0] >> 4;
- uint8 g0 = src_argb[1] >> 4;
- uint8 r0 = src_argb[2] >> 4;
- uint8 a0 = src_argb[3] >> 4;
- *(uint16*)(dst_rgb) =
- b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
- }
-}
-
-static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
- return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
-}
-
-static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
- return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
-}
-static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
- return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
-}
-
-#define MAKEROWY(NAME, R, G, B, BPP) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
-} \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \
- src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \
- uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \
- src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \
- uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \
- src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
-}
-
-MAKEROWY(ARGB, 2, 1, 0, 4)
-MAKEROWY(BGRA, 1, 2, 3, 4)
-MAKEROWY(ABGR, 0, 1, 2, 4)
-MAKEROWY(RGBA, 3, 2, 1, 4)
-MAKEROWY(RGB24, 2, 1, 0, 3)
-MAKEROWY(RAW, 0, 1, 2, 3)
-#undef MAKEROWY
-
-// JPeg uses a variation on BT.601-1 full range
-// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
-// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
-// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
-// BT.601 Mpeg range uses:
-// b 0.1016 * 255 = 25.908 = 25
-// g 0.5078 * 255 = 129.489 = 129
-// r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
-// b 0.11400 * 128 = 14.592 = 15
-// g 0.58700 * 128 = 75.136 = 75
-// r 0.29900 * 128 = 38.272 = 38
-// JPeg 8 bit U:
-// b 0.50000 * 255 = 127.5 = 127
-// g -0.33126 * 255 = -84.4713 = -84
-// r -0.16874 * 255 = -43.0287 = -43
-// JPeg 8 bit V:
-// b -0.08131 * 255 = -20.73405 = -20
-// g -0.41869 * 255 = -106.76595 = -107
-// r 0.50000 * 255 = 127.5 = 127
-
-static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
- return (38 * r + 75 * g + 15 * b + 64) >> 7;
-}
-
-static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
- return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
-}
-static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
- return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
-}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
-
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
-} \
-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
- AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
- uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
- AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
- uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
- AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
- uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
- uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- } \
-}
-
-MAKEROWYJ(ARGB, 2, 1, 0, 4)
-#undef MAKEROWYJ
-
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_rgb565[0] & 0x1f;
- uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r = src_rgb565[1] >> 3;
- b = (b << 3) | (b >> 2);
- g = (g << 2) | (g >> 4);
- r = (r << 3) | (r >> 2);
- dst_y[0] = RGBToY(r, g, b);
- src_rgb565 += 2;
- dst_y += 1;
- }
-}
-
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_argb1555[0] & 0x1f;
- uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r = (src_argb1555[1] & 0x7c) >> 2;
- b = (b << 3) | (b >> 2);
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_y[0] = RGBToY(r, g, b);
- src_argb1555 += 2;
- dst_y += 1;
- }
-}
-
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 b = src_argb4444[0] & 0x0f;
- uint8 g = src_argb4444[0] >> 4;
- uint8 r = src_argb4444[1] & 0x0f;
- b = (b << 4) | b;
- g = (g << 4) | g;
- r = (r << 4) | r;
- dst_y[0] = RGBToY(r, g, b);
- src_argb4444 += 2;
- dst_y += 1;
- }
-}
-
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) {
- const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_rgb565[0] & 0x1f;
- uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r0 = src_rgb565[1] >> 3;
- uint8 b1 = src_rgb565[2] & 0x1f;
- uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
- uint8 r1 = src_rgb565[3] >> 3;
- uint8 b2 = next_rgb565[0] & 0x1f;
- uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8 r2 = next_rgb565[1] >> 3;
- uint8 b3 = next_rgb565[2] & 0x1f;
- uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
- uint8 r3 = next_rgb565[3] >> 3;
- uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8 g = (g0 + g1 + g2 + g3);
- uint8 r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 787 -> 888.
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
- src_rgb565 += 4;
- next_rgb565 += 4;
- dst_u += 1;
- dst_v += 1;
- }
- if (width & 1) {
- uint8 b0 = src_rgb565[0] & 0x1f;
- uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r0 = src_rgb565[1] >> 3;
- uint8 b2 = next_rgb565[0] & 0x1f;
- uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8 r2 = next_rgb565[1] >> 3;
- uint8 b = (b0 + b2); // 565 * 2 = 676.
- uint8 g = (g0 + g2);
- uint8 r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 676 -> 888
- g = (g << 1) | (g >> 6);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
- }
-}
-
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) {
- const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb1555[0] & 0x1f;
- uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8 b1 = src_argb1555[2] & 0x1f;
- uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
- uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
- uint8 b2 = next_argb1555[0] & 0x1f;
- uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
- uint8 b3 = next_argb1555[2] & 0x1f;
- uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
- uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8 g = (g0 + g1 + g2 + g3);
- uint8 r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 777 -> 888.
- g = (g << 1) | (g >> 6);
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
- src_argb1555 += 4;
- next_argb1555 += 4;
- dst_u += 1;
- dst_v += 1;
- }
- if (width & 1) {
- uint8 b0 = src_argb1555[0] & 0x1f;
- uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8 b2 = next_argb1555[0] & 0x1f;
- uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8 r2 = next_argb1555[1] >> 3;
- uint8 b = (b0 + b2); // 555 * 2 = 666.
- uint8 g = (g0 + g2);
- uint8 r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
- }
-}
-
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) {
- const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb4444[0] & 0x0f;
- uint8 g0 = src_argb4444[0] >> 4;
- uint8 r0 = src_argb4444[1] & 0x0f;
- uint8 b1 = src_argb4444[2] & 0x0f;
- uint8 g1 = src_argb4444[2] >> 4;
- uint8 r1 = src_argb4444[3] & 0x0f;
- uint8 b2 = next_argb4444[0] & 0x0f;
- uint8 g2 = next_argb4444[0] >> 4;
- uint8 r2 = next_argb4444[1] & 0x0f;
- uint8 b3 = next_argb4444[2] & 0x0f;
- uint8 g3 = next_argb4444[2] >> 4;
- uint8 r3 = next_argb4444[3] & 0x0f;
- uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8 g = (g0 + g1 + g2 + g3);
- uint8 r = (r0 + r1 + r2 + r3);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
- src_argb4444 += 4;
- next_argb4444 += 4;
- dst_u += 1;
- dst_v += 1;
- }
- if (width & 1) {
- uint8 b0 = src_argb4444[0] & 0x0f;
- uint8 g0 = src_argb4444[0] >> 4;
- uint8 r0 = src_argb4444[1] & 0x0f;
- uint8 b2 = next_argb4444[0] & 0x0f;
- uint8 g2 = next_argb4444[0] >> 4;
- uint8 r2 = next_argb4444[1] & 0x0f;
- uint8 b = (b0 + b2); // 444 * 2 = 555.
- uint8 g = (g0 + g2);
- uint8 r = (r0 + r2);
- b = (b << 3) | (b >> 2); // 555 -> 888.
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
- }
-}
-
-void ARGBToUV444Row_C(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 ab = src_argb[0];
- uint8 ag = src_argb[1];
- uint8 ar = src_argb[2];
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- src_argb += 4;
- dst_u += 1;
- dst_v += 1;
- }
-}
-
-void ARGBToUV422Row_C(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
- uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
- uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- src_argb += 8;
- dst_u += 1;
- dst_v += 1;
- }
- if (width & 1) {
- uint8 ab = src_argb[0];
- uint8 ag = src_argb[1];
- uint8 ar = src_argb[2];
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- }
-}
-
-void ARGBToUV411Row_C(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- int x;
- for (x = 0; x < width - 3; x += 4) {
- uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
- uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
- uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- src_argb += 16;
- dst_u += 1;
- dst_v += 1;
- }
- if ((width & 3) == 3) {
- uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
- uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
- uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- } else if ((width & 3) == 2) {
- uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
- uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
- uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- } else if ((width & 3) == 1) {
- uint8 ab = src_argb[0];
- uint8 ag = src_argb[1];
- uint8 ar = src_argb[2];
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- }
-}
-
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
- dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
- dst_argb[3] = src_argb[3];
- dst_argb += 4;
- src_argb += 4;
- }
-}
-
-// Convert a row of image to Sepia tone.
-void ARGBSepiaRow_C(uint8* dst_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int b = dst_argb[0];
- int g = dst_argb[1];
- int r = dst_argb[2];
- int sb = (b * 17 + g * 68 + r * 35) >> 7;
- int sg = (b * 22 + g * 88 + r * 45) >> 7;
- int sr = (b * 24 + g * 98 + r * 50) >> 7;
- // b does not over flow. a is preserved from original.
- dst_argb[0] = sb;
- dst_argb[1] = clamp255(sg);
- dst_argb[2] = clamp255(sr);
- dst_argb += 4;
- }
-}
-
-// Apply color matrix to a row of image. Matrix is signed.
-// TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int b = src_argb[0];
- int g = src_argb[1];
- int r = src_argb[2];
- int a = src_argb[3];
- int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
- r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
- int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
- r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
- int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
- r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
- int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
- r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
- dst_argb[0] = Clamp(sb);
- dst_argb[1] = Clamp(sg);
- dst_argb[2] = Clamp(sr);
- dst_argb[3] = Clamp(sa);
- src_argb += 4;
- dst_argb += 4;
- }
-}
-
-// Apply color table to a row of image.
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int b = dst_argb[0];
- int g = dst_argb[1];
- int r = dst_argb[2];
- int a = dst_argb[3];
- dst_argb[0] = table_argb[b * 4 + 0];
- dst_argb[1] = table_argb[g * 4 + 1];
- dst_argb[2] = table_argb[r * 4 + 2];
- dst_argb[3] = table_argb[a * 4 + 3];
- dst_argb += 4;
- }
-}
-
-// Apply color table to a row of image.
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int b = dst_argb[0];
- int g = dst_argb[1];
- int r = dst_argb[2];
- dst_argb[0] = table_argb[b * 4 + 0];
- dst_argb[1] = table_argb[g * 4 + 1];
- dst_argb[2] = table_argb[r * 4 + 2];
- dst_argb += 4;
- }
-}
-
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- int b = dst_argb[0];
- int g = dst_argb[1];
- int r = dst_argb[2];
- dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
- dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
- dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
- dst_argb += 4;
- }
-}
-
-#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
-
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- const uint32 b_scale = REPEAT8(value & 0xff);
- const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
- const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
- const uint32 a_scale = REPEAT8(value >> 24);
-
- int i;
- for (i = 0; i < width; ++i) {
- const uint32 b = REPEAT8(src_argb[0]);
- const uint32 g = REPEAT8(src_argb[1]);
- const uint32 r = REPEAT8(src_argb[2]);
- const uint32 a = REPEAT8(src_argb[3]);
- dst_argb[0] = SHADE(b, b_scale);
- dst_argb[1] = SHADE(g, g_scale);
- dst_argb[2] = SHADE(r, r_scale);
- dst_argb[3] = SHADE(a, a_scale);
- src_argb += 4;
- dst_argb += 4;
- }
-}
-#undef REPEAT8
-#undef SHADE
-
-#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 16
-
-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- const uint32 b = REPEAT8(src_argb0[0]);
- const uint32 g = REPEAT8(src_argb0[1]);
- const uint32 r = REPEAT8(src_argb0[2]);
- const uint32 a = REPEAT8(src_argb0[3]);
- const uint32 b_scale = src_argb1[0];
- const uint32 g_scale = src_argb1[1];
- const uint32 r_scale = src_argb1[2];
- const uint32 a_scale = src_argb1[3];
- dst_argb[0] = SHADE(b, b_scale);
- dst_argb[1] = SHADE(g, g_scale);
- dst_argb[2] = SHADE(r, r_scale);
- dst_argb[3] = SHADE(a, a_scale);
- src_argb0 += 4;
- src_argb1 += 4;
- dst_argb += 4;
- }
-}
-#undef REPEAT8
-#undef SHADE
-
-#define SHADE(f, v) clamp255(v + f)
-
-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
- const int b_add = src_argb1[0];
- const int g_add = src_argb1[1];
- const int r_add = src_argb1[2];
- const int a_add = src_argb1[3];
- dst_argb[0] = SHADE(b, b_add);
- dst_argb[1] = SHADE(g, g_add);
- dst_argb[2] = SHADE(r, r_add);
- dst_argb[3] = SHADE(a, a_add);
- src_argb0 += 4;
- src_argb1 += 4;
- dst_argb += 4;
- }
-}
-#undef SHADE
-
-#define SHADE(f, v) clamp0(f - v)
-
-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
- const int b_sub = src_argb1[0];
- const int g_sub = src_argb1[1];
- const int r_sub = src_argb1[2];
- const int a_sub = src_argb1[3];
- dst_argb[0] = SHADE(b, b_sub);
- dst_argb[1] = SHADE(g, g_sub);
- dst_argb[2] = SHADE(r, r_sub);
- dst_argb[3] = SHADE(a, a_sub);
- src_argb0 += 4;
- src_argb1 += 4;
- dst_argb += 4;
- }
-}
-#undef SHADE
-
-// Sobel functions which mimics SSSE3.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
- uint8* dst_sobelx, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- int a = src_y0[i];
- int b = src_y1[i];
- int c = src_y2[i];
- int a_sub = src_y0[i + 2];
- int b_sub = src_y1[i + 2];
- int c_sub = src_y2[i + 2];
- int a_diff = a - a_sub;
- int b_diff = b - b_sub;
- int c_diff = c - c_sub;
- int sobel = Abs(a_diff + b_diff * 2 + c_diff);
- dst_sobelx[i] = (uint8)(clamp255(sobel));
- }
-}
-
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- int a = src_y0[i + 0];
- int b = src_y0[i + 1];
- int c = src_y0[i + 2];
- int a_sub = src_y1[i + 0];
- int b_sub = src_y1[i + 1];
- int c_sub = src_y1[i + 2];
- int a_diff = a - a_sub;
- int b_diff = b - b_sub;
- int c_diff = c - c_sub;
- int sobel = Abs(a_diff + b_diff * 2 + c_diff);
- dst_sobely[i] = (uint8)(clamp255(sobel));
- }
-}
-
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- int r = src_sobelx[i];
- int b = src_sobely[i];
- int s = clamp255(r + b);
- dst_argb[0] = (uint8)(s);
- dst_argb[1] = (uint8)(s);
- dst_argb[2] = (uint8)(s);
- dst_argb[3] = (uint8)(255u);
- dst_argb += 4;
- }
-}
-
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- int r = src_sobelx[i];
- int b = src_sobely[i];
- int s = clamp255(r + b);
- dst_y[i] = (uint8)(s);
- }
-}
-
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- int r = src_sobelx[i];
- int b = src_sobely[i];
- int g = clamp255(r + b);
- dst_argb[0] = (uint8)(b);
- dst_argb[1] = (uint8)(g);
- dst_argb[2] = (uint8)(r);
- dst_argb[3] = (uint8)(255u);
- dst_argb += 4;
- }
-}
-
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
- // Copy a Y to RGB.
- int x;
- for (x = 0; x < width; ++x) {
- uint8 y = src_y[0];
- dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
- dst_argb[3] = 255u;
- dst_argb += 4;
- ++src_y;
- }
-}
-
-// C reference code that mimics the YUV assembly.
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
- uint8* b, uint8* g, uint8* r) {
- int32 y1 = ((int32)(y) - 16) * YG;
- *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
- *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
- *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
-}
-
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-// C mimic assembly.
-// TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
- uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
- YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- src_y += 2;
- src_u += 2;
- src_v += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- }
-}
-#else
-void I444ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width; ++x) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- src_y += 1;
- src_u += 1;
- src_v += 1;
- rgb_buf += 4; // Advance 1 pixel.
- }
-}
-#endif
-// Also used for 420
-void I422ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- }
-}
-
-void I422ToRGB24Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 6; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- }
-}
-
-void I422ToRAWRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 6; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
- }
-}
-
-void I422ToARGB4444Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
- YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
- b0 = b0 >> 4;
- g0 = g0 >> 4;
- r0 = r0 >> 4;
- b1 = b1 >> 4;
- g1 = g1 >> 4;
- r1 = r1 >> 4;
- *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
- (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb4444 += 4; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
- b0 = b0 >> 4;
- g0 = g0 >> 4;
- r0 = r0 >> 4;
- *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
- 0xf000;
- }
-}
-
-void I422ToARGB1555Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
- YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
- b0 = b0 >> 3;
- g0 = g0 >> 3;
- r0 = r0 >> 3;
- b1 = b1 >> 3;
- g1 = g1 >> 3;
- r1 = r1 >> 3;
- *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb1555 += 4; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
- b0 = b0 >> 3;
- g0 = g0 >> 3;
- r0 = r0 >> 3;
- *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
- 0x8000;
- }
-}
-
-void I422ToRGB565Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
- int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
- YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
- b0 = b0 >> 3;
- g0 = g0 >> 2;
- r0 = r0 >> 3;
- b1 = b1 >> 3;
- g1 = g1 >> 2;
- r1 = r1 >> 3;
- *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27);
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_rgb565 += 4; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
- b0 = b0 >> 3;
- g0 = g0 >> 2;
- r0 = r0 >> 3;
- *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
- }
-}
-
-void I411ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 3; x += 4) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- YuvPixel(src_y[2], src_u[0], src_v[0],
- rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
- rgb_buf[11] = 255;
- YuvPixel(src_y[3], src_u[0], src_v[0],
- rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
- rgb_buf[15] = 255;
- src_y += 4;
- src_u += 1;
- src_v += 1;
- rgb_buf += 16; // Advance 4 pixels.
- }
- if (width & 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- src_y += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- }
-}
-
-void NV12ToARGBRow_C(const uint8* src_y,
- const uint8* usrc_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- src_y += 2;
- usrc_v += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- }
-}
-
-void NV21ToARGBRow_C(const uint8* src_y,
- const uint8* src_vu,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_vu[1], src_vu[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
-
- YuvPixel(src_y[1], src_vu[1], src_vu[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
-
- src_y += 2;
- src_vu += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_vu[1], src_vu[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- }
-}
-
-void NV12ToRGB565Row_C(const uint8* src_y,
- const uint8* usrc_v,
- uint8* dst_rgb565,
- int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
- YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
- b0 = b0 >> 3;
- g0 = g0 >> 2;
- r0 = r0 >> 3;
- b1 = b1 >> 3;
- g1 = g1 >> 2;
- r1 = r1 >> 3;
- *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27);
- src_y += 2;
- usrc_v += 2;
- dst_rgb565 += 4; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
- b0 = b0 >> 3;
- g0 = g0 >> 2;
- r0 = r0 >> 3;
- *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
- }
-}
-
-void NV21ToRGB565Row_C(const uint8* src_y,
- const uint8* vsrc_u,
- uint8* dst_rgb565,
- int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
- YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
- b0 = b0 >> 3;
- g0 = g0 >> 2;
- r0 = r0 >> 3;
- b1 = b1 >> 3;
- g1 = g1 >> 2;
- r1 = r1 >> 3;
- *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27);
- src_y += 2;
- vsrc_u += 2;
- dst_rgb565 += 4; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
- b0 = b0 >> 3;
- g0 = g0 >> 2;
- r0 = r0 >> 3;
- *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
- }
-}
-
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- src_yuy2 += 4;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- }
-}
-
-void UYVYToARGBRow_C(const uint8* src_uyvy,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- src_uyvy += 4;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- }
-}
-
-void I422ToBGRARow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
- rgb_buf[0] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
- rgb_buf[4] = 255;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
- rgb_buf[0] = 255;
- }
-}
-
-void I422ToABGRRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
- rgb_buf[7] = 255;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
- rgb_buf[3] = 255;
- }
-}
-
-void I422ToRGBARow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
- rgb_buf[0] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
- rgb_buf[4] = 255;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
- rgb_buf[0] = 255;
- }
-}
-
-void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], 128, 128,
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], 128, 128,
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
- rgb_buf[7] = 255;
- src_y += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], 128, 128,
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- rgb_buf[3] = 255;
- }
-}
-
-void MirrorRow_C(const uint8* src, uint8* dst, int width) {
- int x;
- src += width - 1;
- for (x = 0; x < width - 1; x += 2) {
- dst[x] = src[0];
- dst[x + 1] = src[-1];
- src -= 2;
- }
- if (width & 1) {
- dst[width - 1] = src[0];
- }
-}
-
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
- int x;
- src_uv += (width - 1) << 1;
- for (x = 0; x < width - 1; x += 2) {
- dst_u[x] = src_uv[0];
- dst_u[x + 1] = src_uv[-2];
- dst_v[x] = src_uv[1];
- dst_v[x + 1] = src_uv[-2 + 1];
- src_uv -= 4;
- }
- if (width & 1) {
- dst_u[width - 1] = src_uv[0];
- dst_v[width - 1] = src_uv[1];
- }
-}
-
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
- int x;
- const uint32* src32 = (const uint32*)(src);
- uint32* dst32 = (uint32*)(dst);
- src32 += width - 1;
- for (x = 0; x < width - 1; x += 2) {
- dst32[x] = src32[0];
- dst32[x + 1] = src32[-1];
- src32 -= 2;
- }
- if (width & 1) {
- dst32[width - 1] = src32[0];
- }
-}
-
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_u[x] = src_uv[0];
- dst_u[x + 1] = src_uv[2];
- dst_v[x] = src_uv[1];
- dst_v[x + 1] = src_uv[3];
- src_uv += 4;
- }
- if (width & 1) {
- dst_u[width - 1] = src_uv[0];
- dst_v[width - 1] = src_uv[1];
- }
-}
-
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_uv[0] = src_u[x];
- dst_uv[1] = src_v[x];
- dst_uv[2] = src_u[x + 1];
- dst_uv[3] = src_v[x + 1];
- dst_uv += 4;
- }
- if (width & 1) {
- dst_uv[0] = src_u[width - 1];
- dst_uv[1] = src_v[width - 1];
- }
-}
-
-void CopyRow_C(const uint8* src, uint8* dst, int count) {
- memcpy(dst, src, count);
-}
-
-void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
- memcpy(dst, src, count * 2);
-}
-
-void SetRow_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
- // VC will generate rep stosb.
- int x;
- for (x = 0; x < count; ++x) {
- dst[x] = v8;
- }
-#else
- memset(dst, v8, count);
-#endif
-}
-
-void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- int y;
- for (y = 0; y < height; ++y) {
- uint32* d = (uint32*)(dst);
- int x;
- for (x = 0; x < width; ++x) {
- d[x] = v32;
- }
- dst += dst_stride;
- }
-}
-
-// Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
- // Output a row of UV values, filtering 2 rows of YUY2.
- int x;
- for (x = 0; x < width; x += 2) {
- dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
- dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
- src_yuy2 += 4;
- dst_u += 1;
- dst_v += 1;
- }
-}
-
-// Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
- // Output a row of UV values.
- int x;
- for (x = 0; x < width; x += 2) {
- dst_u[0] = src_yuy2[1];
- dst_v[0] = src_yuy2[3];
- src_yuy2 += 4;
- dst_u += 1;
- dst_v += 1;
- }
-}
-
-// Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
- // Output a row of Y values.
- int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_y[x] = src_yuy2[0];
- dst_y[x + 1] = src_yuy2[2];
- src_yuy2 += 4;
- }
- if (width & 1) {
- dst_y[width - 1] = src_yuy2[0];
- }
-}
-
-// Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
- // Output a row of UV values.
- int x;
- for (x = 0; x < width; x += 2) {
- dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
- dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
- src_uyvy += 4;
- dst_u += 1;
- dst_v += 1;
- }
-}
-
-// Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_C(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
- // Output a row of UV values.
- int x;
- for (x = 0; x < width; x += 2) {
- dst_u[0] = src_uyvy[0];
- dst_v[0] = src_uyvy[2];
- src_uyvy += 4;
- dst_u += 1;
- dst_v += 1;
- }
-}
-
-// Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
- // Output a row of Y values.
- int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_y[x] = src_uyvy[1];
- dst_y[x + 1] = src_uyvy[3];
- src_uyvy += 4;
- }
- if (width & 1) {
- dst_y[width - 1] = src_uyvy[1];
- }
-}
-
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
-
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
-// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint32 fb = src_argb0[0];
- uint32 fg = src_argb0[1];
- uint32 fr = src_argb0[2];
- uint32 a = src_argb0[3];
- uint32 bb = src_argb1[0];
- uint32 bg = src_argb1[1];
- uint32 br = src_argb1[2];
- dst_argb[0] = BLEND(fb, bb, a);
- dst_argb[1] = BLEND(fg, bg, a);
- dst_argb[2] = BLEND(fr, br, a);
- dst_argb[3] = 255u;
-
- fb = src_argb0[4 + 0];
- fg = src_argb0[4 + 1];
- fr = src_argb0[4 + 2];
- a = src_argb0[4 + 3];
- bb = src_argb1[4 + 0];
- bg = src_argb1[4 + 1];
- br = src_argb1[4 + 2];
- dst_argb[4 + 0] = BLEND(fb, bb, a);
- dst_argb[4 + 1] = BLEND(fg, bg, a);
- dst_argb[4 + 2] = BLEND(fr, br, a);
- dst_argb[4 + 3] = 255u;
- src_argb0 += 8;
- src_argb1 += 8;
- dst_argb += 8;
- }
-
- if (width & 1) {
- uint32 fb = src_argb0[0];
- uint32 fg = src_argb0[1];
- uint32 fr = src_argb0[2];
- uint32 a = src_argb0[3];
- uint32 bb = src_argb1[0];
- uint32 bg = src_argb1[1];
- uint32 br = src_argb1[2];
- dst_argb[0] = BLEND(fb, bb, a);
- dst_argb[1] = BLEND(fg, bg, a);
- dst_argb[2] = BLEND(fr, br, a);
- dst_argb[3] = 255u;
- }
-}
-#undef BLEND
-#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
-
-// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
- int i;
- for (i = 0; i < width - 1; i += 2) {
- uint32 b = src_argb[0];
- uint32 g = src_argb[1];
- uint32 r = src_argb[2];
- uint32 a = src_argb[3];
- dst_argb[0] = ATTENUATE(b, a);
- dst_argb[1] = ATTENUATE(g, a);
- dst_argb[2] = ATTENUATE(r, a);
- dst_argb[3] = a;
- b = src_argb[4];
- g = src_argb[5];
- r = src_argb[6];
- a = src_argb[7];
- dst_argb[4] = ATTENUATE(b, a);
- dst_argb[5] = ATTENUATE(g, a);
- dst_argb[6] = ATTENUATE(r, a);
- dst_argb[7] = a;
- src_argb += 8;
- dst_argb += 8;
- }
-
- if (width & 1) {
- const uint32 b = src_argb[0];
- const uint32 g = src_argb[1];
- const uint32 r = src_argb[2];
- const uint32 a = src_argb[3];
- dst_argb[0] = ATTENUATE(b, a);
- dst_argb[1] = ATTENUATE(g, a);
- dst_argb[2] = ATTENUATE(r, a);
- dst_argb[3] = a;
- }
-}
-#undef ATTENUATE
-
-// Divide source RGB by alpha and store to destination.
-// b = (b * 255 + (a / 2)) / a;
-// g = (g * 255 + (a / 2)) / a;
-// r = (r * 255 + (a / 2)) / a;
-// Reciprocal method is off by 1 on some values. ie 125
-// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
-#define T(a) 0x01000000 + (0x10000 / a)
-const uint32 fixed_invtbl8[256] = {
- 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
- T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
- T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
- T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
- T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
- T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
- T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
- T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
- T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
- T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
- T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
- T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
- T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
- T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
- T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
- T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
- T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
- T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
- T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
- T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
- T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
- T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
- T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
- T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
- T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
- T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
- T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
- T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
- T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
- T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
- T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
- T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
-#undef T
-
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
- int i;
- for (i = 0; i < width; ++i) {
- uint32 b = src_argb[0];
- uint32 g = src_argb[1];
- uint32 r = src_argb[2];
- const uint32 a = src_argb[3];
- const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
- b = (b * ia) >> 8;
- g = (g * ia) >> 8;
- r = (r * ia) >> 8;
- // Clamping should not be necessary but is free in assembly.
- dst_argb[0] = clamp255(b);
- dst_argb[1] = clamp255(g);
- dst_argb[2] = clamp255(r);
- dst_argb[3] = a;
- src_argb += 4;
- dst_argb += 4;
- }
-}
-
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
- int32 row_sum[4] = {0, 0, 0, 0};
- int x;
- for (x = 0; x < width; ++x) {
- row_sum[0] += row[x * 4 + 0];
- row_sum[1] += row[x * 4 + 1];
- row_sum[2] += row[x * 4 + 2];
- row_sum[3] += row[x * 4 + 3];
- cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
- cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
- cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
- cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
- }
-}
-
-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
- int w, int area, uint8* dst, int count) {
- float ooa = 1.0f / area;
- int i;
- for (i = 0; i < count; ++i) {
- dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
- dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
- dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
- dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
- dst += 4;
- tl += 4;
- bl += 4;
- }
-}
-
-// Copy pixels from rotated source to destination row with a slope.
-LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width) {
- int i;
- // Render a row of pixels from source into a buffer.
- float uv[2];
- uv[0] = uv_dudv[0];
- uv[1] = uv_dudv[1];
- for (i = 0; i < width; ++i) {
- int x = (int)(uv[0]);
- int y = (int)(uv[1]);
- *(uint32*)(dst_argb) =
- *(const uint32*)(src_argb + y * src_argb_stride +
- x * 4);
- dst_argb += 4;
- uv[0] += uv_dudv[2];
- uv[1] += uv_dudv[3];
- }
-}
-
-// Blend 2 rows into 1 for conversions such as I422ToI420.
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- int x;
- for (x = 0; x < pix; ++x) {
- dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
- }
-}
-
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
- uint16* dst_uv, int pix) {
- int x;
- for (x = 0; x < pix; ++x) {
- dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
- }
-}
-
-// C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride,
- int width, int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
- int x;
- if (source_y_fraction == 0) {
- memcpy(dst_ptr, src_ptr, width);
- return;
- }
- if (source_y_fraction == 128) {
- HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
- return;
- }
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
- }
- if (width & 1) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- }
-}
-
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
- ptrdiff_t src_stride,
- int width, int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint16* src_ptr1 = src_ptr + src_stride;
- int x;
- if (source_y_fraction == 0) {
- memcpy(dst_ptr, src_ptr, width * 2);
- return;
- }
- if (source_y_fraction == 128) {
- HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
- return;
- }
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
- }
- if (width & 1) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- }
-}
-
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-void ARGBToBayerRow_C(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix) {
- int index0 = selector & 0xff;
- int index1 = (selector >> 8) & 0xff;
- // Copy a row of Bayer.
- int x;
- for (x = 0; x < pix - 1; x += 2) {
- dst_bayer[0] = src_argb[index0];
- dst_bayer[1] = src_argb[index1];
- src_argb += 8;
- dst_bayer += 2;
- }
- if (pix & 1) {
- dst_bayer[0] = src_argb[index0];
- }
-}
-
-// Select G channel from ARGB. e.g. GGGGGGGG
-void ARGBToBayerGGRow_C(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix) {
- // Copy a row of G.
- int x;
- for (x = 0; x < pix - 1; x += 2) {
- dst_bayer[0] = src_argb[1];
- dst_bayer[1] = src_argb[5];
- src_argb += 8;
- dst_bayer += 2;
- }
- if (pix & 1) {
- dst_bayer[0] = src_argb[1];
- }
-}
-
-// Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- int index0 = shuffler[0];
- int index1 = shuffler[1];
- int index2 = shuffler[2];
- int index3 = shuffler[3];
- // Shuffle a row of ARGB.
- int x;
- for (x = 0; x < pix; ++x) {
- // To support in-place conversion.
- uint8 b = src_argb[index0];
- uint8 g = src_argb[index1];
- uint8 r = src_argb[index2];
- uint8 a = src_argb[index3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- src_argb += 4;
- dst_argb += 4;
- }
-}
-
-void I422ToYUY2Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_frame[0] = src_y[0];
- dst_frame[1] = src_u[0];
- dst_frame[2] = src_y[1];
- dst_frame[3] = src_v[0];
- dst_frame += 4;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- }
- if (width & 1) {
- dst_frame[0] = src_y[0];
- dst_frame[1] = src_u[0];
- dst_frame[2] = src_y[0]; // duplicate last y
- dst_frame[3] = src_v[0];
- }
-}
-
-void I422ToUYVYRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_frame[0] = src_u[0];
- dst_frame[1] = src_y[0];
- dst_frame[2] = src_v[0];
- dst_frame[3] = src_y[1];
- dst_frame += 4;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- }
- if (width & 1) {
- dst_frame[0] = src_u[0];
- dst_frame[1] = src_y[0];
- dst_frame[2] = src_v[0];
- dst_frame[3] = src_y[0]; // duplicate last y
- }
-}
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
-// row_win.cc has asm version, but GCC uses 2 step wrapper.
-#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToRGB565Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
-}
-#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
-
-#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
-}
-
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
-}
-
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
- ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
- free_aligned_buffer_64(row);
-}
-
-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_rgb565,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
- ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
- free_aligned_buffer_64(row);
-}
-
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
- YUY2ToYRow_SSE2(src_yuy2, row_y, width);
- I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
- YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
- UYVYToYRow_SSE2(src_uyvy, row_y, width);
- I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
- UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
-#endif // !defined(LIBYUV_DISABLE_X86)
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- float b = (float)(src_argb[0]);
- float g = (float)(src_argb[1]);
- float r = (float)(src_argb[2]);
- float a = (float)(src_argb[3]);
- float b2 = b * b;
- float g2 = g * g;
- float r2 = r * r;
- float a2 = a * a;
- float db = poly[0] + poly[4] * b;
- float dg = poly[1] + poly[5] * g;
- float dr = poly[2] + poly[6] * r;
- float da = poly[3] + poly[7] * a;
- float b3 = b2 * b;
- float g3 = g2 * g;
- float r3 = r2 * r;
- float a3 = a2 * a;
- db += poly[8] * b2;
- dg += poly[9] * g2;
- dr += poly[10] * r2;
- da += poly[11] * a2;
- db += poly[12] * b3;
- dg += poly[13] * g3;
- dr += poly[14] * r3;
- da += poly[15] * a3;
-
- dst_argb[0] = Clamp((int32)(db));
- dst_argb[1] = Clamp((int32)(dg));
- dst_argb[2] = Clamp((int32)(dr));
- dst_argb[3] = Clamp((int32)(da));
- src_argb += 4;
- dst_argb += 4;
- }
-}
-
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
- const uint8* luma, uint32 lumacoeff) {
- uint32 bc = lumacoeff & 0xff;
- uint32 gc = (lumacoeff >> 8) & 0xff;
- uint32 rc = (lumacoeff >> 16) & 0xff;
-
- int i;
- for (i = 0; i < width - 1; i += 2) {
- // Luminance in rows, color values in columns.
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
- src_argb[2] * rc) & 0x7F00u) + luma;
- const uint8* luma1;
- dst_argb[0] = luma0[src_argb[0]];
- dst_argb[1] = luma0[src_argb[1]];
- dst_argb[2] = luma0[src_argb[2]];
- dst_argb[3] = src_argb[3];
- luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
- src_argb[6] * rc) & 0x7F00u) + luma;
- dst_argb[4] = luma1[src_argb[4]];
- dst_argb[5] = luma1[src_argb[5]];
- dst_argb[6] = luma1[src_argb[6]];
- dst_argb[7] = src_argb[7];
- src_argb += 8;
- dst_argb += 8;
- }
- if (width & 1) {
- // Luminance in rows, color values in columns.
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
- src_argb[2] * rc) & 0x7F00u) + luma;
- dst_argb[0] = luma0[src_argb[0]];
- dst_argb[1] = luma0[src_argb[1]];
- dst_argb[2] = luma0[src_argb[2]];
- dst_argb[3] = src_argb[3];
- }
-}
-
-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
- int i;
- for (i = 0; i < width - 1; i += 2) {
- dst[3] = src[3];
- dst[7] = src[7];
- dst += 8;
- src += 8;
- }
- if (width & 1) {
- dst[3] = src[3];
- }
-}
-
-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
- int i;
- for (i = 0; i < width - 1; i += 2) {
- dst[3] = src[0];
- dst[7] = src[1];
- dst += 8;
- src += 2;
- }
- if (width & 1) {
- dst[3] = src[0];
- }
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_mips.cc b/src/main/jni/libyuv/source/row_mips.cc
deleted file mode 100644
index da7183bc1..000000000
--- a/src/main/jni/libyuv/source/row_mips.cc
+++ /dev/null
@@ -1,994 +0,0 @@
-/*
- * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
- __asm__ __volatile__ (
- ".set noreorder \n"
- ".set noat \n"
- "slti $at, %[count], 8 \n"
- "bne $at ,$zero, $last8 \n"
- "xor $t8, %[src], %[dst] \n"
- "andi $t8, $t8, 0x3 \n"
-
- "bne $t8, $zero, unaligned \n"
- "negu $a3, %[dst] \n"
- // make dst/src aligned
- "andi $a3, $a3, 0x3 \n"
- "beq $a3, $zero, $chk16w \n"
- // word-aligned now count is the remining bytes count
- "subu %[count], %[count], $a3 \n"
-
- "lwr $t8, 0(%[src]) \n"
- "addu %[src], %[src], $a3 \n"
- "swr $t8, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
-
- // Now the dst/src are mutually word-aligned with word-aligned addresses
- "$chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, chk8w \n"
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n"
- // t0 is the "past the end" address
-
- // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
- // the "t0-32" address
- // This means: for x=128 the last "safe" a1 address is "t0-160"
- // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
- // we will use "pref 30,128(a1)", so "t0-160" is the limit
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line of src
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // In case the a1 > t9 don't use "pref 30" at all
- "sgtu $v1, %[dst], $t9 \n"
- "bgtz $v1, $loop16w \n"
- "nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lw $t0, 0(%[src]) \n"
- "bgtz $v1, $skip_pref30_96 \n" // skip
- "lw $t1, 4(%[src]) \n"
- "pref 30, 96(%[dst]) \n" // continue
- "$skip_pref30_96: \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lw $t0, 32(%[src]) \n"
- "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
- "lw $t1, 36(%[src]) \n"
- "pref 30, 128(%[dst]) \n" // set dest, addr 128
- "$skip_pref30_128: \n"
- "lw $t2, 40(%[src]) \n"
- "lw $t3, 44(%[src]) \n"
- "lw $t4, 48(%[src]) \n"
- "lw $t5, 52(%[src]) \n"
- "lw $t6, 56(%[src]) \n"
- "lw $t7, 60(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
- "sgtu $v1, %[dst], $t9 \n"
- "bne %[dst], $a3, $loop16w \n"
- " addiu %[src], %[src], 64 \n" // adding 64 to src
- "move %[count], $t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // the t8 is the reminder count past 32-bytes
- "beq %[count], $t8, chk1w \n"
- // count=t8,no 32-byte chunk
- " nop \n"
-
- "lw $t0, 0(%[src]) \n"
- "lw $t1, 4(%[src]) \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the reminder past 1w chunks
- "beq %[count], $t8, $last8 \n"
- " subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
- // copying in words (4-byte chunks)
- "$wordCopy_loop: \n"
- "lw $t3, 0(%[src]) \n"
- // the first t3 may be equal t0 ... optimize?
- "addiu %[src], %[src],4 \n"
- "addiu %[dst], %[dst],4 \n"
- "bne %[dst], $a3,$wordCopy_loop \n"
- " sw $t3, -4(%[dst]) \n"
-
- // For the last (<8) bytes
- "$last8: \n"
- "blez %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 -last dst address
- "$last8loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst], $a3, $last8loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "leave: \n"
- " j $ra \n"
- " nop \n"
-
- //
- // UNALIGNED case
- //
-
- "unaligned: \n"
- // got here with a3="negu a1"
- "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
- "beqz $a3, $ua_chk16w \n"
- " subu %[count], %[count], $a3 \n"
- // bytes left after initial a3 bytes
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
- "swr $v1, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
- // below the dst will be word aligned (NOTE1)
- "$ua_chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, ua_chk8w \n"
- // if a2==t8, no 64-byte chunks
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n" // t0 "past the end"
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line addr 32
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // safe, as we have at least 64 bytes ahead
- // In case the a1 > t9 don't use "pref 30" at all
- "sgtu $v1, %[dst], $t9 \n"
- "bgtz $v1, $ua_loop16w \n"
- // skip "pref 30,64(a1)" for too short arrays
- " nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$ua_loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "bgtz $v1, $ua_skip_pref30_96 \n"
- " lwl $t1, 7(%[src]) \n"
- "pref 30, 96(%[dst]) \n"
- // continue setting up the dest, addr 96
- "$ua_skip_pref30_96: \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lwr $t0, 32(%[src]) \n"
- "lwl $t0, 35(%[src]) \n"
- "lwr $t1, 36(%[src]) \n"
- "bgtz $v1, ua_skip_pref30_128 \n"
- " lwl $t1, 39(%[src]) \n"
- "pref 30, 128(%[dst]) \n"
- // continue setting up the dest, addr 128
- "ua_skip_pref30_128: \n"
-
- "lwr $t2, 40(%[src]) \n"
- "lwl $t2, 43(%[src]) \n"
- "lwr $t3, 44(%[src]) \n"
- "lwl $t3, 47(%[src]) \n"
- "lwr $t4, 48(%[src]) \n"
- "lwl $t4, 51(%[src]) \n"
- "lwr $t5, 52(%[src]) \n"
- "lwl $t5, 55(%[src]) \n"
- "lwr $t6, 56(%[src]) \n"
- "lwl $t6, 59(%[src]) \n"
- "lwr $t7, 60(%[src]) \n"
- "lwl $t7, 63(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst],%[dst],64 \n" // adding 64 to dest
- "sgtu $v1,%[dst],$t9 \n"
- "bne %[dst],$a3,$ua_loop16w \n"
- " addiu %[src],%[src],64 \n" // adding 64 to src
- "move %[count],$t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "ua_chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // the t8 is the reminder count
- "beq %[count], $t8, $ua_chk1w \n"
- // when count==t8, no 32-byte chunk
-
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "lwl $t1, 7(%[src]) \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "$ua_chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the reminder past 1w chunks
- "beq %[count], $t8, ua_smallCopy \n"
- "subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
-
- // copying in words (4-byte chunks)
- "$ua_wordCopy_loop: \n"
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addiu %[src], %[src], 4 \n"
- "addiu %[dst], %[dst], 4 \n"
- // note: dst=a1 is word aligned here, see NOTE1
- "bne %[dst], $a3, $ua_wordCopy_loop \n"
- " sw $v1,-4(%[dst]) \n"
-
- // Now less than 4 bytes (value in count) left to copy
- "ua_smallCopy: \n"
- "beqz %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 = last dst address
- "$ua_smallCopy_loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst],$a3,$ua_smallCopy_loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "j $ra \n"
- " nop \n"
- ".set at \n"
- ".set reorder \n"
- : [dst] "+r" (dst), [src] "+r" (src)
- : [count] "r" (count)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
- "t8", "t9", "a3", "v1", "at"
- );
-}
-#endif // HAS_COPYROW_MIPS
-
-// MIPS DSPR2 functions
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-
-void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- ".p2align 2 \n"
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
- "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
- "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
- "addiu %[src_uv], %[src_uv], 32 \n"
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
- "sw $t9, 0(%[dst_v]) \n"
- "sw $t0, 0(%[dst_u]) \n"
- "sw $t1, 4(%[dst_v]) \n"
- "sw $t2, 4(%[dst_u]) \n"
- "sw $t3, 8(%[dst_v]) \n"
- "sw $t5, 8(%[dst_u]) \n"
- "sw $t6, 12(%[dst_v]) \n"
- "sw $t7, 12(%[dst_u]) \n"
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [width] "+r" (width),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6", "t7", "t8", "t9"
- );
-}
-
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
- uint8* dst_v, int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- ".p2align 2 \n"
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lwr $t0, 0(%[src_uv]) \n"
- "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lwr $t1, 4(%[src_uv]) \n"
- "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lwr $t2, 8(%[src_uv]) \n"
- "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lwr $t3, 12(%[src_uv]) \n"
- "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lwr $t5, 16(%[src_uv]) \n"
- "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lwr $t6, 20(%[src_uv]) \n"
- "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
- "lwr $t7, 24(%[src_uv]) \n"
- "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
- "lwr $t8, 28(%[src_uv]) \n"
- "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
- "addiu %[src_uv], %[src_uv], 32 \n"
- "swr $t9, 0(%[dst_v]) \n"
- "swl $t9, 3(%[dst_v]) \n"
- "swr $t0, 0(%[dst_u]) \n"
- "swl $t0, 3(%[dst_u]) \n"
- "swr $t1, 4(%[dst_v]) \n"
- "swl $t1, 7(%[dst_v]) \n"
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n"
- "swr $t3, 8(%[dst_v]) \n"
- "swl $t3, 11(%[dst_v]) \n"
- "swr $t5, 8(%[dst_u]) \n"
- "swl $t5, 11(%[dst_u]) \n"
- "swr $t6, 12(%[dst_v]) \n"
- "swl $t6, 15(%[dst_v]) \n"
- "swr $t7, 12(%[dst_u]) \n"
- "swl $t7, 15(%[dst_u]) \n"
- "addiu %[dst_u], %[dst_u], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_v], %[dst_v], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [width] "+r" (width),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6", "t7", "t8", "t9"
- );
-}
-
-void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "andi $t5, %[width], 0xf \n"
- "blez $t4, 2f \n"
- " addu %[src], %[src], %[width] \n" // src += width
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, -16(%[src]) \n" // |3|2|1|0|
- "lw $t1, -12(%[src]) \n" // |7|6|5|4|
- "lw $t2, -8(%[src]) \n" // |11|10|9|8|
- "lw $t3, -4(%[src]) \n" // |15|14|13|12|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t1, $t1 \n" // |6|7|4|5|
- "wsbh $t2, $t2 \n" // |10|11|8|9|
- "wsbh $t3, $t3 \n" // |14|15|12|13|
- "rotr $t0, $t0, 16 \n" // |0|1|2|3|
- "rotr $t1, $t1, 16 \n" // |4|5|6|7|
- "rotr $t2, $t2, 16 \n" // |8|9|10|11|
- "rotr $t3, $t3, 16 \n" // |12|13|14|15|
- "addiu %[src], %[src], -16 \n"
- "addiu $t4, $t4, -1 \n"
- "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
- "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
- "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
- "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
- "bgtz $t4, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
- "beqz $t5, 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -1(%[src]) \n"
- "addiu $t5, $t5, -1 \n"
- "addiu %[src], %[src], -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgez $t5, 2b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src] "+r" (src), [dst] "+r" (dst)
- : [width] "r" (width)
- : "t0", "t1", "t2", "t3", "t4", "t5"
- );
-}
-
-void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- int x = 0;
- int y = 0;
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "addu $t4, %[width], %[width] \n"
- "srl %[x], %[width], 4 \n"
- "andi %[y], %[width], 0xf \n"
- "blez %[x], 2f \n"
- " addu %[src_uv], %[src_uv], $t4 \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
- "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
- "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
- "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
- "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
- "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
- "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
- "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
-
- "rotr $t0, $t0, 16 \n" // |1|0|3|2|
- "rotr $t1, $t1, 16 \n" // |5|4|7|6|
- "rotr $t2, $t2, 16 \n" // |9|8|11|10|
- "rotr $t3, $t3, 16 \n" // |13|12|15|14|
- "rotr $t4, $t4, 16 \n" // |17|16|19|18|
- "rotr $t6, $t6, 16 \n" // |21|20|23|22|
- "rotr $t7, $t7, 16 \n" // |25|24|27|26|
- "rotr $t8, $t8, 16 \n" // |29|28|31|30|
- "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
- "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
- "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
- "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
- "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
- "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
- "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
- "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
- "addiu %[src_uv], %[src_uv], -32 \n"
- "addiu %[x], %[x], -1 \n"
- "swr $t4, 0(%[dst_u]) \n"
- "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
- "swr $t6, 0(%[dst_v]) \n"
- "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
- "swr $t3, 4(%[dst_v]) \n"
- "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
- "swr $t0, 8(%[dst_u]) \n"
- "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
- "swr $t1, 8(%[dst_v]) \n"
- "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
- "swr $t9, 12(%[dst_u]) \n"
- "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
- "swr $t5, 12(%[dst_v]) \n"
- "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz %[x], 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
- "beqz %[y], 3f \n"
- " nop \n"
- "b 2f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -2(%[src_uv]) \n"
- "lbu $t1, -1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], -2 \n"
- "addiu %[y], %[y], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[y], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v),
- [x] "=&r" (x),
- [y] "+r" (y)
- : [width] "r" (width)
- : "t0", "t1", "t2", "t3", "t4",
- "t5", "t7", "t8", "t9"
- );
-}
-
-// Convert (4 Y and 2 VU) I422 and arrange RGB values into
-// t5 = | 0 | B0 | 0 | b0 |
-// t4 = | 0 | B1 | 0 | b1 |
-// t9 = | 0 | G0 | 0 | g0 |
-// t8 = | 0 | G1 | 0 | g1 |
-// t2 = | 0 | R0 | 0 | r0 |
-// t1 = | 0 | R1 | 0 | r1 |
-#define I422ToTransientMipsRGB \
- "lw $t0, 0(%[y_buf]) \n" \
- "lhu $t1, 0(%[u_buf]) \n" \
- "lhu $t2, 0(%[v_buf]) \n" \
- "preceu.ph.qbr $t1, $t1 \n" \
- "preceu.ph.qbr $t2, $t2 \n" \
- "preceu.ph.qbra $t3, $t0 \n" \
- "preceu.ph.qbla $t0, $t0 \n" \
- "subu.ph $t1, $t1, $s5 \n" \
- "subu.ph $t2, $t2, $s5 \n" \
- "subu.ph $t3, $t3, $s4 \n" \
- "subu.ph $t0, $t0, $s4 \n" \
- "mul.ph $t3, $t3, $s0 \n" \
- "mul.ph $t0, $t0, $s0 \n" \
- "shll.ph $t4, $t1, 0x7 \n" \
- "subu.ph $t4, $t4, $t1 \n" \
- "mul.ph $t6, $t1, $s1 \n" \
- "mul.ph $t1, $t2, $s2 \n" \
- "addq_s.ph $t5, $t4, $t3 \n" \
- "addq_s.ph $t4, $t4, $t0 \n" \
- "shra.ph $t5, $t5, 6 \n" \
- "shra.ph $t4, $t4, 6 \n" \
- "addiu %[u_buf], 2 \n" \
- "addiu %[v_buf], 2 \n" \
- "addu.ph $t6, $t6, $t1 \n" \
- "mul.ph $t1, $t2, $s3 \n" \
- "addu.ph $t9, $t6, $t3 \n" \
- "addu.ph $t8, $t6, $t0 \n" \
- "shra.ph $t9, $t9, 6 \n" \
- "shra.ph $t8, $t8, 6 \n" \
- "addu.ph $t2, $t1, $t3 \n" \
- "addu.ph $t1, $t1, $t0 \n" \
- "shra.ph $t2, $t2, 6 \n" \
- "shra.ph $t1, $t1, 6 \n" \
- "subu.ph $t5, $t5, $s5 \n" \
- "subu.ph $t4, $t4, $s5 \n" \
- "subu.ph $t9, $t9, $s5 \n" \
- "subu.ph $t8, $t8, $s5 \n" \
- "subu.ph $t2, $t2, $s5 \n" \
- "subu.ph $t1, $t1, $s5 \n" \
- "shll_s.ph $t5, $t5, 8 \n" \
- "shll_s.ph $t4, $t4, 8 \n" \
- "shll_s.ph $t9, $t9, 8 \n" \
- "shll_s.ph $t8, $t8, 8 \n" \
- "shll_s.ph $t2, $t2, 8 \n" \
- "shll_s.ph $t1, $t1, 8 \n" \
- "shra.ph $t5, $t5, 8 \n" \
- "shra.ph $t4, $t4, 8 \n" \
- "shra.ph $t9, $t9, 8 \n" \
- "shra.ph $t8, $t8, 8 \n" \
- "shra.ph $t2, $t2, 8 \n" \
- "shra.ph $t1, $t1, 8 \n" \
- "addu.ph $t5, $t5, $s5 \n" \
- "addu.ph $t4, $t4, $s5 \n" \
- "addu.ph $t9, $t9, $s5 \n" \
- "addu.ph $t8, $t8, $s5 \n" \
- "addu.ph $t2, $t2, $s5 \n" \
- "addu.ph $t1, $t1, $s5 \n"
-
-void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
- "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
- "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
- "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
- "repl.ph $s4, 16 \n" // |0|16|0|16|
- "repl.ph $s5, 128 \n" // |128|128| // clipping
- "lui $s6, 0xff00 \n"
- "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
-
- ".p2align 2 \n"
- "1: \n"
- I422ToTransientMipsRGB
-// Arranging into argb format
- "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
- "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
- "addiu %[width], -4 \n"
- "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
- "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
- "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
-
- "addiu %[y_buf], 4 \n"
- "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
- "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
- "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
- "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
- "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
- "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
- "sll $t9, $t9, 16 \n"
- "sll $t8, $t8, 16 \n"
- "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
- "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
-// Store results.
- "sw $t2, 0(%[rgb_buf]) \n"
- "sw $t0, 4(%[rgb_buf]) \n"
- "sw $t1, 8(%[rgb_buf]) \n"
- "sw $t3, 12(%[rgb_buf]) \n"
- "bnez %[width], 1b \n"
- " addiu %[rgb_buf], 16 \n"
- "2: \n"
- ".set pop \n"
- :[y_buf] "+r" (y_buf),
- [u_buf] "+r" (u_buf),
- [v_buf] "+r" (v_buf),
- [width] "+r" (width),
- [rgb_buf] "+r" (rgb_buf)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3",
- "s4", "s5", "s6"
- );
-}
-
-void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
- "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
- "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
- "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
- "repl.ph $s4, 16 \n" // |0|16|0|16|
- "repl.ph $s5, 128 \n" // |128|128|
- "lui $s6, 0xff00 \n"
- "ori $s6, 0xff00 \n" // |ff|00|ff|00|
-
- ".p2align 2 \n"
- "1: \n"
- I422ToTransientMipsRGB
-// Arranging into abgr format
- "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
- "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
- "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
- "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
-
- "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
- "addiu %[width], -4 \n"
- "addiu %[y_buf], 4 \n"
- "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
- "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
- "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
- "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
- "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
- "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
- "sll $t9, $t9, 16 \n"
- "sll $t8, $t8, 16 \n"
- "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
- "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
-// Store results.
- "sw $t2, 0(%[rgb_buf]) \n"
- "sw $t0, 4(%[rgb_buf]) \n"
- "sw $t1, 8(%[rgb_buf]) \n"
- "sw $t3, 12(%[rgb_buf]) \n"
- "bnez %[width], 1b \n"
- " addiu %[rgb_buf], 16 \n"
- "2: \n"
- ".set pop \n"
- :[y_buf] "+r" (y_buf),
- [u_buf] "+r" (u_buf),
- [v_buf] "+r" (v_buf),
- [width] "+r" (width),
- [rgb_buf] "+r" (rgb_buf)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3",
- "s4", "s5", "s6"
- );
-}
-
-void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
- "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
- "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
- "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
- "repl.ph $s4, 16 \n" // |0|16|0|16|
- "repl.ph $s5, 128 \n" // |128|128|
- "lui $s6, 0xff \n"
- "ori $s6, 0xff \n" // |00|ff|00|ff|
-
- ".p2align 2 \n"
- "1: \n"
- I422ToTransientMipsRGB
- // Arranging into bgra format
- "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
- "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
- "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
- "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
-
- "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
- "addiu %[width], -4 \n"
- "addiu %[y_buf], 4 \n"
- "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
- "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
- "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
- "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
- "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
- "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
- "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
- "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
- "sll $t1, $t1, 16 \n"
- "sll $t2, $t2, 16 \n"
- "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
- "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
-// Store results.
- "sw $t2, 0(%[rgb_buf]) \n"
- "sw $t0, 4(%[rgb_buf]) \n"
- "sw $t1, 8(%[rgb_buf]) \n"
- "sw $t3, 12(%[rgb_buf]) \n"
- "bnez %[width], 1b \n"
- " addiu %[rgb_buf], 16 \n"
- "2: \n"
- ".set pop \n"
- :[y_buf] "+r" (y_buf),
- [u_buf] "+r" (u_buf),
- [v_buf] "+r" (v_buf),
- [width] "+r" (width),
- [rgb_buf] "+r" (rgb_buf)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3",
- "s4", "s5", "s6"
- );
-}
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- int y0_fraction = 256 - source_y_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
-
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "replv.ph $t0, %[y0_fraction] \n"
- "replv.ph $t1, %[source_y_fraction] \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t2, 0(%[src_ptr]) \n"
- "lw $t3, 0(%[src_ptr1]) \n"
- "lw $t4, 4(%[src_ptr]) \n"
- "lw $t5, 4(%[src_ptr1]) \n"
- "muleu_s.ph.qbl $t6, $t2, $t0 \n"
- "muleu_s.ph.qbr $t7, $t2, $t0 \n"
- "muleu_s.ph.qbl $t8, $t3, $t1 \n"
- "muleu_s.ph.qbr $t9, $t3, $t1 \n"
- "muleu_s.ph.qbl $t2, $t4, $t0 \n"
- "muleu_s.ph.qbr $t3, $t4, $t0 \n"
- "muleu_s.ph.qbl $t4, $t5, $t1 \n"
- "muleu_s.ph.qbr $t5, $t5, $t1 \n"
- "addq.ph $t6, $t6, $t8 \n"
- "addq.ph $t7, $t7, $t9 \n"
- "addq.ph $t2, $t2, $t4 \n"
- "addq.ph $t3, $t3, $t5 \n"
- "shra.ph $t6, $t6, 8 \n"
- "shra.ph $t7, $t7, 8 \n"
- "shra.ph $t2, $t2, 8 \n"
- "shra.ph $t3, $t3, 8 \n"
- "precr.qb.ph $t6, $t6, $t7 \n"
- "precr.qb.ph $t2, $t2, $t3 \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[src_ptr1], %[src_ptr1], 8 \n"
- "addiu %[dst_width], %[dst_width], -8 \n"
- "sw $t6, 0(%[dst_ptr]) \n"
- "sw $t2, 4(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[dst_ptr], %[dst_ptr], 8 \n"
-
- ".set pop \n"
- : [dst_ptr] "+r" (dst_ptr),
- [src_ptr1] "+r" (src_ptr1),
- [src_ptr] "+r" (src_ptr),
- [dst_width] "+r" (dst_width)
- : [source_y_fraction] "r" (source_y_fraction),
- [y0_fraction] "r" (y0_fraction),
- [src_stride] "r" (src_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
-}
-#endif // __mips_dsp_rev >= 2
-
-#endif // defined(__mips__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_neon.cc b/src/main/jni/libyuv/source/row_neon.cc
deleted file mode 100644
index 1392cf5fc..000000000
--- a/src/main/jni/libyuv/source/row_neon.cc
+++ /dev/null
@@ -1,3148 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.32 {d2[0]}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.32 {d2[1]}, [%2]! \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.16 {d2[0]}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.16 {d2[1]}, [%2]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vzip.u8 d2, d3 \n"
-
-// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 YUY2
-#define READYUY2 \
- MEMACCESS(0) \
- "vld2.8 {d0, d2}, [%0]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 UYVY
-#define READUYVY \
- MEMACCESS(0) \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUV422TORGB \
- "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
- "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
- "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
- "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
- "vtrn.u8 d0, d1 \n" \
- "vsub.s16 q0, q0, q15 \n"/* offset y */\
- "vmul.s16 q0, q0, q14 \n" \
- "vadd.s16 d18, d19 \n" \
- "vqadd.s16 d20, d0, d16 \n" /* B */ \
- "vqadd.s16 d21, d1, d16 \n" \
- "vqadd.s16 d22, d0, d17 \n" /* R */ \
- "vqadd.s16 d23, d1, d17 \n" \
- "vqadd.s16 d16, d0, d18 \n" /* G */ \
- "vqadd.s16 d17, d1, d18 \n" \
- "vqshrun.s16 d0, q10, #6 \n" /* B */ \
- "vqshrun.s16 d1, q11, #6 \n" /* G */ \
- "vqshrun.s16 d2, q8, #6 \n" /* R */ \
- "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
- "vmovl.u8 q11, d1 \n" \
- "vmovl.u8 q8, d2 \n" \
- "vtrn.u8 d20, d21 \n" \
- "vtrn.u8 d22, d23 \n" \
- "vtrn.u8 d16, d17 \n" \
- "vmov.u8 d21, d16 \n"
-
-static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-
-void I444ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV444
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I422ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I411ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV411
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I422ToBGRARow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_bgra,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d19, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_bgra), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I422ToABGRRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_abgr,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_abgr), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I422ToRGBARow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I422ToRGB24Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I422ToRAWRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_raw,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_raw), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-#define ARGBTORGB565 \
- "vshr.u8 d20, d20, #3 \n" /* B */ \
- "vshr.u8 d21, d21, #2 \n" /* G */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #11 \n" /* R */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q0, q0, q10 \n" /* BGR */
-
-void I422ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- ARGBTORGB565
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-#define ARGBTOARGB1555 \
- "vshr.u8 q10, q10, #3 \n" /* B */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vshr.u8 d23, d23, #7 \n" /* A */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vmovl.u8 q11, d23 \n" /* A */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #10 \n" /* R */ \
- "vshl.u16 q11, q11, #15 \n" /* A */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q1, q10, q11 \n" /* RA */ \
- "vorr q0, q0, q1 \n" /* BGRA */
-
-void I422ToARGB1555Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- ARGBTOARGB1555
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
- "vzip.u8 d0, d1 \n" /* BGRA */
-
-void I422ToARGB4444Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- ARGBTOARGB4444
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void YToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV400
- YUV422TORGB
- "subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void I400ToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
- asm volatile (
- ".p2align 2 \n"
- "vmov.u8 d23, #255 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d20", "d21", "d22", "d23"
- );
-}
-
-void NV12ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV12
- YUV422TORGB
- "subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV21
- YUV422TORGB
- "subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void NV12ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV12
- YUV422TORGB
- "subs %3, %3, #8 \n"
- ARGBTORGB565
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void NV21ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV21
- YUV422TORGB
- "subs %3, %3, #8 \n"
- ARGBTORGB565
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUY2
- YUV422TORGB
- "subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READUYVY
- YUV422TORGB
- "subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store U
- MEMACCESS(2)
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load U
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(2)
- "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
- :
- "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- MEMACCESS(1)
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
- asm volatile (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "cc", "memory", "q0"
- );
-}
-
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- SetRow_NEON(dst, v32, width << 2);
- dst += dst_stride;
- }
-}
-
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
- );
-}
-
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- asm volatile (
- // Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "r12", "q0"
- );
-}
-
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
- );
-}
-
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
- asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
- asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-#define RGB565TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
- "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
- "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-#define ARGB1555TOARGB \
- "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
- "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
- "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
- "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
- "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
- "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
- "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
- "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
- "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
- "vorr.u8 q1, q1, q3 \n" /* R,A */ \
- "vorr.u8 q0, q0, q2 \n" /* B,G */ \
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
- "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
- "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
- int pix) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-#define ARGB4444TOARGB \
- "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
- "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
- "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
- "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
- "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
- "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
- "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
- "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
-
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
- int pix) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- MEMACCESS(2)
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
-}
-
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- MEMACCESS(2)
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
-}
-
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // stride + src_yuy2
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- MEMACCESS(3)
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(stride_yuy2), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
- );
-}
-
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // stride + src_uyvy
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- MEMACCESS(3)
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(stride_uyvy), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
- );
-}
-
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
- "vrhadd.u8 q0, q1 \n" // average row 1 and 2
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(src_uv_stride), // %1
- "+r"(dst_uv), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "vmov.u32 d6[0], %3 \n" // selector
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
- "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
- "vtrn.u32 d4, d5 \n" // combine 8 pixels
- MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n" // store 8.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "r"(selector) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-// Select G channels from ARGB. e.g. GGGGGGGG
-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 /*selector*/, int pix) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 G's.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // shuffler
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-void I422ToYUY2Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- MEMACCESS(2)
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3"
- );
-}
-
-void I422ToUYVYRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- MEMACCESS(2)
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3"
- );
-}
-
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
-}
-
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
- int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
-}
-
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
- int pix) {
- asm volatile (
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
-}
-
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
- );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
- );
-}
-
-// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
-
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
-
- "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
-
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
- );
-}
-
-// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
-
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
-
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q0, q10 \n" // B
- "vmls.s16 q8, q1, q11 \n" // G
- "vmls.s16 q8, q2, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
-
- "vmul.s16 q9, q2, q10 \n" // R
- "vmls.s16 q9, q1, q14 \n" // G
- "vmls.s16 q9, q0, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
-
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
-
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(0)
- "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
- "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
- "vpadd.u16 d1, d8, d9 \n" // B
- "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
- "vpadd.u16 d3, d10, d11 \n" // G
- "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
- "vpadd.u16 d5, d12, d13 \n" // R
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %3, %3, #32 \n" // 32 processed per loop.
- "vmul.s16 q8, q0, q10 \n" // B
- "vmls.s16 q8, q1, q11 \n" // G
- "vmls.s16 q8, q2, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q2, q10 \n" // R
- "vmls.s16 q9, q1, q14 \n" // G
- "vmls.s16 q9, q0, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
-
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q3, q2, q1)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(src_stride_bgra), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(src_stride_abgr), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(src_stride_rgba), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(src_stride_rgb24), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(src_stride_raw), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_stride_rgb565), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_stride_argb1555), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_stride_argb4444), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
-}
-
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
-}
-
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
-}
-
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
-}
-
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
-}
-
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
-}
-
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
-}
-
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
-}
-
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- asm volatile (
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
- :
- : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
- );
-}
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- "subs %3, #8 \n"
- "blt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
-
- "89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
-
- // Blend 1 pixels.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- MEMACCESS(1)
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- MEMACCESS(2)
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
- );
-}
-
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- // Attenuate 8 pixels.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
- "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
- "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
- "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
- );
-}
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- asm volatile (
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
-
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
- "vqdmulh.s16 q0, q0, q8 \n" // b * scale
- "vqdmulh.s16 q1, q1, q8 \n" // g
- "vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- MEMACCESS(0)
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
- );
-}
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
-// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- asm volatile (
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
-
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
- "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
- "vqrdmulh.s16 q11, q11, d0[1] \n" // g
- "vqrdmulh.s16 q12, q12, d0[2] \n" // r
- "vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
- );
-}
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
- );
-}
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- MEMACCESS(0)
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
-// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q15, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
- "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
- "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
- "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- MEMACCESS(1)
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
-}
-#endif // HAS_ARGBMULTIPLYROW_NEON
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
-}
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1"
- );
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
- asm volatile (
- // 16 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1"
- );
-}
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1"
- );
-}
-
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0],%5 \n" // top
- MEMACCESS(0)
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- MEMACCESS(1)
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- MEMACCESS(1)
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- MEMACCESS(2)
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- MEMACCESS(2)
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- MEMACCESS(3)
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2), // %5
- "r"(6) // %6
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0],%4 \n" // left
- MEMACCESS(1)
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- MEMACCESS(1)
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0],%5 \n" // right
- MEMACCESS(1)
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1), // %4
- "r"(6) // %5
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-#endif // __ARM_NEON__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_neon64.cc b/src/main/jni/libyuv/source/row_neon64.cc
deleted file mode 100644
index 952e10d73..000000000
--- a/src/main/jni/libyuv/source/row_neon64.cc
+++ /dev/null
@@ -1,3327 +0,0 @@
-/*
- * Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.32 {d2[0]}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.32 {d2[1]}, [%2]! \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.16 {d2[0]}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.16 {d2[1]}, [%2]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vzip.u8 d2, d3 \n"
-
-// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 YUY2
-#define READYUY2 \
- MEMACCESS(0) \
- "vld2.8 {d0, d2}, [%0]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 UYVY
-#define READUYVY \
- MEMACCESS(0) \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUV422TORGB \
- "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
- "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
- "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
- "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
- "vtrn.u8 d0, d1 \n" \
- "vsub.s16 q0, q0, q15 \n"/* offset y */\
- "vmul.s16 q0, q0, q14 \n" \
- "vadd.s16 d18, d19 \n" \
- "vqadd.s16 d20, d0, d16 \n" /* B */ \
- "vqadd.s16 d21, d1, d16 \n" \
- "vqadd.s16 d22, d0, d17 \n" /* R */ \
- "vqadd.s16 d23, d1, d17 \n" \
- "vqadd.s16 d16, d0, d18 \n" /* G */ \
- "vqadd.s16 d17, d1, d18 \n" \
- "vqshrun.s16 d0, q10, #6 \n" /* B */ \
- "vqshrun.s16 d1, q11, #6 \n" /* G */ \
- "vqshrun.s16 d2, q8, #6 \n" /* R */ \
- "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
- "vmovl.u8 q11, d1 \n" \
- "vmovl.u8 q8, d2 \n" \
- "vtrn.u8 d20, d21 \n" \
- "vtrn.u8 d22, d23 \n" \
- "vtrn.u8 d16, d17 \n" \
- "vmov.u8 d21, d16 \n"
-
-static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-
-#ifdef HAS_I444TOARGBROW_NEON
-void I444ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV444
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I444TOARGBROW_NEON
-
-#ifdef HAS_I422TOARGBROW_NEON
-void I422ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TOARGBROW_NEON
-
-#ifdef HAS_I411TOARGBROW_NEON
-void I411ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV411
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I411TOARGBROW_NEON
-
-#ifdef HAS_I422TOBGRAROW_NEON
-void I422ToBGRARow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_bgra,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d19, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_bgra), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TOBGRAROW_NEON
-
-#ifdef HAS_I422TOABGRROW_NEON
-void I422ToABGRRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_abgr,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_abgr), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TOABGRROW_NEON
-
-#ifdef HAS_I422TORGBAROW_NEON
-void I422ToRGBARow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n"
- MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TORGBAROW_NEON
-
-#ifdef HAS_I422TORGB24ROW_NEON
-void I422ToRGB24Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TORGB24ROW_NEON
-
-#ifdef HAS_I422TORAWROW_NEON
-void I422ToRAWRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_raw,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_raw), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TORAWROW_NEON
-
-#define ARGBTORGB565 \
- "vshr.u8 d20, d20, #3 \n" /* B */ \
- "vshr.u8 d21, d21, #2 \n" /* G */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #11 \n" /* R */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q0, q0, q10 \n" /* BGR */
-
-#ifdef HAS_I422TORGB565ROW_NEON
-void I422ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- ARGBTORGB565
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TORGB565ROW_NEON
-
-#define ARGBTOARGB1555 \
- "vshr.u8 q10, q10, #3 \n" /* B */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vshr.u8 d23, d23, #7 \n" /* A */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vmovl.u8 q11, d23 \n" /* A */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #10 \n" /* R */ \
- "vshl.u16 q11, q11, #15 \n" /* A */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q1, q10, q11 \n" /* RA */ \
- "vorr q0, q0, q1 \n" /* BGRA */
-
-#ifdef HAS_I422TOARGB1555ROW_NEON
-void I422ToARGB1555Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- ARGBTOARGB1555
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TOARGB1555ROW_NEON
-
-#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
- "vzip.u8 d0, d1 \n" /* BGRA */
-
-#ifdef HAS_I422TOARGB4444ROW_NEON
-void I422ToARGB4444Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- int width) {
- asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- ARGBTOARGB4444
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TOARGB4444ROW_NEON
-
-#ifdef HAS_YTOARGBROW_NEON
-void YToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV400
- YUV422TORGB
- "subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_YTOARGBROW_NEON
-
-#ifdef HAS_I400TOARGBROW_NEON
-void I400ToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
- asm volatile (
- ".p2align 2 \n"
- "vmov.u8 d23, #255 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d20", "d21", "d22", "d23"
- );
-}
-#endif // HAS_I400TOARGBROW_NEON
-
-#ifdef HAS_NV12TOARGBROW_NEON
-void NV12ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV12
- YUV422TORGB
- "subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_NV12TOARGBROW_NEON
-
-#ifdef HAS_NV21TOARGBROW_NEON
-void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV21
- YUV422TORGB
- "subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_NV21TOARGBROW_NEON
-
-#ifdef HAS_NV12TORGB565ROW_NEON
-void NV12ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV12
- YUV422TORGB
- "subs %3, %3, #8 \n"
- ARGBTORGB565
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_NV12TORGB565ROW_NEON
-
-#ifdef HAS_NV21TORGB565ROW_NEON
-void NV21ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- int width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READNV21
- YUV422TORGB
- "subs %3, %3, #8 \n"
- ARGBTORGB565
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_NV21TORGB565ROW_NEON
-
-#ifdef HAS_YUY2TOARGBROW_NEON
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUY2
- YUV422TORGB
- "subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_YUY2TOARGBROW_NEON
-
-#ifdef HAS_UYVYTOARGBROW_NEON
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READUYVY
- YUV422TORGB
- "subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_UYVYTOARGBROW_NEON
-
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-#ifdef HAS_SPLITUVROW_NEON
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n" // store U
- MEMACCESS(2)
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-#endif // HAS_SPLITUVROW_NEON
-
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-#ifdef HAS_MERGEUVROW_NEON
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(2)
- "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "bgt 1b \n"
- :
- "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-#endif // HAS_MERGEUVROW_NEON
-
-// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
-#ifdef HAS_COPYROW_NEON
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- MEMACCESS(1)
- "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif // HAS_COPYROW_NEON
-
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-#ifdef HAS_SETROW_NEON
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
- asm volatile (
- "dup v0.4s, %w2 \n" // duplicate 4 ints
- "1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "cc", "memory", "v0"
- );
-}
-#endif // HAS_SETROW_NEON
-
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-#ifdef HAS_ARGBSETROWS_NEON
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- SetRow_NEON(dst, v32, width << 2);
- dst += dst_stride;
- }
-}
-#endif // HAS_ARGBSETROWS_NEON
-
-#ifdef HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "add %0, %0, %2 \n"
- "sub %0, %0, #16 \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %2, %2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- MEMACCESS(1)
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- MEMACCESS(1)
- "st1 {v0.D}[0], [%1], #8 \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0"
- );
-}
-#endif // HAS_MIRRORROW_NEON
-
-#ifdef HAS_MIRRORUVROW_NEON
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- asm volatile (
- // Start at end of source row.
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, %0, #16 \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %3, %3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1"
- );
-}
-#endif // HAS_MIRRORUVROW_NEON
-
-#ifdef HAS_ARGBMIRRORROW_NEON
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, %0, #16 \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- MEMACCESS(1)
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- MEMACCESS(1)
- "st1 {v0.D}[0], [%1], #8 \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0"
- );
-}
-#endif // HAS_ARGBMIRRORROW_NEON
-
-#ifdef HAS_RGB24TOARGBROW_NEON
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
- asm volatile (
- "movi v4.8b, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-#endif // HAS_RGB24TOARGBROW_NEON
-
-#ifdef HAS_RAWTOARGBROW_NEON
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
- asm volatile (
- "movi v5.8b, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "mov v3.8b, v1.8b \n" // move g
- "mov v4.8b, v0.8b \n" // move r
- MEMACCESS(1)
- "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
-}
-#endif // HAS_RAWTOARGBROW_NEON
-
-#define RGB565TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
- "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
- "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
-#ifdef HAS_RGB565TOARGBROW_NEON
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-#endif // HAS_RGB565TOARGBROW_NEON
-
-#define ARGB1555TOARGB \
- "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
- "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
- "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
- "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
- "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
- "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
- "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
- "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
- "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
- "vorr.u8 q1, q1, q3 \n" /* R,A */ \
- "vorr.u8 q0, q0, q2 \n" /* B,G */ \
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
- "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
- "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
-#ifdef HAS_ARGB1555TOARGBROW_NEON
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
- int pix) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-#endif // HAS_ARGB1555TOARGBROW_NEON
-
-#define ARGB4444TOARGB \
- "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
- "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
- "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
- "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
- "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
- "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
- "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
- "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
-
-#ifdef HAS_ARGB4444TOARGBROW_NEON
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
- int pix) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-#endif // HAS_ARGB4444TOARGBROW_NEON
-
-#ifdef HAS_ARGBTORGB24ROW_NEON
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-#endif // HAS_ARGBTORGB24ROW_NEON
-
-#ifdef HAS_ARGBTORAWROW_NEON
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "mov v4.8b, v2.8b \n" // mov g
- "mov v5.8b, v1.8b \n" // mov b
- MEMACCESS(1)
- "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
-}
-#endif // HAS_ARGBTORAWROW_NEON
-
-#ifdef HAS_YUY2TOYROW_NEON
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-#endif // HAS_YUY2TOYROW_NEON
-
-#ifdef HAS_UYVYTOYROW_NEON
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-#endif // HAS_UYVYTOYROW_NEON
-
-#ifdef HAS_YUY2TOUV422ROW_NEON
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- MEMACCESS(2)
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif // HAS_YUY2TOUV422ROW_NEON
-
-#ifdef HAS_UYVYTOUV422ROW_NEON
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- MEMACCESS(2)
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif // HAS_UYVYTOUV422ROW_NEON
-
-#ifdef HAS_YUY2TOUVROW_NEON
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- MEMACCESS(3)
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(stride_yuy2), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
- );
-}
-#endif // HAS_YUY2TOUVROW_NEON
-
-#ifdef HAS_UYVYTOUVROW_NEON
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- MEMACCESS(2)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- MEMACCESS(3)
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(stride_uyvy), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
- );
-}
-#endif // HAS_UYVYTOUVROW_NEON
-
-#ifdef HAS_HALFROW_NEON
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %x1, %x0, %w1, sxtw \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
- "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(src_uv_stride), // %1
- "+r"(dst_uv), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-#endif // HAS_HALFROW_NEON
-
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-#ifdef HAS_ARGBTOBAYERROW_NEON
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "mov v2.s[0], %w3 \n" // selector
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
- "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
- "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
- MEMACCESS(1)
- "st1 {v4.8b}, [%1], #8 \n" // store 8.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "r"(selector) // %3
- : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
- );
-}
-#endif // HAS_ARGBTOBAYERROW_NEON
-
-// Select G channels from ARGB. e.g. GGGGGGGG
-#ifdef HAS_ARGBTOBAYERGGROW_NEON
-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 /*selector*/, int pix) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif // HAS_ARGBTOBAYERGGROW_NEON
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-#ifdef HAS_ARGBSHUFFLEROW_NEON
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- MEMACCESS(3)
- "ld1 {v2.16b}, [%3] \n" // shuffler
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
-}
-#endif // HAS_ARGBSHUFFLEROW_NEON
-
-#ifdef HAS_I422TOYUY2ROW_NEON
-void I422ToYUY2Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "mov v2.8b, v1.8b \n"
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- MEMACCESS(2)
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
-}
-#endif // HAS_I422TOYUY2ROW_NEON
-
-#ifdef HAS_I422TOUYVYROW_NEON
-void I422ToUYVYRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
- "mov v3.8b, v2.8b \n"
- MEMACCESS(1)
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- MEMACCESS(2)
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
-}
-#endif // HAS_I422TOUYVYROW_NEON
-
-#ifdef HAS_ARGBTORGB565ROW_NEON
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
-}
-#endif // HAS_ARGBTORGB565ROW_NEON
-
-#ifdef HAS_ARGBTOARGB1555ROW_NEON
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
- int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
-}
-#endif // HAS_ARGBTOARGB1555ROW_NEON
-
-#ifdef HAS_ARGBTOARGB4444ROW_NEON
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
- int pix) {
- asm volatile (
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
-}
-#endif // HAS_ARGBTOARGB4444ROW_NEON
-
-#ifdef HAS_ARGBTOYROW_NEON
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
-}
-#endif // HAS_ARGBTOYROW_NEON
-
-#ifdef HAS_ARGBTOYJROW_NEON
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movi v4.8b, #15 \n" // B * 0.11400 coefficient
- "movi v5.8b, #75 \n" // G * 0.58700 coefficient
- "movi v6.8b, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
- );
-}
-#endif // HAS_ARGBTOYJROW_NEON
-
-// 8x1 pixels.
-#ifdef HAS_ARGBTOUV444ROW_NEON
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
-
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
-
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
- "v24", "v25", "v26", "v27", "v28", "v29"
- );
-}
-#endif // HAS_ARGBTOUV444ROW_NEON
-
-// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-#ifdef HAS_ARGBTOUV422ROW_NEON
-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
- "movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
- "movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
- "movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
- "movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
- "movi v25.16b, #0x80 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels.
-
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
-
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "mul v3.8h, v0.8h, v20.8h \n" // B
- "mls v3.8h, v1.8h, v21.8h \n" // G
- "mls v3.8h, v2.8h, v22.8h \n" // R
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
-
- "mul v4.8h, v2.8h, v20.8h \n" // R
- "mls v4.8h, v1.8h, v24.8h \n" // G
- "mls v4.8h, v0.8h, v23.8h \n" // B
- "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
-
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-#endif // HAS_ARGBTOUV422ROW_NEON
-
-// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
-#ifdef HAS_ARGBTOUV411ROW_NEON
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
- "movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
- "movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
- "movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
- "movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
- "movi v25.16b, #0x80 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(0)
- "ld4 {v4.16b-v7.16b}, [%0], #64 \n" // load next 16 ARGB pixels.
- "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
- "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
- "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %3, %3, #32 \n" // 32 processed per loop.
- "mul v3.8h, v0.8h, v20.8h \n" // B
- "mls v3.8h, v1.8h, v21.8h \n" // G
- "mls v3.8h, v2.8h, v22.8h \n" // R
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "mul v4.8h, v2.8h, v20.8h \n" // R
- "mls v4.8h, v1.8h, v24.8h \n" // G
- "mls v4.8h, v0.8h, v23.8h \n" // B
- "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-#endif // HAS_ARGBTOUV411ROW_NEON
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
-
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-#ifdef HAS_ARGBTOUVROW_NEON
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_ARGBTOUVROW_NEON
-
-// TODO(fbarchard): Subsample match C code.
-#ifdef HAS_ARGBTOUVJROW_NEON
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_ARGBTOUVJROW_NEON
-
-#ifdef HAS_BGRATOUVROW_NEON
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q3, q2, q1)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(src_stride_bgra), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_BGRATOUVROW_NEON
-
-#ifdef HAS_ABGRTOUVROW_NEON
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(src_stride_abgr), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_ABGRTOUVROW_NEON
-
-#ifdef HAS_RGBATOUVROW_NEON
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(src_stride_rgba), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_RGBATOUVROW_NEON
-
-#ifdef HAS_RGB24TOUVROW_NEON
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(src_stride_rgb24), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_RGB24TOUVROW_NEON
-
-#ifdef HAS_RAWTOUVROW_NEON
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(src_stride_raw), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_RAWTOUVROW_NEON
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-#ifdef HAS_RGB565TOUVROW_NEON
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_stride_rgb565), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_RGB565TOUVROW_NEON
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-#ifdef HAS_ARGB1555TOUVROW_NEON
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_stride_argb1555), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_ARGB1555TOUVROW_NEON
-
-// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
-#ifdef HAS_ARGB4444TOUVROW_NEON
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_stride_argb4444), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_ARGB4444TOUVROW_NEON
-
-#ifdef HAS_RGB565TOYROW_NEON
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
-}
-#endif // HAS_RGB565TOYROW_NEON
-
-#ifdef HAS_ARGB1555TOYROW_NEON
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
-}
-#endif // HAS_ARGB1555TOYROW_NEON
-
-#ifdef HAS_ARGB4444TOYROW_NEON
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
-}
-#endif // HAS_ARGB4444TOYROW_NEON
-
-#ifdef HAS_BGRATOYROW_NEON
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
-}
-#endif // HAS_BGRATOYROW_NEON
-
-#ifdef HAS_ABGRTOYROW_NEON
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
-}
-#endif // HAS_ABGRTOYROW_NEON
-
-#ifdef HAS_RGBATOYROW_NEON
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
- asm volatile (
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
-}
-#endif // HAS_RGBATOYROW_NEON
-
-#ifdef HAS_RGB24TOYROW_NEON
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
- asm volatile (
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
-}
-#endif // HAS_RGB24TOYROW_NEON
-
-#ifdef HAS_RAWTOYROW_NEON
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
- asm volatile (
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
-}
-#endif // HAS_RAWTOYROW_NEON
-
-// Bilinear filter 16x2 -> 16x1
-#ifdef HAS_INTERPOLATEROW_NEON
-void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
- asm volatile (
- "cmp %4, #0 \n"
- "beq 100f \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %3, %3, #16 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_ptr1), // %2
- "+r"(dst_width), // %3
- "+r"(y1_fraction), // %4
- "+r"(y0_fraction) // %5
- :
- : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
- );
-}
-#endif // HAS_INTERPOLATEROW_NEON
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-#ifdef HAS_ARGBBLENDROW_NEON
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- "subs %3, %3, #8 \n"
- "blt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0.
- MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB.
- "bge 8b \n"
-
- "89: \n"
- "adds %3, %3, #8-1 \n"
- "blt 99f \n"
-
- // Blend 1 pixels.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- MEMACCESS(1)
- "ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- MEMACCESS(2)
- "st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v16", "v17", "v18"
- );
-}
-#endif // HAS_ARGBBLENDROW_NEON
-
-// Attenuate 8 pixels at a time.
-#ifdef HAS_ARGBATTENUATEROW_NEON
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- // Attenuate 8 pixels.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- MEMACCESS(1)
- "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
- );
-}
-#endif // HAS_ARGBATTENUATEROW_NEON
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-#ifdef HAS_ARGBQUANTIZEROW_NEON
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- asm volatile (
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
-
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- MEMACCESS(0)
- "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
- );
-}
-#endif // HAS_ARGBQUANTIZEROW_NEON
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
-// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-#ifdef HAS_ARGBSHADEROW_NEON
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- asm volatile (
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
-
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- MEMACCESS(1)
- "st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
- );
-}
-#endif // HAS_ARGBSHADEROW_NEON
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-#ifdef HAS_ARGBGRAYROW_NEON
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "movi v24.8b, #15 \n" // B * 0.11400 coefficient
- "movi v25.8b, #75 \n" // G * 0.58700 coefficient
- "movi v26.8b, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
- "mov v1.8b, v0.8b \n" // G
- "mov v2.8b, v0.8b \n" // R
- MEMACCESS(1)
- "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
- );
-}
-#endif // HAS_ARGBGRAYROW_NEON
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-
-#ifdef HAS_ARGBSEPIAROW_NEON
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
- asm volatile (
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- MEMACCESS(0)
- "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
- );
-}
-#endif // HAS_ARGBSEPIAROW_NEON
-
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
-// needs to saturate. Consider doing a non-saturating version.
-#ifdef HAS_ARGBCOLORMATRIXROW_NEON
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
- asm volatile (
- MEMACCESS(3)
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- MEMACCESS(1)
- "st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
- "v18", "v19", "v22", "v23", "v24", "v25"
- );
-}
-#endif // HAS_ARGBCOLORMATRIXROW_NEON
-
-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
-}
-#endif // HAS_ARGBMULTIPLYROW_NEON
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-#ifdef HAS_ARGBADDROW_NEON
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
-}
-#endif // HAS_ARGBADDROW_NEON
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-#ifdef HAS_ARGBSUBTRACTROW_NEON
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
-}
-#endif // HAS_ARGBSUBTRACTROW_NEON
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-#ifdef HAS_SOBELROW_NEON
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- asm volatile (
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "mov v1.8b, v0.8b \n"
- "mov v2.8b, v0.8b \n"
- MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
-}
-#endif // HAS_SOBELROW_NEON
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-#ifdef HAS_SOBELTOPLANEROW_NEON
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
- asm volatile (
- // 16 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1"
- );
-}
-#endif // HAS_SOBELTOPLANEROW_NEON
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-#ifdef HAS_SOBELXYROW_NEON
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- asm volatile (
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- MEMACCESS(1)
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
-}
-#endif // HAS_SOBELXYROW_NEON
-
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-#ifdef HAS_SOBELXROW_NEON
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0],%5 \n" // top
- MEMACCESS(0)
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- MEMACCESS(1)
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- MEMACCESS(2)
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- MEMACCESS(2)
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- MEMACCESS(3)
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2), // %5
- "r"(6) // %6
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif // HAS_SOBELXROW_NEON
-
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-#ifdef HAS_SOBELYROW_NEON
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0],%4 \n" // left
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0],%5 \n" // right
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- MEMACCESS(2)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1), // %4
- "r"(6) // %5
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif // HAS_SOBELYROW_NEON
-#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_posix.cc b/src/main/jni/libyuv/source/row_posix.cc
deleted file mode 100644
index 106fda568..000000000
--- a/src/main/jni/libyuv/source/row_posix.cc
+++ /dev/null
@@ -1,6443 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-// Constants for ARGB
-static vec8 kARGBToY = {
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-// JPeg full range.
-static vec8 kARGBToYJ = {
- 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
-#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-
-static vec8 kARGBToU = {
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
-static vec8 kARGBToUJ = {
- 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
-
-static vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-static vec8 kARGBToVJ = {
- -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
-
-// Constants for BGRA
-static vec8 kBGRAToY = {
- 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
-
-static vec8 kBGRAToU = {
- 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
-
-static vec8 kBGRAToV = {
- 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
-
-// Constants for ABGR
-static vec8 kABGRToY = {
- 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
-
-static vec8 kABGRToU = {
- -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
-
-static vec8 kABGRToV = {
- 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
-
-// Constants for RGBA.
-static vec8 kRGBAToY = {
- 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
-
-static vec8 kRGBAToU = {
- 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
-
-static vec8 kRGBAToV = {
- 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
-
-static uvec8 kAddY16 = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
-static vec16 kAddYJ64 = {
- 64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static uvec8 kAddUV128 = {
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-static uvec16 kAddUVJ128 = {
- 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
-#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-
-// Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
-
-// Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
- 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
-
-// Shuffle table for converting ARGB to RGB24.
-static uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
-static uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW_0 = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
-};
-#endif // HAS_RGB24TOARGBROW_SSSE3
-
-#if defined(TESTING) && defined(__x86_64__)
-void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- asm volatile (
- ".p2align 5 \n"
- "mov %%eax,%%eax \n"
- "mov %%ebx,%%ebx \n"
- "mov %%ecx,%%ecx \n"
- "mov %%edx,%%edx \n"
- "mov %%esi,%%esi \n"
- "mov %%edi,%%edi \n"
- "mov %%ebp,%%ebp \n"
- "mov %%esp,%%esp \n"
- ".p2align 5 \n"
- "mov %%r8d,%%r8d \n"
- "mov %%r9d,%%r9d \n"
- "mov %%r10d,%%r10d \n"
- "mov %%r11d,%%r11d \n"
- "mov %%r12d,%%r12d \n"
- "mov %%r13d,%%r13d \n"
- "mov %%r14d,%%r14d \n"
- "mov %%r15d,%%r15d \n"
- ".p2align 5 \n"
- "lea (%%rax),%%eax \n"
- "lea (%%rbx),%%ebx \n"
- "lea (%%rcx),%%ecx \n"
- "lea (%%rdx),%%edx \n"
- "lea (%%rsi),%%esi \n"
- "lea (%%rdi),%%edi \n"
- "lea (%%rbp),%%ebp \n"
- "lea (%%rsp),%%esp \n"
- ".p2align 5 \n"
- "lea (%%r8),%%r8d \n"
- "lea (%%r9),%%r9d \n"
- "lea (%%r10),%%r10d \n"
- "lea (%%r11),%%r11d \n"
- "lea (%%r12),%%r12d \n"
- "lea (%%r13),%%r13d \n"
- "lea (%%r14),%%r14d \n"
- "lea (%%r15),%%r15d \n"
-
- ".p2align 5 \n"
- "lea 0x10(%%rax),%%eax \n"
- "lea 0x10(%%rbx),%%ebx \n"
- "lea 0x10(%%rcx),%%ecx \n"
- "lea 0x10(%%rdx),%%edx \n"
- "lea 0x10(%%rsi),%%esi \n"
- "lea 0x10(%%rdi),%%edi \n"
- "lea 0x10(%%rbp),%%ebp \n"
- "lea 0x10(%%rsp),%%esp \n"
- ".p2align 5 \n"
- "lea 0x10(%%r8),%%r8d \n"
- "lea 0x10(%%r9),%%r9d \n"
- "lea 0x10(%%r10),%%r10d \n"
- "lea 0x10(%%r11),%%r11d \n"
- "lea 0x10(%%r12),%%r12d \n"
- "lea 0x10(%%r13),%%r13d \n"
- "lea 0x10(%%r14),%%r14d \n"
- "lea 0x10(%%r15),%%r15d \n"
-
- ".p2align 5 \n"
- "add 0x10,%%eax \n"
- "add 0x10,%%ebx \n"
- "add 0x10,%%ecx \n"
- "add 0x10,%%edx \n"
- "add 0x10,%%esi \n"
- "add 0x10,%%edi \n"
- "add 0x10,%%ebp \n"
- "add 0x10,%%esp \n"
- ".p2align 5 \n"
- "add 0x10,%%r8d \n"
- "add 0x10,%%r9d \n"
- "add 0x10,%%r10d \n"
- "add 0x10,%%r11d \n"
- "add 0x10,%%r12d \n"
- "add 0x10,%%r13d \n"
- "add 0x10,%%r14d \n"
- "add 0x10,%%r15d \n"
-
- ".p2align 2 \n"
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // TESTING
-
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_I400TOARGBROW_SSE2
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x30,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskRGB24ToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x30,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskRAWToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
- MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "movdqa %3,%%xmm6 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x30,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskARGBToRGB24) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "movdqa %3,%%xmm6 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x30,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskARGBToRAW) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMACCESS2(0x8,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-#endif // HAS_RGB24TOARGBROW_SSSE3
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBTOYROW_SSSE3
-
-#ifdef HAS_ARGBTOYJROW_SSSE3
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBTOYJROW_SSSE3
-
-#ifdef HAS_ARGBTOUVROW_SSSE3
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToUJ), // %0
- "m"(kARGBToVJ), // %1
- "m"(kAddUVJ128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToUJ), // %0
- "m"(kARGBToVJ), // %1
- "m"(kAddUVJ128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb))
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
- );
-}
-
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
- uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
- );
-}
-
-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kRGBAToU), // %0
- "m"(kRGBAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba))
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kRGBAToU), // %0
- "m"(kRGBAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBTOUVROW_SSSE3
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-struct {
- vec8 kUVToB; // 0
- vec8 kUVToG; // 16
- vec8 kUVToR; // 32
- vec16 kUVBiasB; // 48
- vec16 kUVBiasG; // 64
- vec16 kUVBiasR; // 80
- vec16 kYSub16; // 96
- vec16 kYToRgb; // 112
- vec8 kVUToB; // 128
- vec8 kVUToG; // 144
- vec8 kVUToR; // 160
-} static SIMD_ALIGNED(kYuvConstants) = {
- { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR },
- { 16, 16, 16, 16, 16, 16, 16, 16 },
- { YG, YG, YG, YG, YG, YG, YG, YG },
- { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
-};
-
-
-// Read 8 UV from 411
-#define READYUV444 \
- "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
- MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n"
-
-// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422 \
- "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
- MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n"
-
-// Read 2 UV from 411, upsample to 8 UV
-#define READYUV411 \
- "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
- MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "punpckldq %%xmm0,%%xmm0 \n"
-
-// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12 \
- "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n"
-
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \
- "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
- "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
- "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
- "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
-
-// Convert 8 pixels: 8 VU and 8 Y
-#define YVUTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
- "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
- "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
- "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
- "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
-
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgb24,
- int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
- asm volatile (
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
-#endif
-
- asm volatile (
-#if !defined(__i386__)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
-#endif
- "sub %[u_buf],%[v_buf] \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
- "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
- , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-#endif
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_raw,
- int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
- asm volatile (
- "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
- :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
- [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
-#endif
-
- asm volatile (
-#if !defined(__i386__)
- "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
-#endif
- "sub %[u_buf],%[v_buf] \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
- "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_raw]"+r"(dst_raw), // %[dst_raw]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
- , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
- [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
-#endif
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-#endif // HAS_I422TOARGBROW_SSSE3
-
-#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "mov $0x00100010,%%eax \n"
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "mov $0x004a004a,%%eax \n"
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- LABELALIGN
- "1: \n"
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "pmullw %%xmm2,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
-
- // Step 2: Weave into ARGB
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
-
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-#endif // HAS_YTOARGBROW_SSE2
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static uvec8 kShuffleMirror = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "movdqa %3,%%xmm5 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
- LABELALIGN
- "1: \n"
- MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-#endif // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_SSE2
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "lea " MEMLEA(-0x10,0) ",%0 \n"
- LABELALIGN
- "1: \n"
- MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
- "movdqa %%xmm0,%%xmm1 \n"
- "psllw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufd $0x4e,%%xmm0,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1)",%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_MIRRORROW_SSE2
-
-#ifdef HAS_MIRRORROW_UV_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-static uvec8 kShuffleMirrorUV = {
- 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
- int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "movdqa %4,%%xmm1 \n"
- "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "sub $8,%3 \n"
- "movlpd %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_MIRRORROW_UV_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
- "movdqa %3,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kARGBShuffleMirror) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBMIRRORROW_SSSE3
-
-#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_SPLITUVROW_SSE2
-
-#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- asm volatile (
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
- );
-}
-
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
- asm volatile (
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
- );
-}
-#endif // HAS_MERGEUVROW_SSE2
-
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_X86
-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile (
- "shr $0x2,%2 \n"
- "rep movsl " MEMMOVESTRING(0,1) " \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
- :
- : "memory", "cc"
- );
-}
-#endif // HAS_COPYROW_X86
-
-#ifdef HAS_COPYROW_ERMS
-// Unaligned Multiple of 1.
-void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile (
- "rep movsb " MEMMOVESTRING(0,1) " \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
- :
- : "memory", "cc"
- );
-}
-#endif // HAS_COPYROW_ERMS
-
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-// width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa " MEMACCESS(1) ",%%xmm4 \n"
- "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBCOPYALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-// width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1," MEMACCESS(1) " \n"
- "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
- );
-}
-#endif // HAS_ARGBCOPYALPHAROW_AVX2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-// width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm4 \n"
- "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-// width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- LABELALIGN
- "1: \n"
- "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
- "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1," MEMACCESS(1) " \n"
- "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
- );
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
-
-#ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint32 v32, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile (
- "shr $0x2,%1 \n"
- "rep stosl " MEMSTORESTRING(eax,0) " \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
-}
-
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- size_t width_tmp = (size_t)(width);
- uint32* d = (uint32*)(dst);
- asm volatile (
- "rep stosl " MEMSTORESTRING(eax,0) " \n"
- : "+D"(d), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
- dst += dst_stride;
- }
-}
-#endif // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_YUY2TOYROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time.
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x1,%3 \n"
- "je 91f \n"
- "jl 99f \n"
-
- // 1 pixel loop until destination pointer is aligned.
- "10: \n"
- "test $0xf,%2 \n"
- "je 19f \n"
- "movd " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd " MEMACCESS(1) ",%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x4,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x4,2) ",%2 \n"
- "jge 10b \n"
-
- "19: \n"
- "add $1-4,%3 \n"
- "jl 49f \n"
-
- // 4 pixel loop.
- LABELALIGN
- "41: \n"
- "movdqu " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu " MEMACCESS(1) ",%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 41b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
-
- // 1 pixel loop.
- "91: \n"
- "movd " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd " MEMACCESS(1) ",%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x4,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x4,2) ",%2 \n"
- "jge 91b \n"
- "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBBLENDROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {
- 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
-
-// Blend 8 pixels at a time
-// Shuffle table for reversing the bytes.
-
-// Same as SSE2, but replaces
-// psrlw xmm3, 8 // alpha
-// pshufhw xmm3, xmm3,0F5h // 8 alpha words
-// pshuflw xmm3, xmm3,0F5h
-// with..
-// pshufb xmm3, kShuffleAlpha // alpha
-
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x1,%3 \n"
- "je 91f \n"
- "jl 99f \n"
-
- // 1 pixel loop until destination pointer is aligned.
- "10: \n"
- "test $0xf,%2 \n"
- "je 19f \n"
- "movd " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x4,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x4,2) ",%2 \n"
- "jge 10b \n"
-
- "19: \n"
- "add $1-4,%3 \n"
- "jl 49f \n"
- "test $0xf,%0 \n"
- "jne 41f \n"
- "test $0xf,%1 \n"
- "jne 41f \n"
-
- // 4 pixel loop.
- LABELALIGN
- "40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 40b \n"
- "jmp 49f \n"
-
- // 4 pixel unaligned loop.
- LABELALIGN
- "41: \n"
- "movdqu " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 41b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
-
- // 1 pixel loop.
- "91: \n"
- "movd " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x4,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x4,2) ",%2 \n"
- "jge 91b \n"
- "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "m"(kShuffleAlpha) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBBLENDROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x8,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pshufhw $0xff,%%xmm0,%%xmm2 \n"
- "pshuflw $0xff,%%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pshufhw $0xff,%%xmm1,%%xmm2 \n"
- "pshuflw $0xff,%%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "pand %%xmm4,%%xmm2 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
-static uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
-};
-static uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
-};
-// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha0), // %3
- "m"(kShuffleAlpha1) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- uintptr_t alpha = 0;
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movzb " MEMACCESS2(0x03,0) ",%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
- "movzb " MEMACCESS2(0x07,0) ",%3 \n"
- MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
- "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
- MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "+r"(alpha) // %3
- : "r"(fixed_invtbl8) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBGRAYROW_SSSE3
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone
-static vec8 kARGBToSepiaB = {
- 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
-
-static vec8 kARGBToSepiaG = {
- 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
-
-static vec8 kARGBToSepiaR = {
- 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
- asm volatile (
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "sub $0x8,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "m"(kARGBToSepiaB), // %2
- "m"(kARGBToSepiaG), // %3
- "m"(kARGBToSepiaR) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-#endif // HAS_ARGBSEPIAROW_SSSE3
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
- asm volatile (
- "movdqu " MEMACCESS(3) ",%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-// aligned to 16 bytes
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- asm volatile (
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "sub $0x4,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBQUANTIZEROW_SSE2
-
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
- );
-}
-#endif // HAS_ARGBSHADEROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBMULTIPLYROW_SSE2
-
-#ifdef HAS_ARGBADDROW_SSE2
-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_ARGBADDROW_SSE2
-
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_ARGBSUBTRACTROW_SSE2
-
-#ifdef HAS_SOBELXROW_SSE2
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
- asm volatile (
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
- MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
- MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
- MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "sub $0x8,%4 \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_SOBELXROW_SSE2
-
-#ifdef HAS_SOBELYROW_SSE2
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
- asm volatile (
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
- "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
- MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
- "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
- MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "sub $0x8,%3 \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_SOBELYROW_SSE2
-
-#ifdef HAS_SOBELROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm1," MEMACCESS(2) " \n"
- "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_SOBELROW_SSE2
-
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_SOBELTOPLANEROW_SSE2
-
-#ifdef HAS_SOBELXYROW_SSE2
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm6," MEMACCESS(2) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_SOBELXYROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
- asm volatile (
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
-
- // 4 pixel loop \n"
- LABELALIGN
- "40: \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(2) ",%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
- "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop \n"
- LABELALIGN
- "10: \n"
- "movd " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu " MEMACCESS(2) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
-
- "19: \n"
- : "+r"(row), // %0
- "+r"(cumsum), // %1
- "+r"(previous_cumsum), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst,
- int count) {
- asm volatile (
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
-
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
-
- // 4 pixel small loop \n"
- LABELALIGN
- "4: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- BUNDLEALIGN
- MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
- MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
- MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
- MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "psubd " MEMACCESS(1) ",%%xmm0 \n"
- "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
- "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
- "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- BUNDLEALIGN
- MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
- MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
- MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
- MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
-
- // 4 pixel loop \n"
- LABELALIGN
- "40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- BUNDLEALIGN
- MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
- MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
- MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
- MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "psubd " MEMACCESS(1) ",%%xmm0 \n"
- "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
- "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
- "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- BUNDLEALIGN
- MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
- MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
- MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
- MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop \n"
- LABELALIGN
- "10: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "psubd " MEMACCESS(1) ",%%xmm0 \n"
- BUNDLEALIGN
- MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x4,2) ",%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(topleft), // %0
- "+r"(botleft), // %1
- "+r"(dst), // %2
- "+rm"(count) // %3
- : "r"((intptr_t)(width)), // %4
- "rm"(area) // %5
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// Copy ARGB pixels from source image with slope to a row of destination.
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* src_dudv, int width) {
- intptr_t src_argb_stride_temp = src_argb_stride;
- intptr_t temp = 0;
- asm volatile (
- "movq " MEMACCESS(3) ",%%xmm2 \n"
- "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
-
- // 4 pixel loop \n"
- LABELALIGN
- "40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- BUNDLEALIGN
- MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
- MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1," MEMACCESS(2) " \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- BUNDLEALIGN
- MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
- MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "sub $0x4,%4 \n"
- "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
-
- // 1 pixel loop \n"
- LABELALIGN
- "10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- BUNDLEALIGN
- MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
- "sub $0x1,%4 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x04,2) ",%2 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_stride_temp), // %1
- "+r"(dst_argb), // %2
- "+r"(src_dudv), // %3
- "+rm"(width), // %4
- "+r"(temp) // %5
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBAFFINEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm2)
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm0)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "psubw %%xmm0,%%xmm2 \n"
- "psubw %%xmm1,%%xmm3 \n"
- "paddw %%xmm2,%%xmm2 \n"
- "paddw %%xmm3,%%xmm3 \n"
- "pmulhw %%xmm5,%%xmm2 \n"
- "pmulhw %%xmm5,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm2)
- "movdqu %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm0)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "psubw %%xmm0,%%xmm2 \n"
- "psubw %%xmm1,%%xmm3 \n"
- "paddw %%xmm2,%%xmm2 \n"
- "paddw %%xmm3,%%xmm3 \n"
- "pmulhw %%xmm5,%%xmm2 \n"
- "pmulhw %%xmm5,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-#ifdef HAS_HALFROW_SSE2
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_uv), // %1
- "+r"(pix) // %2
- : "r"((intptr_t)(src_uv_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0"
-#endif
- );
-}
-#endif // HAS_HALFROW_SSE2
-
-#ifdef HAS_ARGBTOBAYERROW_SSSE3
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- // NaCL caveat - assumes movd is from GPR
- "movd %3,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "g"(selector) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBTOBAYERROW_SSSE3
-
-#ifdef HAS_ARGBTOBAYERGGROW_SSE2
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrld $0x8,%%xmm0 \n"
- "psrld $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBTOBAYERGGROW_SSE2
-
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBSHUFFLEROW_SSSE3
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "sub $0x10,%2 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBSHUFFLEROW_AVX2
-
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- uintptr_t pixel_temp = 0u;
- asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
- "mov " MEMACCESS(4) ",%k2 \n"
- "cmp $0x3000102,%k2 \n"
- "je 3012f \n"
- "cmp $0x10203,%k2 \n"
- "je 123f \n"
- "cmp $0x30201,%k2 \n"
- "je 321f \n"
- "cmp $0x2010003,%k2 \n"
- "je 2103f \n"
-
- LABELALIGN
- "1: \n"
- "movzb " MEMACCESS(4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS(1) " \n"
- "movzb " MEMACCESS2(0x1,4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS2(0x1,1) " \n"
- BUNDLEALIGN
- "movzb " MEMACCESS2(0x2,4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS2(0x2,1) " \n"
- "movzb " MEMACCESS2(0x3,4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS2(0x3,1) " \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "lea " MEMLEA(0x4,1) ",%1 \n"
- "sub $0x1,%3 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "123: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
- "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
- "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 123b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "321: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0x39,%%xmm0,%%xmm0 \n"
- "pshuflw $0x39,%%xmm0,%%xmm0 \n"
- "pshufhw $0x39,%%xmm1,%%xmm1 \n"
- "pshuflw $0x39,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 321b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "2103: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0x93,%%xmm0,%%xmm0 \n"
- "pshuflw $0x93,%%xmm0,%%xmm0 \n"
- "pshufhw $0x93,%%xmm1,%%xmm1 \n"
- "pshuflw $0x93,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 2103b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "3012: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
- "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
- "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
- "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 3012b \n"
-
- "99: \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+d"(pixel_temp), // %2
- "+r"(pix) // %3
- : "r"(shuffler) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBSHUFFLEROW_SSE2
-
-#ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(1) ",%%xmm2 \n"
- MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(3) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
- "lea " MEMLEA(0x20,3) ",%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_frame), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-#endif // HAS_I422TOYUY2ROW_SSE2
-
-#ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(1) ",%%xmm2 \n"
- MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1," MEMACCESS(3) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
- "lea " MEMLEA(0x20,3) ",%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_frame), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-#endif // HAS_I422TOUYVYROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- asm volatile (
- "pxor %%xmm3,%%xmm3 \n"
-
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
- "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
- "addps " MEMACCESS(3) ",%%xmm0 \n"
- "addps " MEMACCESS(3) ",%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
- "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
- "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
- "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "sub $0x2,%2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-#endif // HAS_ARGBPOLYNOMIALROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- asm volatile (
- "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
- "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
- "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
- "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
-
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
- "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
- "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
- "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
- "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
- "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
- "vcvttps2dq %%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
- "sub $0x2,%2 \n"
- "vmovq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
-// TODO(fbarchard): declare ymm usage when applicable.
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBPOLYNOMIALROW_AVX2
-
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
- int width) {
- uintptr_t pixel_temp = 0u;
- asm volatile (
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb " MEMACCESS(0) ",%1 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x4,0) " \n"
- "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
- MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x3,0) " \n"
- "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
- MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x2,0) " \n"
- "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
- MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x1,0) " \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
-}
-#endif // HAS_ARGBCOLORTABLEROW_X86
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Tranform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
- uintptr_t pixel_temp = 0u;
- asm volatile (
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb " MEMACCESS(0) ",%1 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x4,0) " \n"
- "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
- MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x3,0) " \n"
- "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
- MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x2,0) " \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
-}
-#endif // HAS_RGBCOLORTABLEROW_X86
-
-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// Tranform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- int width,
- const uint8* luma, uint32 lumacoeff) {
- uintptr_t pixel_temp = 0u;
- uintptr_t table_temp = 0u;
- asm volatile (
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(2) ",%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb " MEMACCESS(2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS(3) " \n"
- "movzb " MEMACCESS2(0x1,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x1,3) " \n"
- "movzb " MEMACCESS2(0x2,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x2,3) " \n"
- "movzb " MEMACCESS2(0x3,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0x3,3) " \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb " MEMACCESS2(0x4,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x4,3) " \n"
- BUNDLEALIGN
- "movzb " MEMACCESS2(0x5,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x5,3) " \n"
- "movzb " MEMACCESS2(0x6,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x6,3) " \n"
- "movzb " MEMACCESS2(0x7,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0x7,3) " \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb " MEMACCESS2(0x8,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x8,3) " \n"
- "movzb " MEMACCESS2(0x9,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x9,3) " \n"
- "movzb " MEMACCESS2(0xa,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xa,3) " \n"
- "movzb " MEMACCESS2(0xb,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0xb,3) " \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
-
- "movzb " MEMACCESS2(0xc,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xc,3) " \n"
- "movzb " MEMACCESS2(0xd,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xd,3) " \n"
- "movzb " MEMACCESS2(0xe,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xe,3) " \n"
- "movzb " MEMACCESS2(0xf,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0xf,3) " \n"
- "sub $0x4,%4 \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "lea " MEMLEA(0x10,3) ",%3 \n"
- "jg 1b \n"
- : "+d"(pixel_temp), // %0
- "+a"(table_temp), // %1
- "+r"(src_argb), // %2
- "+r"(dst_argb), // %3
- "+rm"(width) // %4
- : "r"(luma), // %5
- "rm"(lumacoeff) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_win.cc b/src/main/jni/libyuv/source/row_win.cc
deleted file mode 100644
index f58fc5138..000000000
--- a/src/main/jni/libyuv/source/row_win.cc
+++ /dev/null
@@ -1,7402 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
-#include <emmintrin.h>
-#include <tmmintrin.h> // For _mm_maddubs_epi16
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(127,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-static const vec8 kUVToB = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-static const vec8 kVUToB = {
- VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
-
-static const vec8 kVUToR = {
- VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
-
-static const vec8 kVUToG = {
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
-// 64 bit
-#if defined(_M_X64)
-
-// Aligned destination version.
-__declspec(align(16))
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const __m128i xmm4 = _mm_setzero_si128();
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-
- while (width > 0) {
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
- xmm1 = _mm_load_si128(&xmm0);
- xmm2 = _mm_load_si128(&xmm0);
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
- xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
- xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
- xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
- xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
- xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
- xmm0 = _mm_adds_epi16(xmm0, xmm3);
- xmm1 = _mm_adds_epi16(xmm1, xmm3);
- xmm2 = _mm_adds_epi16(xmm2, xmm3);
- xmm0 = _mm_srai_epi16(xmm0, 6);
- xmm1 = _mm_srai_epi16(xmm1, 6);
- xmm2 = _mm_srai_epi16(xmm2, 6);
- xmm0 = _mm_packus_epi16(xmm0, xmm0);
- xmm1 = _mm_packus_epi16(xmm1, xmm1);
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
- xmm1 = _mm_load_si128(&xmm0);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
-
- _mm_store_si128((__m128i *)dst_argb, xmm0);
- _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
-
- y_buf += 8;
- u_buf += 4;
- dst_argb += 32;
- width -= 8;
- }
-}
-
-// Unaligned destination version.
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const __m128i xmm4 = _mm_setzero_si128();
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-
- while (width > 0) {
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
- xmm1 = _mm_load_si128(&xmm0);
- xmm2 = _mm_load_si128(&xmm0);
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
- xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
- xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
- xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
- xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
- xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
- xmm0 = _mm_adds_epi16(xmm0, xmm3);
- xmm1 = _mm_adds_epi16(xmm1, xmm3);
- xmm2 = _mm_adds_epi16(xmm2, xmm3);
- xmm0 = _mm_srai_epi16(xmm0, 6);
- xmm1 = _mm_srai_epi16(xmm1, 6);
- xmm2 = _mm_srai_epi16(xmm2, 6);
- xmm0 = _mm_packus_epi16(xmm0, xmm0);
- xmm1 = _mm_packus_epi16(xmm1, xmm1);
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
- xmm1 = _mm_load_si128(&xmm0);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
-
- _mm_storeu_si128((__m128i *)dst_argb, xmm0);
- _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
-
- y_buf += 8;
- u_buf += 4;
- dst_argb += 32;
- width -= 8;
- }
-}
-// 32 bit
-#else // defined(_M_X64)
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-
-// Constants for ARGB.
-static const vec8 kARGBToY = {
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-// JPeg full range.
-static const vec8 kARGBToYJ = {
- 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
-
-static const vec8 kARGBToU = {
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
-static const vec8 kARGBToUJ = {
- 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
-
-static const vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-static const vec8 kARGBToVJ = {
- -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
-
-// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
- 0, 4, 1, 5, 2, 6, 3, 7
-};
-
-// vpshufb for vphaddw + vpackuswb packed to shorts.
-static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-};
-
-// Constants for BGRA.
-static const vec8 kBGRAToY = {
- 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
-
-static const vec8 kBGRAToU = {
- 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
-
-static const vec8 kBGRAToV = {
- 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
-
-// Constants for ABGR.
-static const vec8 kABGRToY = {
- 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
-
-static const vec8 kABGRToU = {
- -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
-
-static const vec8 kABGRToV = {
- 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
-
-// Constants for RGBA.
-static const vec8 kRGBAToY = {
- 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
-
-static const vec8 kRGBAToU = {
- 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
-
-static const vec8 kRGBAToV = {
- 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
-
-static const uvec8 kAddY16 = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
-static const vec16 kAddYJ64 = {
- 64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static const uvec8 kAddUV128 = {
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-static const uvec16 kAddUVJ128 = {
- 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
-
-// Shuffle table for converting RGB24 to ARGB.
-static const uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
-
-// Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
- 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
-
-// Shuffle table for converting ARGB to RGB24.
-static const uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static const uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
-static const uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static const uvec8 kShuffleMaskARGBToRAW_0 = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
-};
-
-// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
-
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
-
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_rgb24
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, kShuffleMaskRGB24ToARGB
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqa [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
- por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, kShuffleMaskRAWToARGB
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqa [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
- por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
- jg convertloop
- ret
- }
-}
-
-// pmul method to replicate bits.
-// Math to replicate bits:
-// (v << 8) | (v << 3)
-// v * 256 + v * 8
-// v * (256 + 8)
-// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-// 20 instructions.
-__declspec(naked) __declspec(align(16))
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
- psllw xmm4, 10
- psrlw xmm4, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
-
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- sub edx, eax
- sub edx, eax
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgr565
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- pand xmm1, xmm3 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pmulhuw xmm1, xmm5 // * (256 + 8)
- pmulhuw xmm2, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- pand xmm0, xmm4 // G in middle 6 bits
- pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
- por xmm0, xmm7 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-// 24 instructions
-__declspec(naked) __declspec(align(16))
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
- psrlw xmm4, 6
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
-
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- sub edx, eax
- sub edx, eax
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of 1555
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- psllw xmm1, 1 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pand xmm1, xmm3
- pmulhuw xmm2, xmm5 // * (256 + 8)
- pmulhuw xmm1, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // G in middle 5 bits
- psraw xmm2, 8 // A
- pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
- pand xmm2, xmm7
- por xmm0, xmm2 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-// 18 instructions.
-__declspec(naked) __declspec(align(16))
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
- movd xmm4, eax
- pshufd xmm4, xmm4, 0
- movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
- pslld xmm5, 4
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- sub edx, eax
- sub edx, eax
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // mask low nibbles
- pand xmm2, xmm5 // mask high nibbles
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- psllw xmm1, 4
- psrlw xmm3, 4
- por xmm0, xmm1
- por xmm2, xmm3
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
- movdqa xmm6, kShuffleMaskARGBToRGB24
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
- movdqa xmm6, kShuffleMaskARGBToRAW
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
- psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
- psrld xmm4, 26
- pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
- pslld xmm5, 11
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-
-// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
- psrld xmm4, 27
- movdqa xmm5, xmm4 // generate mask 0x000003e0
- pslld xmm5, 5
- movdqa xmm6, xmm4 // generate mask 0x00007c00
- pslld xmm6, 10
- pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
- pslld xmm7, 15
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- movdqa xmm3, xmm0 // R
- psrad xmm0, 16 // A
- psrld xmm1, 3 // B
- psrld xmm2, 6 // G
- psrld xmm3, 9 // R
- pand xmm0, xmm7 // A
- pand xmm1, xmm4 // B
- pand xmm2, xmm5 // G
- pand xmm3, xmm6 // R
- por xmm0, xmm1 // BA
- por xmm2, xmm3 // GR
- por xmm0, xmm2 // BGRA
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
- psllw xmm4, 12
- movdqa xmm3, xmm4 // generate mask 0x00f000f0
- psrlw xmm3, 8
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0
- pand xmm0, xmm3 // low nibble
- pand xmm1, xmm4 // high nibble
- psrl xmm0, 4
- psrl xmm1, 8
- por xmm0, xmm1
- packuswb xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kARGBToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm4, kARGBToYJ
- movdqa xmm5, kAddYJ64
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5 // Add .5 for rounding.
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-#ifdef HAS_ARGBTOYROW_AVX2
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- vbroadcastf128 ymm4, kARGBToY
- vbroadcastf128 ymm5, kAddY16
- vmovdqa ymm6, kPermdARGBToY_AVX
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vpaddb ymm0, ymm0, ymm5
- sub ecx, 32
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOYROW_AVX2
-
-#ifdef HAS_ARGBTOYROW_AVX2
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- vbroadcastf128 ymm4, kARGBToYJ
- vbroadcastf128 ymm5, kAddYJ64
- vmovdqa ymm6, kPermdARGBToY_AVX
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
- vpaddw ymm2, ymm2, ymm5
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- sub ecx, 32
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOYJROW_AVX2
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kARGBToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm4, kARGBToYJ
- movdqa xmm5, kAddYJ64
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kBGRAToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kBGRAToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kABGRToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kABGRToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kRGBAToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kRGBAToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToUJ
- movdqa xmm6, kARGBToVJ
- movdqa xmm5, kAddUVJ128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) __declspec(align(32))
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- vbroadcastf128 ymm5, kAddUV128
- vbroadcastf128 ymm6, kARGBToV
- vbroadcastf128 ymm7, kARGBToU
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
- vpaddb ymm0, ymm0, ymm5 // -> unsigned
-
- // step 3 - store 16 U and 16 V values
- sub ecx, 32
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOUVROW_AVX2
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToUJ
- movdqa xmm6, kARGBToVJ
- movdqa xmm5, kAddUVJ128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* convert to U and V */
- movdqa xmm0, [eax] // U
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
-
- movdqa xmm0, [eax] // V
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqa [edx + edi], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* convert to U and V */
- movdqu xmm0, [eax] // U
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
-
- movdqu xmm0, [eax] // V
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqu [edx + edi], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBTOYROW_SSSE3
-
-#ifdef HAS_I422TOARGBROW_AVX2
-
-static const lvec8 kUVToB_AVX = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-static const lvec8 kUVToR_AVX = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-static const lvec8 kUVToG_AVX = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-static const lvec16 kYToRgb_AVX = {
- YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
-};
-static const lvec16 kYSub16_AVX = {
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-};
-static const lvec16 kUVBiasB_AVX = {
- BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
-};
-static const lvec16 kUVBiasG_AVX = {
- BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
-};
-static const lvec16 kUVBiasR_AVX = {
- BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
-};
-
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- vpxor ymm4, ymm4, ymm4
-
- align 4
- convertloop:
- vmovq xmm0, qword ptr [esi] // U
- vmovq xmm1, qword ptr [esi + edi] // V
- lea esi, [esi + 8]
- vpunpcklbw ymm0, ymm0, ymm1 // UV
- vpermq ymm0, ymm0, 0xd8
- vpunpcklwd ymm0, ymm0, ymm0 // UVUV
- vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
- vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
- vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
- vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
- vpsubw ymm1, ymm1, kUVBiasG_AVX
- vpsubw ymm0, ymm0, kUVBiasR_AVX
-
- // Step 2: Find Y contribution to 16 R,G,B values
- vmovdqu xmm3, [eax] // NOLINT
- lea eax, [eax + 16]
- vpermq ymm3, ymm3, 0xd8
- vpunpcklbw ymm3, ymm3, ymm4
- vpsubsw ymm3, ymm3, kYSub16_AVX
- vpmullw ymm3, ymm3, kYToRgb_AVX
- vpaddsw ymm2, ymm2, ymm3 // B += Y
- vpaddsw ymm1, ymm1, ymm3 // G += Y
- vpaddsw ymm0, ymm0, ymm3 // R += Y
- vpsraw ymm2, ymm2, 6
- vpsraw ymm1, ymm1, 6
- vpsraw ymm0, ymm0, 6
- vpackuswb ymm2, ymm2, ymm2 // B
- vpackuswb ymm1, ymm1, ymm1 // G
- vpackuswb ymm0, ymm0, ymm0 // R
-
- // Step 3: Weave into ARGB
- vpunpcklbw ymm2, ymm2, ymm1 // BG
- vpermq ymm2, ymm2, 0xd8
- vpunpcklbw ymm0, ymm0, ymm5 // RA
- vpermq ymm0, ymm0, 0xd8
- vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
- vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_I422TOARGBROW_AVX2
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-
-// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
-
-// Read 8 UV from 444.
-#define READYUV444 __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
- __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- }
-
-// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- }
-
-// Read 2 UV from 411, upsample to 8 UV.
-#define READYUV411 __asm { \
- __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
- __asm movd xmm0, ebx \
- __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
- __asm movd xmm1, ebx \
- __asm lea esi, [esi + 2] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
- }
-
-// Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- }
-
-// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB __asm { \
- /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
- /* Step 2: Find Y contribution to 8 R,G,B values */ \
- __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
- __asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
- __asm paddsw xmm0, xmm3 /* B += Y */ \
- __asm paddsw xmm1, xmm3 /* G += Y */ \
- __asm paddsw xmm2, xmm3 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
- }
-
-// Convert 8 pixels: 8 VU and 8 Y.
-#define YVUTORGB __asm { \
- /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
- /* Step 2: Find Y contribution to 8 R,G,B values */ \
- __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
- __asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
- __asm paddsw xmm0, xmm3 /* B += Y */ \
- __asm paddsw xmm1, xmm3 /* G += Y */ \
- __asm paddsw xmm2, xmm3 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
- }
-
-// 8 pixels, dest aligned 16.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV444
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgb24,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgb24
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
- movdqa xmm5, kShuffleMaskARGBToRGB24_0
- movdqa xmm6, kShuffleMaskARGBToRGB24
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
- pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
- pshufb xmm1, xmm6 // Pack into first 12 bytes.
- palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
- movq qword ptr [edx], xmm0 // First 8 bytes
- movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
- lea edx, [edx + 24]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToRAWRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_raw,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // raw
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
- movdqa xmm5, kShuffleMaskARGBToRAW_0
- movdqa xmm6, kShuffleMaskARGBToRAW
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
- pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
- pshufb xmm1, xmm6 // Pack into first 12 bytes.
- palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
- movq qword ptr [edx], xmm0 // First 8 bytes
- movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
- lea edx, [edx + 24]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb565_buf,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgb565
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
- pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
- psrld xmm5, 27
- pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
- psrld xmm6, 26
- pslld xmm6, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
- pslld xmm7, 11
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
-
- // Step 3b: RRGB -> RGB565
- movdqa xmm3, xmm0 // B first 4 pixels of argb
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm3, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm3, xmm5 // B
- pand xmm2, xmm6 // G
- pand xmm0, xmm7 // R
- por xmm3, xmm2 // BG
- por xmm0, xmm3 // BGR
- movdqa xmm3, xmm1 // B next 4 pixels of argb
- movdqa xmm2, xmm1 // G
- pslld xmm1, 8 // R
- psrld xmm3, 3 // B
- psrld xmm2, 5 // G
- psrad xmm1, 16 // R
- pand xmm3, xmm5 // B
- pand xmm2, xmm6 // G
- pand xmm1, xmm7 // R
- por xmm3, xmm2 // BG
- por xmm1, xmm3 // BGR
- packssdw xmm0, xmm1
- sub ecx, 8
- movdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ecx, [esp + 12 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV411 // modifies EBX
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // UV
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV444
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ecx, [esp + 12 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV411 // modifies EBX
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // UV
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // bgra
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // bgra
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // abgr
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // abgr
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgba
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgba
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#endif // HAS_I422TOARGBROW_SSSE3
-
-#ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked) __declspec(align(16))
-void YToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- __asm {
- pxor xmm5, xmm5
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- mov eax, 0x00100010
- movd xmm3, eax
- pshufd xmm3, xmm3, 0
- mov eax, 0x004a004a // 74
- movd xmm2, eax
- pshufd xmm2, xmm2,0
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
-
- align 4
- convertloop:
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm5 // 0.Y
- psubusw xmm0, xmm3
- pmullw xmm0, xmm2
- psrlw xmm0, 6
- packuswb xmm0, xmm0 // G
-
- // Step 2: Weave into ARGB
- punpcklbw xmm0, xmm0 // GG
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0 // BGRA first 4 pixels
- punpckhwd xmm1, xmm1 // BGRA next 4 pixels
- por xmm0, xmm4
- por xmm1, xmm4
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_YTOARGBROW_SSE2
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-__declspec(naked) __declspec(align(16))
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- movdqa xmm5, kShuffleMirror
- lea eax, [eax - 16]
-
- align 4
- convertloop:
- movdqa xmm0, [eax + ecx]
- pshufb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-#endif // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec8 kShuffleMirror_AVX2 = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-__declspec(naked) __declspec(align(16))
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vmovdqa ymm5, kShuffleMirror_AVX2
- lea eax, [eax - 32]
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax + ecx]
- vpshufb ymm0, ymm0, ymm5
- vpermq ymm0, ymm0, 0x4e // swap high and low halfs
- sub ecx, 32
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_MIRRORROW_AVX2
-
-#ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
-__declspec(naked) __declspec(align(16))
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- lea eax, [eax - 16]
-
- align 4
- convertloop:
- movdqu xmm0, [eax + ecx]
- movdqa xmm1, xmm0 // swap bytes
- psllw xmm0, 8
- psrlw xmm1, 8
- por xmm0, xmm1
- pshuflw xmm0, xmm0, 0x1b // swap words
- pshufhw xmm0, xmm0, 0x1b
- pshufd xmm0, xmm0, 0x4e // swap qwords
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-#endif // HAS_MIRRORROW_SSE2
-
-#ifdef HAS_MIRRORROW_UV_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {
- 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-
-__declspec(naked) __declspec(align(16))
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm1, kShuffleMirrorUV
- lea eax, [eax + ecx * 2 - 16]
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm1
- sub ecx, 8
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [edx + edi], xmm0
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- ret
- }
-}
-#endif // HAS_MIRRORROW_UV_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
-__declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
- movdqa xmm5, kARGBShuffleMirror
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBMIRRORROW_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-__declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- lea eax, [eax - 32]
- vmovdqa ymm5, kARGBShuffleMirror_AVX2
-
- align 4
- convertloop:
- vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
- sub ecx, 8
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBMIRRORROW_AVX2
-
-#ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqa [edx], xmm0
- movdqa [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [edx], xmm0
- movdqu [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-#endif // HAS_SPLITUVROW_SSE2
-
-#ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm2, ymm0, 8 // odd bytes
- vpsrlw ymm3, ymm1, 8
- vpand ymm0, ymm0, ymm5 // even bytes
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1
- vpackuswb ymm2, ymm2, ymm3
- vpermq ymm0, ymm0, 0xd8
- vpermq ymm2, ymm2, 0xd8
- vmovdqu [edx], ymm0
- vmovdqu [edx + edi], ymm2
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_SPLITUVROW_AVX2
-
-#ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 16 U's
- movdqa xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // read 16 U's
- movdqu xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-#endif // HAS_MERGEUVROW_SSE2
-
-#ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax] // read 32 U's
- vmovdqu ymm1, [eax + edx] // and 32 V's
- lea eax, [eax + 32]
- vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
- vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
- vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
- vmovdqu [edi], ymm1
- vmovdqu [edi + 32], ymm2
- lea edi, [edi + 64]
- sub ecx, 32
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_MERGEUVROW_AVX2
-
-#ifdef HAS_COPYROW_SSE2
-// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) __declspec(align(16))
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- ret
- }
-}
-#endif // HAS_COPYROW_SSE2
-
-// Unaligned Multiple of 1.
-__declspec(naked) __declspec(align(16))
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
- __asm {
- mov eax, esi
- mov edx, edi
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- rep movsb
- mov edi, edx
- mov esi, eax
- ret
- }
-}
-
-#ifdef HAS_COPYROW_X86
-__declspec(naked) __declspec(align(16))
-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
- __asm {
- mov eax, esi
- mov edx, edi
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- shr ecx, 2
- rep movsd
- mov edi, edx
- mov esi, eax
- ret
- }
-}
-#endif // HAS_COPYROW_X86
-
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-// width in pixels
-__declspec(naked) __declspec(align(16))
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
-
- align 4
- convertloop:
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBCOPYALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-// width in pixels
-__declspec(naked) __declspec(align(16))
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
-
- align 4
- convertloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + 32]
- lea eax, [eax + 64]
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBCOPYALPHAROW_AVX2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-// width in pixels
-__declspec(naked) __declspec(align(16))
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
-
- align 4
- convertloop:
- movq xmm2, qword ptr [eax] // 8 Y's
- lea eax, [eax + 8]
- punpcklbw xmm2, xmm2
- punpckhwd xmm3, xmm2
- punpcklwd xmm2, xmm2
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-// width in pixels
-__declspec(naked) __declspec(align(16))
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
-
- align 4
- convertloop:
- vpmovzxbd ymm1, qword ptr [eax]
- vpmovzxbd ymm2, qword ptr [eax + 8]
- lea eax, [eax + 16]
- vpslld ymm1, ymm1, 24
- vpslld ymm2, ymm2, 24
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
-
-#ifdef HAS_SETROW_X86
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void SetRow_X86(uint8* dst, uint32 v32, int count) {
- __asm {
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
- mov ecx, [esp + 12] // count
- shr ecx, 2
- rep stosd
- mov edi, edx
- ret
- }
-}
-
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- __asm {
- push esi
- push edi
- push ebp
- mov edi, [esp + 12 + 4] // dst
- mov eax, [esp + 12 + 8] // v32
- mov ebp, [esp + 12 + 12] // width
- mov edx, [esp + 12 + 16] // dst_stride
- mov esi, [esp + 12 + 20] // height
- lea ecx, [ebp * 4]
- sub edx, ecx // stride - width * 4
-
- align 4
- convertloop:
- mov ecx, ebp
- rep stosd
- add edi, edx
- sub esi, 1
- jg convertloop
-
- pop ebp
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_AVX2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // even bytes are Y
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- sub ecx, 32
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- jg convertloop
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_AVX2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // odd bytes are Y
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- sub ecx, 32
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- jg convertloop
- ret
- vzeroupper
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_YUY2TOYROW_AVX2
-
-#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-#endif // HAS_YUY2TOYROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 1
- psrlw xmm7, 15
- pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
- psrlw xmm6, 8
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
-
- sub ecx, 1
- je convertloop1 // only 1 pixel?
- jl convertloop1b
-
- // 1 pixel loop until destination pointer is aligned.
- alignloop1:
- test edx, 15 // aligned?
- je alignloop1b
- movd xmm3, [eax]
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- psrlw xmm3, 8 // alpha
- pshufhw xmm3, xmm3, 0F5h // 8 alpha words
- pshuflw xmm3, xmm3, 0F5h
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge alignloop1
-
- alignloop1b:
- add ecx, 1 - 4
- jl convertloop4b
-
- // 4 pixel loop.
- convertloop4:
- movdqu xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqu xmm2, [esi] // _r_b
- psrlw xmm3, 8 // alpha
- pshufhw xmm3, xmm3, 0F5h // 8 alpha words
- pshuflw xmm3, xmm3, 0F5h
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqu xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jge convertloop4
-
- convertloop4b:
- add ecx, 4 - 1
- jl convertloop1b
-
- // 1 pixel loop.
- convertloop1:
- movd xmm3, [eax] // src argb
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- psrlw xmm3, 8 // alpha
- pshufhw xmm3, xmm3, 0F5h // 8 alpha words
- pshuflw xmm3, xmm3, 0F5h
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge convertloop1
-
- convertloop1b:
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBBLENDROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {
- 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
-// Same as SSE2, but replaces:
-// psrlw xmm3, 8 // alpha
-// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
-// pshuflw xmm3, xmm3, 0F5h
-// with..
-// pshufb xmm3, kShuffleAlpha // alpha
-// Blend 8 pixels at a time.
-
-__declspec(naked) __declspec(align(16))
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 0x0001
- psrlw xmm7, 15
- pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
- psrlw xmm6, 8
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
-
- sub ecx, 1
- je convertloop1 // only 1 pixel?
- jl convertloop1b
-
- // 1 pixel loop until destination pointer is aligned.
- alignloop1:
- test edx, 15 // aligned?
- je alignloop1b
- movd xmm3, [eax]
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge alignloop1
-
- alignloop1b:
- add ecx, 1 - 4
- jl convertloop4b
-
- test eax, 15 // unaligned?
- jne convertuloop4
- test esi, 15 // unaligned?
- jne convertuloop4
-
- // 4 pixel loop.
- convertloop4:
- movdqa xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqa xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqa xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jge convertloop4
- jmp convertloop4b
-
- // 4 pixel unaligned loop.
- convertuloop4:
- movdqu xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqu xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqu xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jge convertuloop4
-
- convertloop4b:
- add ecx, 4 - 1
- jl convertloop1b
-
- // 1 pixel loop.
- convertloop1:
- movd xmm3, [eax] // src argb
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge convertloop1
-
- convertloop1b:
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBBLENDROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-// Attenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
- psrld xmm5, 8
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm0 // first 2
- pshufhw xmm2, xmm0, 0FFh // 8 alpha words
- pshuflw xmm2, xmm2, 0FFh
- pmulhuw xmm0, xmm2 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm1 // next 2 pixels
- pshufhw xmm2, xmm1, 0FFh // 8 alpha words
- pshuflw xmm2, xmm2, 0FFh
- pmulhuw xmm1, xmm2 // rgb * a
- movdqa xmm2, [eax] // alphas
- lea eax, [eax + 16]
- psrlw xmm0, 8
- pand xmm2, xmm4
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- pand xmm0, xmm5 // keep original alphas
- por xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
-};
-static const uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
-};
-__declspec(naked) __declspec(align(16))
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0xff000000
- pslld xmm3, 24
- movdqa xmm4, kShuffleAlpha0
- movdqa xmm5, kShuffleAlpha1
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- pshufb xmm0, xmm4 // isolate first 2 alphas
- movdqu xmm1, [eax] // read 4 pixels
- punpcklbw xmm1, xmm1 // first 2 pixel rgbs
- pmulhuw xmm0, xmm1 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
- pshufb xmm1, xmm5 // isolate next 2 alphas
- movdqu xmm2, [eax] // read 4 pixels
- punpckhbw xmm2, xmm2 // next 2 pixel rgbs
- pmulhuw xmm1, xmm2 // rgb * a
- movdqu xmm2, [eax] // mask original alpha
- lea eax, [eax + 16]
- pand xmm2, xmm3
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- por xmm0, xmm2 // copy original alpha
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const ulvec8 kShuffleAlpha_AVX2 = {
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
- 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
- 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
-};
-__declspec(naked) __declspec(align(16))
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vmovdqa ymm4, kShuffleAlpha_AVX2
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
- vpslld ymm5, ymm5, 24
-
- align 4
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpshufb ymm2, ymm0, ymm4 // low 4 alphas
- vpshufb ymm3, ymm1, ymm4 // high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * a
- vpmulhuw ymm1, ymm1, ymm3 // rgb * a
- vpand ymm6, ymm6, ymm5 // isolate alpha
- vpsrlw ymm0, ymm0, 8
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vpor ymm0, ymm0, ymm6 // copy original alpha
- sub ecx, 8
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb0
- mov edx, [esp + 8 + 8] // dst_argb
- mov ecx, [esp + 8 + 12] // width
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 3] // first alpha
- movzx edi, byte ptr [eax + 7] // second alpha
- punpcklbw xmm0, xmm0 // first 2
- movd xmm2, dword ptr fixed_invtbl8[esi * 4]
- movd xmm3, dword ptr fixed_invtbl8[edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm0, xmm2 // rgb * a
-
- movdqu xmm1, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 11] // third alpha
- movzx edi, byte ptr [eax + 15] // forth alpha
- punpckhbw xmm1, xmm1 // next 2
- movd xmm2, dword ptr fixed_invtbl8[esi * 4]
- movd xmm3, dword ptr fixed_invtbl8[edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm1, xmm2 // rgb * a
- lea eax, [eax + 16]
-
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
-};
-// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
-// USE_GATHER is not on by default, due to being a slow instruction.
-#ifdef USE_GATHER
-__declspec(naked) __declspec(align(16))
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
-
- align 4
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
- vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- sub ecx, 8
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#else // USE_GATHER
-__declspec(naked) __declspec(align(16))
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- __asm {
-
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
-
- push esi
- push edi
-
- align 4
- convertloop:
- // replace VPGATHER
- movzx esi, byte ptr [eax + 3] // alpha0
- movzx edi, byte ptr [eax + 7] // alpha1
- vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
- vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
- movzx esi, byte ptr [eax + 11] // alpha2
- movzx edi, byte ptr [eax + 15] // alpha3
- vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
- vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
- vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
- movzx esi, byte ptr [eax + 19] // alpha4
- movzx edi, byte ptr [eax + 23] // alpha5
- vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
- vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
- vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
- movzx esi, byte ptr [eax + 27] // alpha6
- movzx edi, byte ptr [eax + 31] // alpha7
- vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
- vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
- vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
- vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
- vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
- vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
- vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
- // end of VPGATHER
-
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- sub ecx, 8
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // USE_GATHER
-#endif // HAS_ARGBATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) __declspec(align(16))
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, kARGBToYJ
- movdqa xmm5, kAddYJ64
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // G
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm0, xmm1
- paddw xmm0, xmm5 // Add .5 for rounding.
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 G bytes
- movdqa xmm2, [eax] // A
- movdqa xmm3, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm2, 24
- psrld xmm3, 24
- packuswb xmm2, xmm3
- packuswb xmm2, xmm2 // 8 A bytes
- movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
- punpcklbw xmm0, xmm0 // 8 GG words
- punpcklbw xmm3, xmm2 // 8 GA words
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm3 // GGGA first 4
- punpckhwd xmm1, xmm3 // GGGA next 4
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBGRAYROW_SSSE3
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {
- 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
-
-static const vec8 kARGBToSepiaG = {
- 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
-
-static const vec8 kARGBToSepiaR = {
- 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) __declspec(align(16))
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- mov ecx, [esp + 8] /* width */
- movdqa xmm2, kARGBToSepiaB
- movdqa xmm3, kARGBToSepiaG
- movdqa xmm4, kARGBToSepiaR
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm6, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm6, xmm2
- phaddw xmm0, xmm6
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 B values
- movdqa xmm5, [eax] // G
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm5, xmm3
- pmaddubsw xmm1, xmm3
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
- movdqa xmm5, [eax] // R
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm5, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
- movdqa xmm6, [eax] // A
- movdqa xmm1, [eax + 16]
- psrld xmm6, 24
- psrld xmm1, 24
- packuswb xmm6, xmm1
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm5, xmm6 // 8 RA values
- movdqa xmm1, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm5 // BGRA first 4
- punpckhwd xmm1, xmm5 // BGRA next 4
- sub ecx, 8
- movdqa [eax], xmm0
- movdqa [eax + 16], xmm1
- lea eax, [eax + 32]
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBSEPIAROW_SSSE3
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
-// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* matrix_argb */
- movdqu xmm5, [ecx]
- pshufd xmm2, xmm5, 0x00
- pshufd xmm3, xmm5, 0x55
- pshufd xmm4, xmm5, 0xaa
- pshufd xmm5, xmm5, 0xff
- mov ecx, [esp + 16] /* width */
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm7, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm7, xmm2
- movdqa xmm6, [eax] // G
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm6, xmm3
- pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm7 // B
- phaddsw xmm6, xmm1 // G
- psraw xmm0, 6 // B
- psraw xmm6, 6 // G
- packuswb xmm0, xmm0 // 8 B values
- packuswb xmm6, xmm6 // 8 G values
- punpcklbw xmm0, xmm6 // 8 BG values
- movdqa xmm1, [eax] // R
- movdqa xmm7, [eax + 16]
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm7, xmm4
- phaddsw xmm1, xmm7 // R
- movdqa xmm6, [eax] // A
- movdqa xmm7, [eax + 16]
- pmaddubsw xmm6, xmm5
- pmaddubsw xmm7, xmm5
- phaddsw xmm6, xmm7 // A
- psraw xmm1, 6 // R
- psraw xmm6, 6 // A
- packuswb xmm1, xmm1 // 8 R values
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm1, xmm6 // 8 RA values
- movdqa xmm6, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm1 // BGRA first 4
- punpckhwd xmm6, xmm1 // BGRA next 4
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm6
- lea eax, [eax + 32]
- lea edx, [edx + 32]
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- movd xmm2, [esp + 8] /* scale */
- movd xmm3, [esp + 12] /* interval_size */
- movd xmm4, [esp + 16] /* interval_offset */
- mov ecx, [esp + 20] /* width */
- pshuflw xmm2, xmm2, 040h
- pshufd xmm2, xmm2, 044h
- pshuflw xmm3, xmm3, 040h
- pshufd xmm3, xmm3, 044h
- pshuflw xmm4, xmm4, 040h
- pshufd xmm4, xmm4, 044h
- pxor xmm5, xmm5 // constant 0
- pcmpeqb xmm6, xmm6 // generate mask 0xff000000
- pslld xmm6, 24
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm5 // first 2 pixels
- pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqa xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm5 // next 2 pixels
- pmulhuw xmm1, xmm2
- pmullw xmm0, xmm3 // * interval_size
- movdqa xmm7, [eax] // read 4 pixels
- pmullw xmm1, xmm3
- pand xmm7, xmm6 // mask alpha
- paddw xmm0, xmm4 // + interval_size / 2
- paddw xmm1, xmm4
- packuswb xmm0, xmm1
- por xmm0, xmm7
- sub ecx, 4
- movdqa [eax], xmm0
- lea eax, [eax + 16]
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBQUANTIZEROW_SSE2
-
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- movd xmm2, [esp + 16] // value
- punpcklbw xmm2, xmm2
- punpcklqdq xmm2, xmm2
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 4 pixels
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- pmulhuw xmm0, xmm2 // argb * value
- pmulhuw xmm1, xmm2 // argb * value
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBSHADEROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pxor xmm5, xmm5 // constant 0
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- movdqu xmm2, [esi] // read 4 pixels from src_argb1
- movdqu xmm1, xmm0
- movdqu xmm3, xmm2
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- punpcklbw xmm2, xmm5 // first 2
- punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
- lea eax, [eax + 16]
- lea esi, [esi + 16]
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBMULTIPLYROW_SSE2
-
-#ifdef HAS_ARGBADDROW_SSE2
-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) __declspec(align(16))
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- sub ecx, 4
- jl convertloop49
-
- align 4
- convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jge convertloop4
-
- convertloop49:
- add ecx, 4 - 1
- jl convertloop19
-
- convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb0
- lea eax, [eax + 4]
- movd xmm1, [esi] // read 1 pixels from src_argb1
- lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge convertloop1
-
- convertloop19:
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBADDROW_SSE2
-
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBSUBTRACTROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- vpxor ymm5, ymm5, ymm5 // constant 0
-
- align 4
- convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
- lea esi, [esi + 32]
- vpunpcklbw ymm0, ymm1, ymm1 // low 4
- vpunpckhbw ymm1, ymm1, ymm1 // high 4
- vpunpcklbw ymm2, ymm3, ymm5 // low 4
- vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
- vpackuswb ymm0, ymm0, ymm1
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBMULTIPLYROW_AVX2
-
-#ifdef HAS_ARGBADDROW_AVX2
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBADDROW_AVX2
-
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBSUBTRACTROW_AVX2
-
-#ifdef HAS_SOBELXROW_SSE2
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-__declspec(naked) __declspec(align(16))
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y0
- mov esi, [esp + 8 + 8] // src_y1
- mov edi, [esp + 8 + 12] // src_y2
- mov edx, [esp + 8 + 16] // dst_sobelx
- mov ecx, [esp + 8 + 20] // width
- sub esi, eax
- sub edi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
-
- align 4
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
- movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- sub ecx, 8
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELXROW_SSE2
-
-#ifdef HAS_SOBELYROW_SSE2
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-__declspec(naked) __declspec(align(16))
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_y0
- mov esi, [esp + 4 + 8] // src_y1
- mov edx, [esp + 4 + 12] // dst_sobely
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
-
- align 4
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
- movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- sub ecx, 8
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELYROW_SSE2
-
-#ifdef HAS_SOBELROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-__declspec(naked) __declspec(align(16))
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
- pslld xmm5, 24 // 0xff000000
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqa xmm2, xmm0 // GG
- punpcklbw xmm2, xmm0 // First 8
- punpckhbw xmm0, xmm0 // Next 8
- movdqa xmm1, xmm2 // GGGG
- punpcklwd xmm1, xmm2 // First 4
- punpckhwd xmm2, xmm2 // Next 4
- por xmm1, xmm5 // GGGA
- por xmm2, xmm5
- movdqa xmm3, xmm0 // GGGG
- punpcklwd xmm3, xmm0 // Next 4
- punpckhwd xmm0, xmm0 // Last 4
- por xmm3, xmm5 // GGGA
- por xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm1
- movdqa [edx + 16], xmm2
- movdqa [edx + 32], xmm3
- movdqa [edx + 48], xmm0
- lea edx, [edx + 64]
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELROW_SSE2
-
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) __declspec(align(16))
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELTOPLANEROW_SSE2
-
-#ifdef HAS_SOBELXYROW_SSE2
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-__declspec(naked) __declspec(align(16))
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
-
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- paddusb xmm2, xmm1 // sobel = sobelx + sobely
- movdqa xmm3, xmm0 // XA
- punpcklbw xmm3, xmm5
- punpckhbw xmm0, xmm5
- movdqa xmm4, xmm1 // YS
- punpcklbw xmm4, xmm2
- punpckhbw xmm1, xmm2
- movdqa xmm6, xmm4 // YSXA
- punpcklwd xmm6, xmm3 // First 4
- punpckhwd xmm4, xmm3 // Next 4
- movdqa xmm7, xmm1 // YSXA
- punpcklwd xmm7, xmm0 // Next 4
- punpckhwd xmm1, xmm0 // Last 4
- sub ecx, 16
- movdqa [edx], xmm6
- movdqa [edx + 16], xmm4
- movdqa [edx + 32], xmm7
- movdqa [edx + 48], xmm1
- lea edx, [edx + 64]
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELXYROW_SSE2
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-// Consider float CumulativeSum.
-// Consider calling CumulativeSum one row at time as needed.
-// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
-// Convert cumulative sum for an area to an average for 1 pixel.
-// topleft is pointer to top left of CumulativeSum buffer for area.
-// botleft is pointer to bottom left of CumulativeSum buffer.
-// width is offset from left to right of area in CumulativeSum buffer measured
-// in number of ints.
-// area is the number of pixels in the area being averaged.
-// dst points to pixel to store result to.
-// count is number of averaged pixels to produce.
-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
-// aligned.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst,
- int count) {
- __asm {
- mov eax, topleft // eax topleft
- mov esi, botleft // esi botleft
- mov edx, width
- movd xmm5, area
- mov edi, dst
- mov ecx, count
- cvtdq2ps xmm5, xmm5
- rcpss xmm4, xmm5 // 1.0f / area
- pshufd xmm4, xmm4, 0
- sub ecx, 4
- jl l4b
-
- cmp area, 128 // 128 pixels will not overflow 15 bits.
- ja l4
-
- pshufd xmm5, xmm5, 0 // area
- pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
- psrld xmm6, 16
- cvtdq2ps xmm6, xmm6
- addps xmm5, xmm6 // (65536.0 + area - 1)
- mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
- cvtps2dq xmm5, xmm5 // 0.16 fixed point
- packssdw xmm5, xmm5 // 16 bit shorts
-
- // 4 pixel loop small blocks.
- align 4
- s4:
- // top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
-
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
-
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
-
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
-
- packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
- packssdw xmm2, xmm3
-
- pmulhuw xmm0, xmm5
- pmulhuw xmm2, xmm5
-
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge s4
-
- jmp l4b
-
- // 4 pixel loop
- align 4
- l4:
- // top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
-
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
-
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
-
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
-
- cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
- cvtdq2ps xmm1, xmm1
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- cvtdq2ps xmm2, xmm2
- cvtdq2ps xmm3, xmm3
- mulps xmm2, xmm4
- mulps xmm3, xmm4
- cvtps2dq xmm0, xmm0
- cvtps2dq xmm1, xmm1
- cvtps2dq xmm2, xmm2
- cvtps2dq xmm3, xmm3
- packssdw xmm0, xmm1
- packssdw xmm2, xmm3
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge l4
-
- l4b:
- add ecx, 4 - 1
- jl l1b
-
- // 1 pixel loop
- align 4
- l1:
- movdqa xmm0, [eax]
- psubd xmm0, [eax + edx * 4]
- lea eax, [eax + 16]
- psubd xmm0, [esi]
- paddd xmm0, [esi + edx * 4]
- lea esi, [esi + 16]
- cvtdq2ps xmm0, xmm0
- mulps xmm0, xmm4
- cvtps2dq xmm0, xmm0
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
- sub ecx, 1
- jge l1
- l1b:
- }
-}
-#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
- __asm {
- mov eax, row
- mov edx, cumsum
- mov esi, previous_cumsum
- mov ecx, width
- pxor xmm0, xmm0
- pxor xmm1, xmm1
-
- sub ecx, 4
- jl l4b
- test edx, 15
- jne l4b
-
- // 4 pixel loop
- align 4
- l4:
- movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
- lea eax, [eax + 16]
- movdqa xmm4, xmm2
-
- punpcklbw xmm2, xmm1
- movdqa xmm3, xmm2
- punpcklwd xmm2, xmm1
- punpckhwd xmm3, xmm1
-
- punpckhbw xmm4, xmm1
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm1
- punpckhwd xmm5, xmm1
-
- paddd xmm0, xmm2
- movdqa xmm2, [esi] // previous row above.
- paddd xmm2, xmm0
-
- paddd xmm0, xmm3
- movdqa xmm3, [esi + 16]
- paddd xmm3, xmm0
-
- paddd xmm0, xmm4
- movdqa xmm4, [esi + 32]
- paddd xmm4, xmm0
-
- paddd xmm0, xmm5
- movdqa xmm5, [esi + 48]
- lea esi, [esi + 64]
- paddd xmm5, xmm0
-
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
- movdqa [edx + 32], xmm4
- movdqa [edx + 48], xmm5
-
- lea edx, [edx + 64]
- sub ecx, 4
- jge l4
-
- l4b:
- add ecx, 4 - 1
- jl l1b
-
- // 1 pixel loop
- align 4
- l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
- lea eax, [eax + 4]
- punpcklbw xmm2, xmm1
- punpcklwd xmm2, xmm1
- paddd xmm0, xmm2
- movdqu xmm2, [esi]
- lea esi, [esi + 16]
- paddd xmm2, xmm0
- movdqu [edx], xmm2
- lea edx, [edx + 16]
- sub ecx, 1
- jge l1
-
- l1b:
- }
-}
-#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) __declspec(align(16))
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 12] // src_argb
- mov esi, [esp + 16] // stride
- mov edx, [esp + 20] // dst_argb
- mov ecx, [esp + 24] // pointer to uv_dudv
- movq xmm2, qword ptr [ecx] // uv
- movq xmm7, qword ptr [ecx + 8] // dudv
- mov ecx, [esp + 28] // width
- shl esi, 16 // 4, stride
- add esi, 4
- movd xmm5, esi
- sub ecx, 4
- jl l4b
-
- // setup for 4 pixel loop
- pshufd xmm7, xmm7, 0x44 // dup dudv
- pshufd xmm5, xmm5, 0 // dup 4, stride
- movdqa xmm0, xmm2 // x0, y0, x1, y1
- addps xmm0, xmm7
- movlhps xmm2, xmm0
- movdqa xmm4, xmm7
- addps xmm4, xmm4 // dudv *= 2
- movdqa xmm3, xmm2 // x2, y2, x3, y3
- addps xmm3, xmm4
- addps xmm4, xmm4 // dudv *= 4
-
- // 4 pixel loop
- align 4
- l4:
- cvttps2dq xmm0, xmm2 // x, y float to int first 2
- cvttps2dq xmm1, xmm3 // x, y float to int next 2
- packssdw xmm0, xmm1 // x, y as 8 shorts
- pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd xmm1, [eax + esi] // read pixel 0
- movd xmm6, [eax + edi] // read pixel 1
- punpckldq xmm1, xmm6 // combine pixel 0 and 1
- addps xmm2, xmm4 // x, y += dx, dy first 2
- movq qword ptr [edx], xmm1
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- movd xmm6, [eax + esi] // read pixel 2
- movd xmm0, [eax + edi] // read pixel 3
- punpckldq xmm6, xmm0 // combine pixel 2 and 3
- addps xmm3, xmm4 // x, y += dx, dy next 2
- sub ecx, 4
- movq qword ptr 8[edx], xmm6
- lea edx, [edx + 16]
- jge l4
-
- l4b:
- add ecx, 4 - 1
- jl l1b
-
- // 1 pixel loop
- align 4
- l1:
- cvttps2dq xmm0, xmm2 // x, y float to int
- packssdw xmm0, xmm0 // x, y as shorts
- pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
- addps xmm2, xmm7 // x, y += dx, dy
- movd esi, xmm0
- movd xmm0, [eax + esi] // copy a pixel
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge l1
- l1b:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBAFFINEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- shr eax, 1
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
- sub edi, esi
- cmp eax, 32
- je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
- cmp eax, 96
- je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
-
- vmovd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- vmovd xmm5, eax // low fraction 128..1
- vpunpcklbw xmm5, xmm5, xmm0
- vpunpcklwd xmm5, xmm5, xmm5
- vpxor ymm0, ymm0, ymm0
- vpermd ymm5, ymm0, ymm5
-
- align 4
- xloop:
- vmovdqu ymm0, [esi]
- vmovdqu ymm2, [esi + edx]
- vpunpckhbw ymm1, ymm0, ymm2 // mutates
- vpunpcklbw ymm0, ymm0, ymm2 // mutates
- vpmaddubsw ymm0, ymm0, ymm5
- vpmaddubsw ymm1, ymm1, ymm5
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm1, ymm1, 7
- vpackuswb ymm0, ymm0, ymm1 // unmutates
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- vpavgb ymm0, ymm0, [esi + edx]
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- vmovdqu ymm0, [esi + edx]
- vpavgb ymm0, ymm0, [esi]
- vpavgb ymm0, ymm0, [esi]
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- rep movsb
-
- xloop99:
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_AVX2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
- cmp eax, 32
- je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
- cmp eax, 96
- je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
-
- movd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- align 4
- xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- cmp eax, 64
- je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- cmp eax, 192
- je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
-
- movd xmm5, eax // xmm5 = y fraction
- punpcklbw xmm5, xmm5
- psrlw xmm5, 1
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- punpcklqdq xmm5, xmm5
- pxor xmm4, xmm4
-
- align 4
- xloop:
- movdqa xmm0, [esi] // row0
- movdqa xmm2, [esi + edx] // row1
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- psubw xmm2, xmm0 // row1 - row0
- psubw xmm3, xmm1
- paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
- paddw xmm3, xmm3
- pmulhw xmm2, xmm5 // scale diff
- pmulhw xmm3, xmm5
- paddw xmm0, xmm2 // sum rows
- paddw xmm1, xmm3
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
- cmp eax, 32
- je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
- cmp eax, 96
- je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
-
- movd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- align 4
- xloop:
- movdqu xmm0, [esi]
- movdqu xmm2, [esi + edx]
- movdqu xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqu xmm1, [esi]
- movdqu xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqu xmm0, [esi]
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- cmp eax, 64
- je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- cmp eax, 192
- je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
-
- movd xmm5, eax // xmm5 = y fraction
- punpcklbw xmm5, xmm5
- psrlw xmm5, 1
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- punpcklqdq xmm5, xmm5
- pxor xmm4, xmm4
-
- align 4
- xloop:
- movdqu xmm0, [esi] // row0
- movdqu xmm2, [esi + edx] // row1
- movdqu xmm1, xmm0
- movdqu xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- psubw xmm2, xmm0 // row1 - row0
- psubw xmm3, xmm1
- paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
- paddw xmm3, xmm3
- pmulhw xmm2, xmm5 // scale diff
- pmulhw xmm3, xmm5
- paddw xmm0, xmm2 // sum rows
- paddw xmm1, xmm3
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqu xmm1, [esi]
- movdqu xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqu xmm0, [esi]
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-__declspec(naked) __declspec(align(16))
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // src_uv_stride
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- sub edi, eax
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- pavgb xmm0, [eax + edx]
- sub ecx, 16
- movdqa [eax + edi], xmm0
- lea eax, [eax + 16]
- jg convertloop
- pop edi
- ret
- }
-}
-
-#ifdef HAS_HALFROW_AVX2
-__declspec(naked) __declspec(align(16))
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // src_uv_stride
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- sub edi, eax
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vpavgb ymm0, ymm0, [eax + edx]
- sub ecx, 32
- vmovdqu [eax + edi], ymm0
- lea eax, [eax + 32]
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_HALFROW_AVX2
-
-__declspec(naked) __declspec(align(16))
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
- movd xmm5, [esp + 12] // selector
- mov ecx, [esp + 16] // pix
- pshufd xmm5, xmm5, 0
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- punpckldq xmm0, xmm1
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
- ret
- }
-}
-
-// Specialized ARGB to Bayer that just isolates G channel.
-__declspec(naked) __declspec(align(16))
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
- // selector
- mov ecx, [esp + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
- psrld xmm5, 24
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm0, 8 // Move green to bottom.
- psrld xmm1, 8
- pand xmm0, xmm5
- pand xmm1, xmm5
- packssdw xmm0, xmm1
- packuswb xmm0, xmm1
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
- ret
- }
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
- mov ecx, [esp + 16] // pix
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- jg wloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
- mov ecx, [esp + 16] // pix
-
- align 4
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- sub ecx, 8
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- jg wloop
- ret
- }
-}
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
- mov ecx, [esp + 16] // pix
-
- align 4
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpshufb ymm0, ymm0, ymm5
- vpshufb ymm1, ymm1, ymm5
- sub ecx, 16
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- jg wloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBSHUFFLEROW_AVX2
-
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- __asm {
- push ebx
- push esi
- mov eax, [esp + 8 + 4] // src_argb
- mov edx, [esp + 8 + 8] // dst_argb
- mov esi, [esp + 8 + 12] // shuffler
- mov ecx, [esp + 8 + 16] // pix
- pxor xmm5, xmm5
-
- mov ebx, [esi] // shuffler
- cmp ebx, 0x03000102
- je shuf_3012
- cmp ebx, 0x00010203
- je shuf_0123
- cmp ebx, 0x00030201
- je shuf_0321
- cmp ebx, 0x02010003
- je shuf_2103
-
- // TODO(fbarchard): Use one source pointer and 3 offsets.
- shuf_any1:
- movzx ebx, byte ptr [esi]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx], bl
- movzx ebx, byte ptr [esi + 1]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 1], bl
- movzx ebx, byte ptr [esi + 2]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 2], bl
- movzx ebx, byte ptr [esi + 3]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 3], bl
- lea eax, [eax + 4]
- lea edx, [edx + 4]
- sub ecx, 1
- jg shuf_any1
- jmp shuf99
-
- align 4
- shuf_0123:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
- pshuflw xmm0, xmm0, 01Bh
- pshufhw xmm1, xmm1, 01Bh
- pshuflw xmm1, xmm1, 01Bh
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg shuf_0123
- jmp shuf99
-
- align 4
- shuf_0321:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
- pshuflw xmm0, xmm0, 039h
- pshufhw xmm1, xmm1, 039h
- pshuflw xmm1, xmm1, 039h
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg shuf_0321
- jmp shuf99
-
- align 4
- shuf_2103:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
- pshuflw xmm0, xmm0, 093h
- pshufhw xmm1, xmm1, 093h
- pshuflw xmm1, xmm1, 093h
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg shuf_2103
- jmp shuf99
-
- align 4
- shuf_3012:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
- pshuflw xmm0, xmm0, 0C6h
- pshufhw xmm1, xmm1, 0C6h
- pshuflw xmm1, xmm1, 0C6h
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg shuf_3012
-
- shuf99:
- pop esi
- pop ebx
- ret
- }
-}
-
-// YUY2 - Macro-pixel = 2 image pixels
-// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
-
-// UYVY - Macro-pixel = 2 image pixels
-// U0Y0V0Y1
-
-__declspec(naked) __declspec(align(16))
-void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
-
- align 4
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2 // YUYV
- punpckhbw xmm1, xmm2
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm1
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
-
- align 4
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- movdqa xmm1, xmm2
- lea eax, [eax + 16]
- punpcklbw xmm1, xmm0 // UYVY
- punpckhbw xmm2, xmm0
- movdqu [edi], xmm1
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) __declspec(align(16))
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* src_argb */
- mov edx, [esp + 4 + 8] /* dst_argb */
- mov esi, [esp + 4 + 12] /* poly */
- mov ecx, [esp + 4 + 16] /* width */
- pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
-
- // 2 pixel loop.
- align 4
- convertloop:
-// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
-// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
- movq xmm0, qword ptr [eax] // BGRABGRA
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm3
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm3 // pixel 0
- punpckhwd xmm4, xmm3 // pixel 1
- cvtdq2ps xmm0, xmm0 // 4 floats
- cvtdq2ps xmm4, xmm4
- movdqa xmm1, xmm0 // X
- movdqa xmm5, xmm4
- mulps xmm0, [esi + 16] // C1 * X
- mulps xmm4, [esi + 16]
- addps xmm0, [esi] // result = C0 + C1 * X
- addps xmm4, [esi]
- movdqa xmm2, xmm1
- movdqa xmm6, xmm5
- mulps xmm2, xmm1 // X * X
- mulps xmm6, xmm5
- mulps xmm1, xmm2 // X * X * X
- mulps xmm5, xmm6
- mulps xmm2, [esi + 32] // C2 * X * X
- mulps xmm6, [esi + 32]
- mulps xmm1, [esi + 48] // C3 * X * X * X
- mulps xmm5, [esi + 48]
- addps xmm0, xmm2 // result += C2 * X * X
- addps xmm4, xmm6
- addps xmm0, xmm1 // result += C3 * X * X * X
- addps xmm4, xmm5
- cvttps2dq xmm0, xmm0
- cvttps2dq xmm4, xmm4
- packuswb xmm0, xmm4
- packuswb xmm0, xmm0
- sub ecx, 2
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg convertloop
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBPOLYNOMIALROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) __declspec(align(16))
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* poly */
- vbroadcastf128 ymm4, [ecx] // C0
- vbroadcastf128 ymm5, [ecx + 16] // C1
- vbroadcastf128 ymm6, [ecx + 32] // C2
- vbroadcastf128 ymm7, [ecx + 48] // C3
- mov ecx, [esp + 16] /* width */
-
- // 2 pixel loop.
- align 4
- convertloop:
- vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
- lea eax, [eax + 8]
- vcvtdq2ps ymm0, ymm0 // X 8 floats
- vmulps ymm2, ymm0, ymm0 // X * X
- vmulps ymm3, ymm0, ymm7 // C3 * X
- vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
- vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
- vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
- vcvttps2dq ymm0, ymm0
- vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
- vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
- vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
- sub ecx, 2
- vmovq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBPOLYNOMIALROW_AVX2
-
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
-
- // 1 pixel loop.
- align 4
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- movzx edx, byte ptr [eax - 4 + 3]
- movzx edx, byte ptr [esi + edx * 4 + 3]
- mov byte ptr [eax - 4 + 3], dl
- dec ecx
- jg convertloop
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBCOLORTABLEROW_X86
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Tranform RGB pixels with color table.
-__declspec(naked) __declspec(align(16))
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
-
- // 1 pixel loop.
- align 4
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- dec ecx
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_RGBCOLORTABLEROW_X86
-
-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// Tranform RGB pixels with luma table.
-__declspec(naked) __declspec(align(16))
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- int width,
- const uint8* luma, uint32 lumacoeff) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] /* src_argb */
- mov edi, [esp + 8 + 8] /* dst_argb */
- mov ecx, [esp + 8 + 12] /* width */
- movd xmm2, dword ptr [esp + 8 + 16] // luma table
- movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
- pshufd xmm2, xmm2, 0
- pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
- psllw xmm4, 8
- pxor xmm5, xmm5
-
- // 4 pixel loop.
- align 4
- convertloop:
- movdqu xmm0, qword ptr [eax] // generate luma ptr
- pmaddubsw xmm0, xmm3
- phaddw xmm0, xmm0
- pand xmm0, xmm4 // mask out low bits
- punpcklwd xmm0, xmm5
- paddd xmm0, xmm2 // add table base
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
-
- movzx edx, byte ptr [eax]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi], dl
- movzx edx, byte ptr [eax + 1]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 1], dl
- movzx edx, byte ptr [eax + 2]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 2], dl
- movzx edx, byte ptr [eax + 3] // copy alpha.
- mov byte ptr [edi + 3], dl
-
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
-
- movzx edx, byte ptr [eax + 4]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 4], dl
- movzx edx, byte ptr [eax + 5]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 5], dl
- movzx edx, byte ptr [eax + 6]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 6], dl
- movzx edx, byte ptr [eax + 7] // copy alpha.
- mov byte ptr [edi + 7], dl
-
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
-
- movzx edx, byte ptr [eax + 8]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 8], dl
- movzx edx, byte ptr [eax + 9]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 9], dl
- movzx edx, byte ptr [eax + 10]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 10], dl
- movzx edx, byte ptr [eax + 11] // copy alpha.
- mov byte ptr [edi + 11], dl
-
- movd esi, xmm0
-
- movzx edx, byte ptr [eax + 12]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 12], dl
- movzx edx, byte ptr [eax + 13]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 13], dl
- movzx edx, byte ptr [eax + 14]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 14], dl
- movzx edx, byte ptr [eax + 15] // copy alpha.
- mov byte ptr [edi + 15], dl
-
- sub ecx, 4
- lea eax, [eax + 16]
- lea edi, [edi + 16]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-
-#endif // defined(_M_X64)
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/row_x86.asm b/src/main/jni/libyuv/source/row_x86.asm
deleted file mode 100644
index 0cb326f8e..000000000
--- a/src/main/jni/libyuv/source/row_x86.asm
+++ /dev/null
@@ -1,146 +0,0 @@
-;
-; Copyright 2012 The LibYuv Project Authors. All rights reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%ifdef __YASM_VERSION_ID__
-%if __YASM_VERSION_ID__ < 01020000h
-%error AVX2 is supported only by yasm 1.2.0 or later.
-%endif
-%endif
-%include "x86inc.asm"
-
-SECTION .text
-
-; cglobal numeric constants are parameters, gpr regs, mm regs
-
-; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
-
-%macro YUY2TOYROW 2-3
-cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
-%ifidn %1,YUY2
- pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
- psrlw m2, m2, 8
-%endif
-
- ALIGN 4
-.convertloop:
- mov%2 m0, [src_yuy2q]
- mov%2 m1, [src_yuy2q + mmsize]
- lea src_yuy2q, [src_yuy2q + mmsize * 2]
-%ifidn %1,YUY2
- pand m0, m0, m2 ; YUY2 even bytes are Y
- pand m1, m1, m2
-%else
- psrlw m0, m0, 8 ; UYVY odd bytes are Y
- psrlw m1, m1, 8
-%endif
- packuswb m0, m0, m1
-%if cpuflag(AVX2)
- vpermq m0, m0, 0xd8
-%endif
- sub pixd, mmsize
- mov%2 [dst_yq], m0
- lea dst_yq, [dst_yq + mmsize]
- jg .convertloop
- REP_RET
-%endmacro
-
-; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
-INIT_MMX MMX
-YUY2TOYROW YUY2,a,
-YUY2TOYROW YUY2,u,_Unaligned
-YUY2TOYROW UYVY,a,
-YUY2TOYROW UYVY,u,_Unaligned
-INIT_XMM SSE2
-YUY2TOYROW YUY2,a,
-YUY2TOYROW YUY2,u,_Unaligned
-YUY2TOYROW UYVY,a,
-YUY2TOYROW UYVY,u,_Unaligned
-INIT_YMM AVX2
-YUY2TOYROW YUY2,a,
-YUY2TOYROW UYVY,a,
-
-; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
-
-%macro SplitUVRow 1-2
-cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
- pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
- psrlw m4, m4, 8
- sub dst_vq, dst_uq
-
- ALIGN 4
-.convertloop:
- mov%1 m0, [src_uvq]
- mov%1 m1, [src_uvq + mmsize]
- lea src_uvq, [src_uvq + mmsize * 2]
- psrlw m2, m0, 8 ; odd bytes
- psrlw m3, m1, 8
- pand m0, m0, m4 ; even bytes
- pand m1, m1, m4
- packuswb m0, m0, m1
- packuswb m2, m2, m3
-%if cpuflag(AVX2)
- vpermq m0, m0, 0xd8
- vpermq m2, m2, 0xd8
-%endif
- mov%1 [dst_uq], m0
- mov%1 [dst_uq + dst_vq], m2
- lea dst_uq, [dst_uq + mmsize]
- sub pixd, mmsize
- jg .convertloop
- REP_RET
-%endmacro
-
-INIT_MMX MMX
-SplitUVRow a,
-SplitUVRow u,_Unaligned
-INIT_XMM SSE2
-SplitUVRow a,
-SplitUVRow u,_Unaligned
-INIT_YMM AVX2
-SplitUVRow a,
-
-; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-; int width);
-
-%macro MergeUVRow_ 1-2
-cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
- sub src_vq, src_uq
-
- ALIGN 4
-.convertloop:
- mov%1 m0, [src_uq]
- mov%1 m1, [src_vq]
- lea src_uq, [src_uq + mmsize]
- punpcklbw m2, m0, m1 // first 8 UV pairs
- punpckhbw m0, m0, m1 // next 8 UV pairs
-%if cpuflag(AVX2)
- vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
- vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
- mov%1 [dst_uvq], m1
- mov%1 [dst_uvq + mmsize], m2
-%else
- mov%1 [dst_uvq], m2
- mov%1 [dst_uvq + mmsize], m0
-%endif
- lea dst_uvq, [dst_uvq + mmsize * 2]
- sub pixd, mmsize
- jg .convertloop
- REP_RET
-%endmacro
-
-INIT_MMX MMX
-MergeUVRow_ a,
-MergeUVRow_ u,_Unaligned
-INIT_XMM SSE2
-MergeUVRow_ a,
-MergeUVRow_ u,_Unaligned
-INIT_YMM AVX2
-MergeUVRow_ a,
-
diff --git a/src/main/jni/libyuv/source/scale.cc b/src/main/jni/libyuv/source/scale.cc
deleted file mode 100644
index 5b33b5f04..000000000
--- a/src/main/jni/libyuv/source/scale.cc
+++ /dev/null
@@ -1,1716 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h" // For CopyPlane
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Remove this macro if OVERREAD is safe.
-#define AVOID_OVERREAD 1
-
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-
-// Scale plane, 1/2
-// This is an optimized version for scaling down a plane to 1/2 of
-// its original size.
-
-static void ScalePlaneDown2(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) =
- filtering == kFilterNone ? ScaleRowDown2_C :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_C :
- ScaleRowDown2Box_C);
- int row_stride = src_stride << 1;
- if (!filtering) {
- src_ptr += src_stride; // Point to odd rows.
- src_stride = 0;
- }
-
-#if defined(HAS_SCALEROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
- }
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
- ScaleRowDown2Box_Unaligned_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
- ScaleRowDown2Box_SSE2);
- }
- }
-#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
- IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown2 = filtering ?
- ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
- }
-#endif
-
- if (filtering == kFilterLinear) {
- src_stride = 0;
- }
- // TODO(fbarchard): Loop through source height to allow odd height.
- for (y = 0; y < dst_height; ++y) {
- ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += row_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void ScalePlaneDown2_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) =
- filtering == kFilterNone ? ScaleRowDown2_16_C :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
- ScaleRowDown2Box_16_C);
- int row_stride = src_stride << 1;
- if (!filtering) {
- src_ptr += src_stride; // Point to odd rows.
- src_stride = 0;
- }
-
-#if defined(HAS_SCALEROWDOWN2_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
- ScaleRowDown2_16_NEON;
- }
-#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ?
- ScaleRowDown2_Unaligned_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
- ScaleRowDown2Box_Unaligned_16_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
- ScaleRowDown2Box_16_SSE2);
- }
- }
-#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
- IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown2 = filtering ?
- ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
- }
-#endif
-
- if (filtering == kFilterLinear) {
- src_stride = 0;
- }
- // TODO(fbarchard): Loop through source height to allow odd height.
- for (y = 0; y < dst_height; ++y) {
- ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += row_stride;
- dst_ptr += dst_stride;
- }
-}
-
-// Scale plane, 1/4
-// This is an optimized version for scaling down a plane to 1/4 of
-// its original size.
-
-static void ScalePlaneDown4(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) =
- filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
- int row_stride = src_stride << 2;
- if (!filtering) {
- src_ptr += src_stride * 2; // Point to row 2.
- src_stride = 0;
- }
-#if defined(HAS_SCALEROWDOWN4_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
- }
-#elif defined(HAS_SCALEROWDOWN4_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
- }
-#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
- }
-#endif
-
- if (filtering == kFilterLinear) {
- src_stride = 0;
- }
- for (y = 0; y < dst_height; ++y) {
- ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += row_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void ScalePlaneDown4_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) =
- filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
- int row_stride = src_stride << 2;
- if (!filtering) {
- src_ptr += src_stride * 2; // Point to row 2.
- src_stride = 0;
- }
-#if defined(HAS_SCALEROWDOWN4_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
- ScaleRowDown4_16_NEON;
- }
-#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
- ScaleRowDown4_16_SSE2;
- }
-#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
- }
-#endif
-
- if (filtering == kFilterLinear) {
- src_stride = 0;
- }
- for (y = 0; y < dst_height; ++y) {
- ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += row_stride;
- dst_ptr += dst_stride;
- }
-}
-
-// Scale plane down, 3/4
-
-static void ScalePlaneDown34(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
- const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
- assert(dst_width % 3 == 0);
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_C;
- ScaleRowDown34_1 = ScaleRowDown34_C;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
- }
-#if defined(HAS_SCALEROWDOWN34_NEON)
- if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_NEON;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
- }
- }
-#endif
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
- }
- }
-#endif
-#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
- }
- }
-#endif
-
- for (y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
- dst_ptr, dst_width);
- src_ptr += src_stride * 2;
- dst_ptr += dst_stride;
- }
-
- // Remainder 1 or 2 rows with last row vertically unfiltered
- if ((dst_height % 3) == 2) {
- ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
- } else if ((dst_height % 3) == 1) {
- ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
- }
-}
-
-static void ScalePlaneDown34_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
- void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
- const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
- assert(dst_width % 3 == 0);
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_16_C;
- ScaleRowDown34_1 = ScaleRowDown34_16_C;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
- }
-#if defined(HAS_SCALEROWDOWN34_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
- }
- }
-#endif
-#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
- }
- }
-#endif
-#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
- }
- }
-#endif
-
- for (y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
- dst_ptr, dst_width);
- src_ptr += src_stride * 2;
- dst_ptr += dst_stride;
- }
-
- // Remainder 1 or 2 rows with last row vertically unfiltered
- if ((dst_height % 3) == 2) {
- ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
- } else if ((dst_height % 3) == 1) {
- ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
- }
-}
-
-
-// Scale plane, 3/8
-// This is an optimized version for scaling down a plane to 3/8
-// of its original size.
-//
-// Uses box filter arranges like this
-// aaabbbcc -> abc
-// aaabbbcc def
-// aaabbbcc ghi
-// dddeeeff
-// dddeeeff
-// dddeeeff
-// ggghhhii
-// ggghhhii
-// Boxes are 3x3, 2x3, 3x2 and 2x2
-
-static void ScalePlaneDown38(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
- const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
- assert(dst_width % 3 == 0);
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_C;
- ScaleRowDown38_2 = ScaleRowDown38_C;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
- }
-#if defined(HAS_SCALEROWDOWN38_NEON)
- if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_NEON;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
- }
- }
-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
- }
- }
-#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
- }
- }
-#endif
-
- for (y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- dst_ptr += dst_stride;
- ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- dst_ptr += dst_stride;
- ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 2;
- dst_ptr += dst_stride;
- }
-
- // Remainder 1 or 2 rows with last row vertically unfiltered
- if ((dst_height % 3) == 2) {
- ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- dst_ptr += dst_stride;
- ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
- } else if ((dst_height % 3) == 1) {
- ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
- }
-}
-
-static void ScalePlaneDown38_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
- enum FilterMode filtering) {
- int y;
- void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
- const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
- assert(dst_width % 3 == 0);
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_16_C;
- ScaleRowDown38_2 = ScaleRowDown38_16_C;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
- }
-#if defined(HAS_SCALEROWDOWN38_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
- }
- }
-#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
- }
- }
-#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
- }
- }
-#endif
-
- for (y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- dst_ptr += dst_stride;
- ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- dst_ptr += dst_stride;
- ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 2;
- dst_ptr += dst_stride;
- }
-
- // Remainder 1 or 2 rows with last row vertically unfiltered
- if ((dst_height % 3) == 2) {
- ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- dst_ptr += dst_stride;
- ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
- } else if ((dst_height % 3) == 1) {
- ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
- }
-}
-
-static __inline uint32 SumBox(int iboxwidth, int iboxheight,
- ptrdiff_t src_stride, const uint8* src_ptr) {
- uint32 sum = 0u;
- int y;
- assert(iboxwidth > 0);
- assert(iboxheight > 0);
- for (y = 0; y < iboxheight; ++y) {
- int x;
- for (x = 0; x < iboxwidth; ++x) {
- sum += src_ptr[x];
- }
- src_ptr += src_stride;
- }
- return sum;
-}
-
-static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
- ptrdiff_t src_stride, const uint16* src_ptr) {
- uint32 sum = 0u;
- int y;
- assert(iboxwidth > 0);
- assert(iboxheight > 0);
- for (y = 0; y < iboxheight; ++y) {
- int x;
- for (x = 0; x < iboxwidth; ++x) {
- sum += src_ptr[x];
- }
- src_ptr += src_stride;
- }
- return sum;
-}
-
-static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
- int x, int dx, ptrdiff_t src_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int i;
- int boxwidth;
- for (i = 0; i < dst_width; ++i) {
- int ix = x >> 16;
- x += dx;
- boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
- (boxwidth * boxheight);
- }
-}
-
-static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
- int x, int dx, ptrdiff_t src_stride,
- const uint16* src_ptr, uint16* dst_ptr) {
- int i;
- int boxwidth;
- for (i = 0; i < dst_width; ++i) {
- int ix = x >> 16;
- x += dx;
- boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
- (boxwidth * boxheight);
- }
-}
-
-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
- uint32 sum = 0u;
- int x;
- assert(iboxwidth > 0);
- for (x = 0; x < iboxwidth; ++x) {
- sum += src_ptr[x];
- }
- return sum;
-}
-
-static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
- uint32 sum = 0u;
- int x;
- assert(iboxwidth > 0);
- for (x = 0; x < iboxwidth; ++x) {
- sum += src_ptr[x];
- }
- return sum;
-}
-
-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr) {
- int i;
- int scaletbl[2];
- int minboxwidth = (dx >> 16);
- int* scaleptr = scaletbl - minboxwidth;
- int boxwidth;
- scaletbl[0] = 65536 / (minboxwidth * boxheight);
- scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
- for (i = 0; i < dst_width; ++i) {
- int ix = x >> 16;
- x += dx;
- boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
- }
-}
-
-static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
- const uint32* src_ptr, uint16* dst_ptr) {
- int i;
- int scaletbl[2];
- int minboxwidth = (dx >> 16);
- int* scaleptr = scaletbl - minboxwidth;
- int boxwidth;
- scaletbl[0] = 65536 / (minboxwidth * boxheight);
- scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
- for (i = 0; i < dst_width; ++i) {
- int ix = x >> 16;
- x += dx;
- boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
- scaleptr[boxwidth] >> 16;
- }
-}
-
-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr) {
- int boxwidth = (dx >> 16);
- int scaleval = 65536 / (boxwidth * boxheight);
- int i;
- for (i = 0; i < dst_width; ++i) {
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
- x += boxwidth;
- }
-}
-
-static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
- const uint32* src_ptr, uint16* dst_ptr) {
- int boxwidth = (dx >> 16);
- int scaleval = 65536 / (boxwidth * boxheight);
- int i;
- for (i = 0; i < dst_width; ++i) {
- *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
- x += boxwidth;
- }
-}
-
-// Scale plane down to any dimensions, with interpolation.
-// (boxfilter).
-//
-// Same method as SimpleScale, which is fixed point, outputting
-// one pixel of destination using fixed point (16.16) to step
-// through source, sampling a box of pixel with simple
-// averaging.
-static void ScalePlaneBox(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int j;
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- const int max_y = (src_height << 16);
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
- // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
- if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
- uint8* dst = dst_ptr;
- int j;
- for (j = 0; j < dst_height; ++j) {
- int boxheight;
- int iy = y >> 16;
- const uint8* src = src_ptr + iy * src_stride;
- y += dy;
- if (y > max_y) {
- y = max_y;
- }
- boxheight = (y >> 16) - iy;
- ScalePlaneBoxRow_C(dst_width, boxheight,
- x, dx, src_stride,
- src, dst);
- dst += dst_stride;
- }
- return;
- }
- {
- // Allocate a row buffer of uint16.
- align_buffer_64(row16, src_width * 2);
- void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr) =
- (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
- void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
-
-#if defined(HAS_SCALEADDROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
-#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
-#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- ScaleAddRows = ScaleAddRows_SSE2;
- }
-#endif
-
- for (j = 0; j < dst_height; ++j) {
- int boxheight;
- int iy = y >> 16;
- const uint8* src = src_ptr + iy * src_stride;
- y += dy;
- if (y > (src_height << 16)) {
- y = (src_height << 16);
- }
- boxheight = (y >> 16) - iy;
- ScaleAddRows(src, src_stride, (uint16*)(row16),
- src_width, boxheight);
- ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
- dst_ptr);
- dst_ptr += dst_stride;
- }
- free_aligned_buffer_64(row16);
- }
-}
-
-static void ScalePlaneBox_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr) {
- int j;
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- const int max_y = (src_height << 16);
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
- // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
- if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
- uint16* dst = dst_ptr;
- int j;
- for (j = 0; j < dst_height; ++j) {
- int boxheight;
- int iy = y >> 16;
- const uint16* src = src_ptr + iy * src_stride;
- y += dy;
- if (y > max_y) {
- y = max_y;
- }
- boxheight = (y >> 16) - iy;
- ScalePlaneBoxRow_16_C(dst_width, boxheight,
- x, dx, src_stride,
- src, dst);
- dst += dst_stride;
- }
- return;
- }
- {
- // Allocate a row buffer of uint32.
- align_buffer_64(row32, src_width * 4);
- void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
- const uint32* src_ptr, uint16* dst_ptr) =
- (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
- void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
-
-#if defined(HAS_SCALEADDROWS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
-#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
-#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- ScaleAddRows = ScaleAddRows_16_SSE2;
- }
-#endif
-
- for (j = 0; j < dst_height; ++j) {
- int boxheight;
- int iy = y >> 16;
- const uint16* src = src_ptr + iy * src_stride;
- y += dy;
- if (y > (src_height << 16)) {
- y = (src_height << 16);
- }
- boxheight = (y >> 16) - iy;
- ScaleAddRows(src, src_stride, (uint32*)(row32),
- src_width, boxheight);
- ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
- dst_ptr);
- dst_ptr += dst_stride;
- }
- free_aligned_buffer_64(row32);
- }
-}
-
-// Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- enum FilterMode filtering) {
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
- // Allocate a row buffer.
- align_buffer_64(row, src_width);
-
- const int max_y = (src_height - 1) << 16;
- int j;
- void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) =
- (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_SSE2;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(src_width, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
- InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
- if (IS_ALIGNED(src_width, 4)) {
- InterpolateRow = InterpolateRow_MIPS_DSPR2;
- }
- }
-#endif
-
-
-#if defined(HAS_SCALEFILTERCOLS_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
- ScaleFilterCols = ScaleFilterCols_SSSE3;
- }
-#endif
- if (y > max_y) {
- y = max_y;
- }
-
- for (j = 0; j < dst_height; ++j) {
- int yi = y >> 16;
- const uint8* src = src_ptr + yi * src_stride;
- if (filtering == kFilterLinear) {
- ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
- } else {
- int yf = (y >> 8) & 255;
- InterpolateRow(row, src, src_stride, src_width, yf);
- ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
- }
- dst_ptr += dst_stride;
- y += dy;
- if (y > max_y) {
- y = max_y;
- }
- }
- free_aligned_buffer_64(row);
-}
-
-void ScalePlaneBilinearDown_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
- enum FilterMode filtering) {
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
- // Allocate a row buffer.
- align_buffer_64(row, src_width * 2);
-
- const int max_y = (src_height - 1) << 16;
- int j;
- void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) =
- (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
- void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_16_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
- if (IS_ALIGNED(src_width, 32)) {
- InterpolateRow = InterpolateRow_16_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_16_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
- InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
- if (IS_ALIGNED(src_width, 4)) {
- InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
- }
- }
-#endif
-
-
-#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
- ScaleFilterCols = ScaleFilterCols_16_SSSE3;
- }
-#endif
- if (y > max_y) {
- y = max_y;
- }
-
- for (j = 0; j < dst_height; ++j) {
- int yi = y >> 16;
- const uint16* src = src_ptr + yi * src_stride;
- if (filtering == kFilterLinear) {
- ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
- } else {
- int yf = (y >> 8) & 255;
- InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
- ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
- }
- dst_ptr += dst_stride;
- y += dy;
- if (y > max_y) {
- y = max_y;
- }
- }
- free_aligned_buffer_64(row);
-}
-
-// Scale up down with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- enum FilterMode filtering) {
- int j;
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) =
- filtering ? ScaleFilterCols_C : ScaleCols_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
- InterpolateRow = InterpolateRow_Any_SSE2;
- if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(dst_width, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_MIPS_DSPR2;
- }
- }
-#endif
-
- if (filtering && src_width >= 32768) {
- ScaleFilterCols = ScaleFilterCols64_C;
- }
-#if defined(HAS_SCALEFILTERCOLS_SSSE3)
- if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
- ScaleFilterCols = ScaleFilterCols_SSSE3;
- }
-#endif
- if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
- ScaleFilterCols = ScaleColsUp2_C;
-#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleFilterCols = ScaleColsUp2_SSE2;
- }
-#endif
- }
-
- if (y > max_y) {
- y = max_y;
- }
- {
- int yi = y >> 16;
- const uint8* src = src_ptr + yi * src_stride;
-
- // Allocate 2 row buffers.
- const int kRowSize = (dst_width + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-
- uint8* rowptr = row;
- int rowstride = kRowSize;
- int lasty = yi;
-
- ScaleFilterCols(rowptr, src, dst_width, x, dx);
- if (src_height > 1) {
- src += src_stride;
- }
- ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
-
- for (j = 0; j < dst_height; ++j) {
- yi = y >> 16;
- if (yi != lasty) {
- if (y > max_y) {
- y = max_y;
- yi = y >> 16;
- src = src_ptr + yi * src_stride;
- }
- if (yi != lasty) {
- ScaleFilterCols(rowptr, src, dst_width, x, dx);
- rowptr += rowstride;
- rowstride = -rowstride;
- lasty = yi;
- src += src_stride;
- }
- }
- if (filtering == kFilterLinear) {
- InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
- } else {
- int yf = (y >> 8) & 255;
- InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
- }
- dst_ptr += dst_stride;
- y += dy;
- }
- free_aligned_buffer_64(row);
- }
-}
-
-void ScalePlaneBilinearUp_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
- enum FilterMode filtering) {
- int j;
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_16_C;
- void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) =
- filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
-
-#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
- if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
- if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
- if (IS_ALIGNED(dst_width, 32)) {
- InterpolateRow = InterpolateRow_16_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
- if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_16_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
- }
- }
-#endif
-
- if (filtering && src_width >= 32768) {
- ScaleFilterCols = ScaleFilterCols64_16_C;
- }
-#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
- if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
- ScaleFilterCols = ScaleFilterCols_16_SSSE3;
- }
-#endif
- if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
- ScaleFilterCols = ScaleColsUp2_16_C;
-#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleFilterCols = ScaleColsUp2_16_SSE2;
- }
-#endif
- }
-
- if (y > max_y) {
- y = max_y;
- }
- {
- int yi = y >> 16;
- const uint16* src = src_ptr + yi * src_stride;
-
- // Allocate 2 row buffers.
- const int kRowSize = (dst_width + 15) & ~15;
- align_buffer_64(row, kRowSize * 4);
-
- uint16* rowptr = (uint16*)row;
- int rowstride = kRowSize;
- int lasty = yi;
-
- ScaleFilterCols(rowptr, src, dst_width, x, dx);
- if (src_height > 1) {
- src += src_stride;
- }
- ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
-
- for (j = 0; j < dst_height; ++j) {
- yi = y >> 16;
- if (yi != lasty) {
- if (y > max_y) {
- y = max_y;
- yi = y >> 16;
- src = src_ptr + yi * src_stride;
- }
- if (yi != lasty) {
- ScaleFilterCols(rowptr, src, dst_width, x, dx);
- rowptr += rowstride;
- rowstride = -rowstride;
- lasty = yi;
- src += src_stride;
- }
- }
- if (filtering == kFilterLinear) {
- InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
- } else {
- int yf = (y >> 8) & 255;
- InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
- }
- dst_ptr += dst_stride;
- y += dy;
- }
- free_aligned_buffer_64(row);
- }
-}
-
-// Scale Plane to/from any dimensions, without interpolation.
-// Fixed point math is used for performance: The upper 16 bits
-// of x and dx is the integer part of the source position and
-// the lower 16 bits are the fixed decimal part.
-
-static void ScalePlaneSimple(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int i;
- void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) = ScaleCols_C;
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
-
- if (src_width * 2 == dst_width && x < 0x8000) {
- ScaleCols = ScaleColsUp2_C;
-#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleCols = ScaleColsUp2_SSE2;
- }
-#endif
- }
-
- for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
- dst_width, x, dx);
- dst_ptr += dst_stride;
- y += dy;
- }
-}
-
-static void ScalePlaneSimple_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr) {
- int i;
- void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) = ScaleCols_16_C;
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
-
- if (src_width * 2 == dst_width && x < 0x8000) {
- ScaleCols = ScaleColsUp2_16_C;
-#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleCols = ScaleColsUp2_16_SSE2;
- }
-#endif
- }
-
- for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
- dst_width, x, dx);
- dst_ptr += dst_stride;
- y += dy;
- }
-}
-
-// Scale a plane.
-// This function dispatches to a specialized scaler based on scale factor.
-
-LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
- enum FilterMode filtering) {
- // Simplify filtering when possible.
- filtering = ScaleFilterReduce(src_width, src_height,
- dst_width, dst_height,
- filtering);
-
- // Negative height means invert the image.
- if (src_height < 0) {
- src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
- src_stride = -src_stride;
- }
-
- // Use specialized scales to improve performance for common resolutions.
- // For example, all the 1/2 scalings will use ScalePlaneDown2()
- if (dst_width == src_width && dst_height == src_height) {
- // Straight copy.
- CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
- return;
- }
- if (dst_width == src_width) {
- int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
- ScalePlaneVertical(src_height,
- dst_width, dst_height,
- src_stride, dst_stride, src, dst,
- 0, 0, dy, 1, filtering);
- return;
- }
- if (dst_width <= Abs(src_width) && dst_height <= src_height) {
- // Scale down.
- if (4 * dst_width == 3 * src_width &&
- 4 * dst_height == 3 * src_height) {
- // optimized, 3/4
- ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- if (2 * dst_width == src_width && 2 * dst_height == src_height) {
- // optimized, 1/2
- ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- // 3/8 rounded up for odd sized chroma height.
- if (8 * dst_width == 3 * src_width &&
- dst_height == ((src_height * 3 + 7) / 8)) {
- // optimized, 3/8
- ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- if (4 * dst_width == src_width && 4 * dst_height == src_height &&
- filtering != kFilterBilinear) {
- // optimized, 1/4
- ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- }
- if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
- return;
- }
- if (filtering && dst_height > src_height) {
- ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- if (filtering) {
- ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
-}
-
-LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
- int src_width, int src_height,
- uint16* dst, int dst_stride,
- int dst_width, int dst_height,
- enum FilterMode filtering) {
- // Simplify filtering when possible.
- filtering = ScaleFilterReduce(src_width, src_height,
- dst_width, dst_height,
- filtering);
-
- // Negative height means invert the image.
- if (src_height < 0) {
- src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
- src_stride = -src_stride;
- }
-
- // Use specialized scales to improve performance for common resolutions.
- // For example, all the 1/2 scalings will use ScalePlaneDown2()
- if (dst_width == src_width && dst_height == src_height) {
- // Straight copy.
- CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
- return;
- }
- if (dst_width == src_width) {
- int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
- ScalePlaneVertical_16(src_height,
- dst_width, dst_height,
- src_stride, dst_stride, src, dst,
- 0, 0, dy, 1, filtering);
- return;
- }
- if (dst_width <= Abs(src_width) && dst_height <= src_height) {
- // Scale down.
- if (4 * dst_width == 3 * src_width &&
- 4 * dst_height == 3 * src_height) {
- // optimized, 3/4
- ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- if (2 * dst_width == src_width && 2 * dst_height == src_height) {
- // optimized, 1/2
- ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- // 3/8 rounded up for odd sized chroma height.
- if (8 * dst_width == 3 * src_width &&
- dst_height == ((src_height * 3 + 7) / 8)) {
- // optimized, 3/8
- ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- if (4 * dst_width == src_width && 4 * dst_height == src_height &&
- filtering != kFilterBilinear) {
- // optimized, 1/4
- ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- }
- if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
- return;
- }
- if (filtering && dst_height > src_height) {
- ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- if (filtering) {
- ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
-}
-
-// Scale an I420 image.
-// This function in turn calls a scaling function for each plane.
-
-LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int dst_width, int dst_height,
- enum FilterMode filtering) {
- int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
- int src_halfheight = SUBSAMPLE(src_height, 1, 1);
- int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
- int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
- return -1;
- }
-
- ScalePlane(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering);
- return 0;
-}
-
-LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
- const uint16* src_u, int src_stride_u,
- const uint16* src_v, int src_stride_v,
- int src_width, int src_height,
- uint16* dst_y, int dst_stride_y,
- uint16* dst_u, int dst_stride_u,
- uint16* dst_v, int dst_stride_v,
- int dst_width, int dst_height,
- enum FilterMode filtering) {
- int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
- int src_halfheight = SUBSAMPLE(src_height, 1, 1);
- int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
- int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
- return -1;
- }
-
- ScalePlane_16(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering);
- ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering);
- ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering);
- return 0;
-}
-
-// Deprecated api
-LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
- int src_stride_y, int src_stride_u, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, uint8* dst_u, uint8* dst_v,
- int dst_stride_y, int dst_stride_u, int dst_stride_v,
- int dst_width, int dst_height,
- LIBYUV_BOOL interpolate) {
- return I420Scale(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- src_width, src_height,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- dst_width, dst_height,
- interpolate ? kFilterBox : kFilterNone);
-}
-
-// Deprecated api
-LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int dst_yoffset,
- LIBYUV_BOOL interpolate) {
- // Chroma requires offset to multiple of 2.
- int dst_yoffset_even = dst_yoffset & ~1;
- int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
- int src_halfheight = SUBSAMPLE(src_height, 1, 1);
- int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
- int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- int aheight = dst_height - dst_yoffset_even * 2; // actual output height
- const uint8* src_y = src;
- const uint8* src_u = src + src_width * src_height;
- const uint8* src_v = src + src_width * src_height +
- src_halfwidth * src_halfheight;
- uint8* dst_y = dst + dst_yoffset_even * dst_width;
- uint8* dst_u = dst + dst_width * dst_height +
- (dst_yoffset_even >> 1) * dst_halfwidth;
- uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
- (dst_yoffset_even >> 1) * dst_halfwidth;
- if (!src || src_width <= 0 || src_height <= 0 ||
- !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
- dst_yoffset_even >= dst_height) {
- return -1;
- }
- return I420Scale(src_y, src_width,
- src_u, src_halfwidth,
- src_v, src_halfwidth,
- src_width, src_height,
- dst_y, dst_width,
- dst_u, dst_halfwidth,
- dst_v, dst_halfwidth,
- dst_width, aheight,
- interpolate ? kFilterBox : kFilterNone);
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/scale_argb.cc b/src/main/jni/libyuv/source/scale_argb.cc
deleted file mode 100644
index e339cd7c7..000000000
--- a/src/main/jni/libyuv/source/scale_argb.cc
+++ /dev/null
@@ -1,809 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h" // For CopyARGB
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
-// ScaleARGB ARGB, 1/2
-// This is an optimized version for scaling down a ARGB to 1/2 of
-// its original size.
-static void ScaleARGBDown2(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
- enum FilterMode filtering) {
- int j;
- int row_stride = src_stride * (dy >> 16);
- void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) =
- filtering == kFilterNone ? ScaleARGBRowDown2_C :
- (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
- ScaleARGBRowDown2Box_C);
- assert(dx == 65536 * 2); // Test scale factor of 2.
- assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
- // Advance to odd row, even column.
- if (filtering == kFilterBilinear) {
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
- } else {
- src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
- }
-
-#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
- (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
- ScaleARGBRowDown2Box_SSE2);
- }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
- ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
- ScaleARGBRowDown2_NEON;
- }
-#endif
-
- if (filtering == kFilterLinear) {
- src_stride = 0;
- }
- for (j = 0; j < dst_height; ++j) {
- ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
- src_argb += row_stride;
- dst_argb += dst_stride;
- }
-}
-
-// ScaleARGB ARGB, 1/4
-// This is an optimized version for scaling down a ARGB to 1/4 of
-// its original size.
-static void ScaleARGBDown4Box(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy) {
- int j;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
- int row_stride = src_stride * (dy >> 16);
- void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
- // Advance to odd row, even column.
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
- assert(dx == 65536 * 4); // Test scale factor of 4.
- assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
-#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
- }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
- ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
- }
-#endif
- for (j = 0; j < dst_height; ++j) {
- ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
- ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
- row + kRowSize, dst_width * 2);
- ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
- src_argb += row_stride;
- dst_argb += dst_stride;
- }
- free_aligned_buffer_64(row);
-}
-
-// ScaleARGB ARGB Even
-// This is an optimized version for scaling down a ARGB to even
-// multiple of its original size.
-static void ScaleARGBDownEven(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
- enum FilterMode filtering) {
- int j;
- int col_step = dx >> 16;
- int row_stride = (dy >> 16) * src_stride;
- void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
- int src_step, uint8* dst_argb, int dst_width) =
- filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
- assert(IS_ALIGNED(src_width, 2));
- assert(IS_ALIGNED(src_height, 2));
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
- ScaleARGBRowDownEven_SSE2;
- }
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 4)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
- ScaleARGBRowDownEven_NEON;
- }
-#endif
-
- if (filtering == kFilterLinear) {
- src_stride = 0;
- }
- for (j = 0; j < dst_height; ++j) {
- ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
- src_argb += row_stride;
- dst_argb += dst_stride;
- }
-}
-
-// Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
- enum FilterMode filtering) {
- int j;
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
- (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
- int64 xlast = x + (int64)(dst_width - 1) * dx;
- int64 xl = (dx >= 0) ? x : xlast;
- int64 xr = (dx >= 0) ? xlast : x;
- int clip_src_width;
- xl = (xl >> 16) & ~3; // Left edge aligned.
- xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
- xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
- if (xr > src_width) {
- xr = src_width;
- }
- clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
- src_argb += xl * 4;
- x -= (int)(xl << 16);
-#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_SSE2;
- if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(clip_src_width, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
- if (IS_ALIGNED(clip_src_width, 4)) {
- InterpolateRow = InterpolateRow_MIPS_DSPR2;
- }
- }
-#endif
-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
- ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
- }
-#endif
- // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
- // Allocate a row of ARGB.
- {
- align_buffer_64(row, clip_src_width * 4);
-
- const int max_y = (src_height - 1) << 16;
- if (y > max_y) {
- y = max_y;
- }
- for (j = 0; j < dst_height; ++j) {
- int yi = y >> 16;
- const uint8* src = src_argb + yi * src_stride;
- if (filtering == kFilterLinear) {
- ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
- } else {
- int yf = (y >> 8) & 255;
- InterpolateRow(row, src, src_stride, clip_src_width, yf);
- ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
- }
- dst_argb += dst_stride;
- y += dy;
- if (y > max_y) {
- y = max_y;
- }
- }
- free_aligned_buffer_64(row);
- }
-}
-
-// Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
- enum FilterMode filtering) {
- int j;
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
- filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
- const int max_y = (src_height - 1) << 16;
-#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_SSE2;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(dst_width, 8)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_MIPS_DSPR2;
- }
-#endif
- if (src_width >= 32768) {
- ScaleARGBFilterCols = filtering ?
- ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
- }
-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
- if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
- ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
- }
-#endif
-#if defined(HAS_SCALEARGBCOLS_SSE2)
- if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
- ScaleARGBFilterCols = ScaleARGBCols_SSE2;
- }
-#endif
- if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_C;
-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
- }
-#endif
- }
-
- if (y > max_y) {
- y = max_y;
- }
-
- {
- int yi = y >> 16;
- const uint8* src = src_argb + yi * src_stride;
-
- // Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-
- uint8* rowptr = row;
- int rowstride = kRowSize;
- int lasty = yi;
-
- ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
- if (src_height > 1) {
- src += src_stride;
- }
- ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
-
- for (j = 0; j < dst_height; ++j) {
- yi = y >> 16;
- if (yi != lasty) {
- if (y > max_y) {
- y = max_y;
- yi = y >> 16;
- src = src_argb + yi * src_stride;
- }
- if (yi != lasty) {
- ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
- rowptr += rowstride;
- rowstride = -rowstride;
- lasty = yi;
- src += src_stride;
- }
- }
- if (filtering == kFilterLinear) {
- InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
- } else {
- int yf = (y >> 8) & 255;
- InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
- }
- dst_argb += dst_stride;
- y += dy;
- }
- free_aligned_buffer_64(row);
- }
-}
-
-#ifdef YUVSCALEUP
-// Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride_y,
- int src_stride_u,
- int src_stride_v,
- int dst_stride_argb,
- const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int x, int dx, int y, int dy,
- enum FilterMode filtering) {
- int j;
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(src_width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(src_width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(src_width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
- }
-#endif
-
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
-#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_SSE2;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(dst_width, 8)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- InterpolateRow = InterpolateRow_MIPS_DSPR2;
- }
-#endif
-
- void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
- filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
- if (src_width >= 32768) {
- ScaleARGBFilterCols = filtering ?
- ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
- }
-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
- if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
- ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
- }
-#endif
-#if defined(HAS_SCALEARGBCOLS_SSE2)
- if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
- ScaleARGBFilterCols = ScaleARGBCols_SSE2;
- }
-#endif
- if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_C;
-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
- }
-#endif
- }
-
- const int max_y = (src_height - 1) << 16;
- if (y > max_y) {
- y = max_y;
- }
- const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
- int yi = y >> 16;
- int uv_yi = yi >> kYShift;
- const uint8* src_row_y = src_y + yi * src_stride_y;
- const uint8* src_row_u = src_u + uv_yi * src_stride_u;
- const uint8* src_row_v = src_v + uv_yi * src_stride_v;
-
- // Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
-
- // Allocate 1 row of ARGB for source conversion.
- align_buffer_64(argb_row, src_width * 4);
-
- uint8* rowptr = row;
- int rowstride = kRowSize;
- int lasty = yi;
-
- // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
- ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
- if (src_height > 1) {
- src_row_y += src_stride_y;
- if (yi & 1) {
- src_row_u += src_stride_u;
- src_row_v += src_stride_v;
- }
- }
- ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
- if (src_height > 2) {
- src_row_y += src_stride_y;
- if (!(yi & 1)) {
- src_row_u += src_stride_u;
- src_row_v += src_stride_v;
- }
- }
-
- for (j = 0; j < dst_height; ++j) {
- yi = y >> 16;
- if (yi != lasty) {
- if (y > max_y) {
- y = max_y;
- yi = y >> 16;
- uv_yi = yi >> kYShift;
- src_row_y = src_y + yi * src_stride_y;
- src_row_u = src_u + uv_yi * src_stride_u;
- src_row_v = src_v + uv_yi * src_stride_v;
- }
- if (yi != lasty) {
- // TODO(fbarchard): Convert the clipped region of row.
- I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
- ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
- rowptr += rowstride;
- rowstride = -rowstride;
- lasty = yi;
- src_row_y += src_stride_y;
- if (yi & 1) {
- src_row_u += src_stride_u;
- src_row_v += src_stride_v;
- }
- }
- }
- if (filtering == kFilterLinear) {
- InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
- } else {
- int yf = (y >> 8) & 255;
- InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
- }
- dst_argb += dst_stride_argb;
- y += dy;
- }
- free_aligned_buffer_64(row);
- free_aligned_buffer_64(row_argb);
-}
-#endif
-
-// Scale ARGB to/from any dimensions, without interpolation.
-// Fixed point math is used for performance: The upper 16 bits
-// of x and dx is the integer part of the source position and
-// the lower 16 bits are the fixed decimal part.
-
-static void ScaleARGBSimple(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy) {
- int j;
- void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
- (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
-#if defined(HAS_SCALEARGBCOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
- ScaleARGBCols = ScaleARGBCols_SSE2;
- }
-#endif
- if (src_width * 2 == dst_width && x < 0x8000) {
- ScaleARGBCols = ScaleARGBColsUp2_C;
-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBCols = ScaleARGBColsUp2_SSE2;
- }
-#endif
- }
-
- for (j = 0; j < dst_height; ++j) {
- ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
- dst_width, x, dx);
- dst_argb += dst_stride;
- y += dy;
- }
-}
-
-// ScaleARGB a ARGB.
-// This function in turn calls a scaling function
-// suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
- enum FilterMode filtering) {
- // Initial source x/y coordinate and step values as 16.16 fixed point.
- int x = 0;
- int y = 0;
- int dx = 0;
- int dy = 0;
- // ARGB does not support box filter yet, but allow the user to pass it.
- // Simplify filtering when possible.
- filtering = ScaleFilterReduce(src_width, src_height,
- dst_width, dst_height,
- filtering);
-
- // Negative src_height means invert the image.
- if (src_height < 0) {
- src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
- src_stride = -src_stride;
- }
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
- src_width = Abs(src_width);
- if (clip_x) {
- int64 clipf = (int64)(clip_x) * dx;
- x += (clipf & 0xffff);
- src += (clipf >> 16) * 4;
- dst += clip_x * 4;
- }
- if (clip_y) {
- int64 clipf = (int64)(clip_y) * dy;
- y += (clipf & 0xffff);
- src += (clipf >> 16) * src_stride;
- dst += clip_y * dst_stride;
- }
-
- // Special case for integer step values.
- if (((dx | dy) & 0xffff) == 0) {
- if (!dx || !dy) { // 1 pixel wide and/or tall.
- filtering = kFilterNone;
- } else {
- // Optimized even scale down. ie 2, 4, 6, 8, 10x.
- if (!(dx & 0x10000) && !(dy & 0x10000)) {
- if (dx == 0x20000) {
- // Optimized 1/2 downsample.
- ScaleARGBDown2(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
- return;
- }
- if (dx == 0x40000 && filtering == kFilterBox) {
- // Optimized 1/4 box downsample.
- ScaleARGBDown4Box(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy);
- return;
- }
- ScaleARGBDownEven(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
- return;
- }
- // Optimized odd scale down. ie 3, 5, 7, 9x.
- if ((dx & 0x10000) && (dy & 0x10000)) {
- filtering = kFilterNone;
- if (dx == 0x10000 && dy == 0x10000) {
- // Straight copy.
- ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
- dst, dst_stride, clip_width, clip_height);
- return;
- }
- }
- }
- }
- if (dx == 0x10000 && (x & 0xffff) == 0) {
- // Arbitrary scale vertically, but unscaled vertically.
- ScalePlaneVertical(src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, y, dy, 4, filtering);
- return;
- }
- if (filtering && dy < 65536) {
- ScaleARGBBilinearUp(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
- return;
- }
- if (filtering) {
- ScaleARGBBilinearDown(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
- return;
- }
- ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy);
-}
-
-LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
- enum FilterMode filtering) {
- if (!src_argb || src_width == 0 || src_height == 0 ||
- !dst_argb || dst_width <= 0 || dst_height <= 0 ||
- clip_x < 0 || clip_y < 0 ||
- (clip_x + clip_width) > dst_width ||
- (clip_y + clip_height) > dst_height) {
- return -1;
- }
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
- dst_argb, dst_stride_argb, dst_width, dst_height,
- clip_x, clip_y, clip_width, clip_height, filtering);
- return 0;
-}
-
-// Scale an ARGB image.
-LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
- enum FilterMode filtering) {
- if (!src_argb || src_width == 0 || src_height == 0 ||
- !dst_argb || dst_width <= 0 || dst_height <= 0) {
- return -1;
- }
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
- dst_argb, dst_stride_argb, dst_width, dst_height,
- 0, 0, dst_width, dst_height, filtering);
- return 0;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/scale_common.cc b/src/main/jni/libyuv/source/scale_common.cc
deleted file mode 100644
index e4b2acc41..000000000
--- a/src/main/jni/libyuv/source/scale_common.cc
+++ /dev/null
@@ -1,1165 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h" // For CopyARGB
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
-// CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src_ptr[1];
- dst[1] = src_ptr[3];
- dst += 2;
- src_ptr += 4;
- }
- if (dst_width & 1) {
- dst[0] = src_ptr[1];
- }
-}
-
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src_ptr[1];
- dst[1] = src_ptr[3];
- dst += 2;
- src_ptr += 4;
- }
- if (dst_width & 1) {
- dst[0] = src_ptr[1];
- }
-}
-
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- const uint8* s = src_ptr;
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = (s[0] + s[1] + 1) >> 1;
- dst[1] = (s[2] + s[3] + 1) >> 1;
- dst += 2;
- s += 4;
- }
- if (dst_width & 1) {
- dst[0] = (s[0] + s[1] + 1) >> 1;
- }
-}
-
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
- const uint16* s = src_ptr;
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = (s[0] + s[1] + 1) >> 1;
- dst[1] = (s[2] + s[3] + 1) >> 1;
- dst += 2;
- s += 4;
- }
- if (dst_width & 1) {
- dst[0] = (s[0] + s[1] + 1) >> 1;
- }
-}
-
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
- dst += 2;
- s += 4;
- t += 4;
- }
- if (dst_width & 1) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- }
-}
-
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
- const uint16* s = src_ptr;
- const uint16* t = src_ptr + src_stride;
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
- dst += 2;
- s += 4;
- t += 4;
- }
- if (dst_width & 1) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- }
-}
-
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src_ptr[2];
- dst[1] = src_ptr[6];
- dst += 2;
- src_ptr += 8;
- }
- if (dst_width & 1) {
- dst[0] = src_ptr[2];
- }
-}
-
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src_ptr[2];
- dst[1] = src_ptr[6];
- dst += 2;
- src_ptr += 8;
- }
- if (dst_width & 1) {
- dst[0] = src_ptr[2];
- }
-}
-
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- intptr_t stride = src_stride;
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 4] + src_ptr[stride + 5] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
- src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
- src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
- 8) >> 4;
- dst += 2;
- src_ptr += 8;
- }
- if (dst_width & 1) {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- }
-}
-
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
- intptr_t stride = src_stride;
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 4] + src_ptr[stride + 5] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
- src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
- src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
- 8) >> 4;
- dst += 2;
- src_ptr += 8;
- }
- if (dst_width & 1) {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- }
-}
-
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- int x;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (x = 0; x < dst_width; x += 3) {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[1];
- dst[2] = src_ptr[3];
- dst += 3;
- src_ptr += 4;
- }
-}
-
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
- int x;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (x = 0; x < dst_width; x += 3) {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[1];
- dst[2] = src_ptr[3];
- dst += 3;
- src_ptr += 4;
- }
-}
-
-// Filter rows 0 and 1 together, 3 : 1
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- int x;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (x = 0; x < dst_width; x += 3) {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 * 3 + b0 + 2) >> 2;
- d[1] = (a1 * 3 + b1 + 2) >> 2;
- d[2] = (a2 * 3 + b2 + 2) >> 2;
- d += 3;
- s += 4;
- t += 4;
- }
-}
-
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* d, int dst_width) {
- const uint16* s = src_ptr;
- const uint16* t = src_ptr + src_stride;
- int x;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (x = 0; x < dst_width; x += 3) {
- uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 * 3 + b0 + 2) >> 2;
- d[1] = (a1 * 3 + b1 + 2) >> 2;
- d[2] = (a2 * 3 + b2 + 2) >> 2;
- d += 3;
- s += 4;
- t += 4;
- }
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- int x;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (x = 0; x < dst_width; x += 3) {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 + b0 + 1) >> 1;
- d[1] = (a1 + b1 + 1) >> 1;
- d[2] = (a2 + b2 + 1) >> 1;
- d += 3;
- s += 4;
- t += 4;
- }
-}
-
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* d, int dst_width) {
- const uint16* s = src_ptr;
- const uint16* t = src_ptr + src_stride;
- int x;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (x = 0; x < dst_width; x += 3) {
- uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 + b0 + 1) >> 1;
- d[1] = (a1 + b1 + 1) >> 1;
- d[2] = (a2 + b2 + 1) >> 1;
- d += 3;
- s += 4;
- t += 4;
- }
-}
-
-// Scales a single row of pixels using point sampling.
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- dst_ptr[0] = src_ptr[x >> 16];
- x += dx;
- dst_ptr[1] = src_ptr[x >> 16];
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- dst_ptr[0] = src_ptr[x >> 16];
- }
-}
-
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) {
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- dst_ptr[0] = src_ptr[x >> 16];
- x += dx;
- dst_ptr[1] = src_ptr[x >> 16];
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- dst_ptr[0] = src_ptr[x >> 16];
- }
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- dst_ptr[1] = dst_ptr[0] = src_ptr[0];
- src_ptr += 1;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- dst_ptr[0] = src_ptr[0];
- }
-}
-
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) {
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- dst_ptr[1] = dst_ptr[0] = src_ptr[0];
- src_ptr += 1;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- dst_ptr[0] = src_ptr[0];
- }
-}
-
-// (1-f)a + fb can be replaced with a + f(b-a)
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
- ((int)(f) * ((int)(b) - (int)(a)) >> 16))
-
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- x += dx;
- xi = x >> 16;
- a = src_ptr[xi];
- b = src_ptr[xi + 1];
- dst_ptr[1] = BLENDER(a, b, x & 0xffff);
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- }
-}
-
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x32, int dx) {
- int64 x = (int64)(x32);
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- int64 xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- x += dx;
- xi = x >> 16;
- a = src_ptr[xi];
- b = src_ptr[xi + 1];
- dst_ptr[1] = BLENDER(a, b, x & 0xffff);
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- int64 xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- }
-}
-#undef BLENDER
-
-#define BLENDER(a, b, f) (uint16)((int)(a) + \
- ((int)(f) * ((int)(b) - (int)(a)) >> 16))
-
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) {
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- x += dx;
- xi = x >> 16;
- a = src_ptr[xi];
- b = src_ptr[xi + 1];
- dst_ptr[1] = BLENDER(a, b, x & 0xffff);
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- }
-}
-
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x32, int dx) {
- int64 x = (int64)(x32);
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- int64 xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- x += dx;
- xi = x >> 16;
- a = src_ptr[xi];
- b = src_ptr[xi + 1];
- dst_ptr[1] = BLENDER(a, b, x & 0xffff);
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- int64 xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- }
-}
-#undef BLENDER
-
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- int x;
- assert(dst_width % 3 == 0);
- for (x = 0; x < dst_width; x += 3) {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[3];
- dst[2] = src_ptr[6];
- dst += 3;
- src_ptr += 8;
- }
-}
-
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
- int x;
- assert(dst_width % 3 == 0);
- for (x = 0; x < dst_width; x += 3) {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[3];
- dst[2] = src_ptr[6];
- dst += 3;
- src_ptr += 8;
- }
-}
-
-// 8x3 -> 3x1
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stride = src_stride;
- int i;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
- src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
- ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) {
- intptr_t stride = src_stride;
- int i;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
- src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-// 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stride = src_stride;
- int i;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2]) * (65536 / 6) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5]) * (65536 / 6) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) {
- intptr_t stride = src_stride;
- int i;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2]) * (65536 / 6) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5]) * (65536 / 6) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
- int x;
- assert(src_width > 0);
- assert(src_height > 0);
- for (x = 0; x < src_width; ++x) {
- const uint8* s = src_ptr + x;
- unsigned int sum = 0u;
- int y;
- for (y = 0; y < src_height; ++y) {
- sum += s[0];
- s += src_stride;
- }
- // TODO(fbarchard): Consider limitting height to 256 to avoid overflow.
- dst_ptr[x] = sum < 65535u ? sum : 65535u;
- }
-}
-
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint32* dst_ptr, int src_width, int src_height) {
- int x;
- assert(src_width > 0);
- assert(src_height > 0);
- for (x = 0; x < src_width; ++x) {
- const uint16* s = src_ptr + x;
- unsigned int sum = 0u;
- int y;
- for (y = 0; y < src_height; ++y) {
- sum += s[0];
- s += src_stride;
- }
- // No risk of overflow here now
- dst_ptr[x] = sum;
- }
-}
-
-void ScaleARGBRowDown2_C(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
-
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src[1];
- dst[1] = src[3];
- src += 4;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[1];
- }
-}
-
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
- dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
- dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
- dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
- src_argb += 8;
- dst_argb += 4;
- }
-}
-
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- dst_argb[0] = (src_argb[0] + src_argb[4] +
- src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
- dst_argb[1] = (src_argb[1] + src_argb[5] +
- src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
- dst_argb[2] = (src_argb[2] + src_argb[6] +
- src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
- dst_argb[3] = (src_argb[3] + src_argb[7] +
- src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
- src_argb += 8;
- dst_argb += 4;
- }
-}
-
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
-
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src[0];
- dst[1] = src[src_stepx];
- src += src_stepx * 2;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[0];
- }
-}
-
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- dst_argb[0] = (src_argb[0] + src_argb[4] +
- src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
- dst_argb[1] = (src_argb[1] + src_argb[5] +
- src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
- dst_argb[2] = (src_argb[2] + src_argb[6] +
- src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
- dst_argb[3] = (src_argb[3] + src_argb[7] +
- src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
- src_argb += src_stepx * 4;
- dst_argb += 4;
- }
-}
-
-// Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- dst[0] = src[x >> 16];
- x += dx;
- dst[1] = src[x >> 16];
- x += dx;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[x >> 16];
- }
-}
-
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x32, int dx) {
- int64 x = (int64)(x32);
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- dst[0] = src[x >> 16];
- x += dx;
- dst[1] = src[x >> 16];
- x += dx;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[x >> 16];
- }
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- dst[1] = dst[0] = src[0];
- src += 1;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[0];
- }
-}
-
-// Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) (uint32)( \
- BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
- BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
- BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
-
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- int xi = x >> 16;
- int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, xf);
- x += dx;
- xi = x >> 16;
- xf = (x >> 9) & 0x7f;
- a = src[xi];
- b = src[xi + 1];
- dst[1] = BLENDER(a, b, xf);
- x += dx;
- dst += 2;
- }
- if (dst_width & 1) {
- int xi = x >> 16;
- int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, xf);
- }
-}
-
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x32, int dx) {
- int64 x = (int64)(x32);
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
- int j;
- for (j = 0; j < dst_width - 1; j += 2) {
- int64 xi = x >> 16;
- int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, xf);
- x += dx;
- xi = x >> 16;
- xf = (x >> 9) & 0x7f;
- a = src[xi];
- b = src[xi + 1];
- dst[1] = BLENDER(a, b, xf);
- x += dx;
- dst += 2;
- }
- if (dst_width & 1) {
- int64 xi = x >> 16;
- int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, xf);
- }
-}
-#undef BLENDER1
-#undef BLENDERC
-#undef BLENDER
-
-// Scale plane vertically with bilinear interpolation.
-void ScalePlaneVertical(int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int y, int dy,
- int bpp, enum FilterMode filtering) {
- // TODO(fbarchard): Allow higher bpp.
- int dst_width_bytes = dst_width * bpp;
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
- int j;
- assert(bpp >= 1 && bpp <= 4);
- assert(src_height != 0);
- assert(dst_width > 0);
- assert(dst_height > 0);
- src_argb += (x >> 16) * bpp;
-#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
- InterpolateRow = InterpolateRow_Any_SSE2;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(dst_width_bytes, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
- if (IS_ALIGNED(dst_width_bytes, 4)) {
- InterpolateRow = InterpolateRow_MIPS_DSPR2;
- }
- }
-#endif
- for (j = 0; j < dst_height; ++j) {
- int yi;
- int yf;
- if (y > max_y) {
- y = max_y;
- }
- yi = y >> 16;
- yf = filtering ? ((y >> 8) & 255) : 0;
- InterpolateRow(dst_argb, src_argb + yi * src_stride,
- src_stride, dst_width_bytes, yf);
- dst_argb += dst_stride;
- y += dy;
- }
-}
-void ScalePlaneVertical_16(int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_argb, uint16* dst_argb,
- int x, int y, int dy,
- int wpp, enum FilterMode filtering) {
- // TODO(fbarchard): Allow higher wpp.
- int dst_width_words = dst_width * wpp;
- void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_16_C;
- const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
- int j;
- assert(wpp >= 1 && wpp <= 2);
- assert(src_height != 0);
- assert(dst_width > 0);
- assert(dst_height > 0);
- src_argb += (x >> 16) * wpp;
-#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
- if (IS_ALIGNED(dst_width_bytes, 32)) {
- InterpolateRow = InterpolateRow_16_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_16_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
- if (IS_ALIGNED(dst_width_bytes, 4)) {
- InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
- }
- }
-#endif
- for (j = 0; j < dst_height; ++j) {
- int yi;
- int yf;
- if (y > max_y) {
- y = max_y;
- }
- yi = y >> 16;
- yf = filtering ? ((y >> 8) & 255) : 0;
- InterpolateRow(dst_argb, src_argb + yi * src_stride,
- src_stride, dst_width_words, yf);
- dst_argb += dst_stride;
- y += dy;
- }
-}
-
-// Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
- int dst_width, int dst_height,
- enum FilterMode filtering) {
- if (src_width < 0) {
- src_width = -src_width;
- }
- if (src_height < 0) {
- src_height = -src_height;
- }
- if (filtering == kFilterBox) {
- // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
- if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
- filtering = kFilterBilinear;
- }
- // If scaling to larger, switch from Box to Bilinear.
- if (dst_width >= src_width || dst_height >= src_height) {
- filtering = kFilterBilinear;
- }
- }
- if (filtering == kFilterBilinear) {
- if (src_height == 1) {
- filtering = kFilterLinear;
- }
- // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
- if (dst_height == src_height || dst_height * 3 == src_height) {
- filtering = kFilterLinear;
- }
- // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
- // avoid reading 2 pixels horizontally that causes memory exception.
- if (src_width == 1) {
- filtering = kFilterNone;
- }
- }
- if (filtering == kFilterLinear) {
- if (src_width == 1) {
- filtering = kFilterNone;
- }
- // TODO(fbarchard): Detect any odd scale factor and reduce to None.
- if (dst_width == src_width || dst_width * 3 == src_width) {
- filtering = kFilterNone;
- }
- }
- return filtering;
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_C(int num, int div) {
- return (int)(((int64)(num) << 16) / div);
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv1_C(int num, int div) {
- return (int)((((int64)(num) << 16) - 0x00010001) /
- (div - 1));
-}
-
-#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
-
-// Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
- int dst_width, int dst_height,
- enum FilterMode filtering,
- int* x, int* y, int* dx, int* dy) {
- assert(x != NULL);
- assert(y != NULL);
- assert(dx != NULL);
- assert(dy != NULL);
- assert(src_width != 0);
- assert(src_height != 0);
- assert(dst_width > 0);
- assert(dst_height > 0);
- // Check for 1 pixel and avoid FixedDiv overflow.
- if (dst_width == 1 && src_width >= 32768) {
- dst_width = src_width;
- }
- if (dst_height == 1 && src_height >= 32768) {
- dst_height = src_height;
- }
- if (filtering == kFilterBox) {
- // Scale step for point sampling duplicates all pixels equally.
- *dx = FixedDiv(Abs(src_width), dst_width);
- *dy = FixedDiv(src_height, dst_height);
- *x = 0;
- *y = 0;
- } else if (filtering == kFilterBilinear) {
- // Scale step for bilinear sampling renders last pixel once for upsample.
- if (dst_width <= Abs(src_width)) {
- *dx = FixedDiv(Abs(src_width), dst_width);
- *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
- *dx = FixedDiv1(Abs(src_width), dst_width);
- *x = 0;
- }
- if (dst_height <= src_height) {
- *dy = FixedDiv(src_height, dst_height);
- *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_height > 1) {
- *dy = FixedDiv1(src_height, dst_height);
- *y = 0;
- }
- } else if (filtering == kFilterLinear) {
- // Scale step for bilinear sampling renders last pixel once for upsample.
- if (dst_width <= Abs(src_width)) {
- *dx = FixedDiv(Abs(src_width), dst_width);
- *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
- *dx = FixedDiv1(Abs(src_width), dst_width);
- *x = 0;
- }
- *dy = FixedDiv(src_height, dst_height);
- *y = *dy >> 1;
- } else {
- // Scale step for point sampling duplicates all pixels equally.
- *dx = FixedDiv(Abs(src_width), dst_width);
- *dy = FixedDiv(src_height, dst_height);
- *x = CENTERSTART(*dx, 0);
- *y = CENTERSTART(*dy, 0);
- }
- // Negative src_width means horizontally mirror.
- if (src_width < 0) {
- *x += (dst_width - 1) * *dx;
- *dx = -*dx;
- // src_width = -src_width; // Caller must do this.
- }
-}
-#undef CENTERSTART
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/scale_mips.cc b/src/main/jni/libyuv/source/scale_mips.cc
deleted file mode 100644
index 3eb4f27c4..000000000
--- a/src/main/jni/libyuv/source/scale_mips.cc
+++ /dev/null
@@ -1,654 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
- "beqz $t9, 2f \n"
- " nop \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- // TODO(fbarchard): Use odd pixels instead of even.
- "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
- "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
- "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
- "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t8, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t1, 8(%[dst]) \n"
- "sw $t2, 12(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0xf \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t0, 0(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 2 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst)
- : [dst_width] "r" (dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
-}
-
-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- const uint8* t = src_ptr + src_stride;
-
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
- "bltz $t9, 2f \n"
- " nop \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 0(%[t]) \n" // |19|18|17|16|
- "lw $t5, 4(%[t]) \n" // |23|22|21|20|
- "lw $t6, 8(%[t]) \n" // |27|26|25|24|
- "lw $t7, 12(%[t]) \n" // |31|30|29|28|
- "addiu $t9, $t9, -1 \n"
- "srl $t8, $t0, 16 \n" // |X|X|3|2|
- "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
- "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
- "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
- "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
- "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
- "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
- "srl $t8, $t1, 16 \n" // |X|X|7|6|
- "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
- "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
- "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
- "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
- "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
- "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
- "srl $t8, $t2, 16 \n" // |X|X|11|10|
- "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
- "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
- "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
- "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
- "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
- "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
- "srl $t8, $t3, 16 \n" // |X|X|15|14|
- "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
- "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
- "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
- "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
- "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
- "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
- "addiu %[src_ptr], %[src_ptr], 16 \n"
- "addiu %[t], %[t], 16 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "sb $t1, 2(%[dst]) \n"
- "sb $t5, 3(%[dst]) \n"
- "sb $t2, 4(%[dst]) \n"
- "sb $t6, 5(%[dst]) \n"
- "sb $t3, 6(%[dst]) \n"
- "sb $t7, 7(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0x7 \n" // x = residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lwr $t1, 0(%[src_ptr]) \n"
- "lwl $t1, 3(%[src_ptr]) \n"
- "lwr $t2, 0(%[t]) \n"
- "lwl $t2, 3(%[t]) \n"
- "srl $t8, $t1, 16 \n"
- "ins $t1, $t2, 16, 16 \n"
- "ins $t2, $t8, 0, 16 \n"
- "raddu.w.qb $t1, $t1 \n"
- "raddu.w.qb $t2, $t2 \n"
- "shra_r.w $t1, $t1, 2 \n"
- "shra_r.w $t2, $t2, 2 \n"
- "sb $t1, 0(%[dst]) \n"
- "sb $t2, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -2 \n"
- "addiu %[t], %[t], 4 \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 2 \n"
-
- "3: \n"
- ".set pop \n"
-
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst), [t] "+r" (t)
- : [dst_width] "r" (dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
-}
-
-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n"
- "beqz $t9, 2f \n"
- " nop \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
- "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
- "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
- "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
- "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
- "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t5, 4(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 7 \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t1, 0(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t1, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst)
- : [dst_width] "r" (dst_width)
- : "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
-}
-
-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- const uint8* s2 = s1 + stride;
- const uint8* s3 = s2 + stride;
-
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 1 \n"
- "andi $t8, %[dst_width], 1 \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
- "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
- "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
- "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
- "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
- "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "add $t4, $t4, $t5 \n"
- "add $t6, $t6, $t7 \n"
- "add $t4, $t4, $t6 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "shra_r.w $t4, $t4, 4 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[s3], %[s3], 8 \n"
- "addiu $t9, $t9, -1 \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 2 \n"
- "beqz $t8, 2f \n"
- " nop \n"
-
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "sb $t0, 0(%[dst]) \n"
-
- "2: \n"
- ".set pop \n"
-
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst),
- [s1] "+r" (s1),
- [s2] "+r" (s2),
- [s3] "+r" (s3)
- : [dst_width] "r" (dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6","t7", "t8", "t9"
- );
-}
-
-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- ".p2align 2 \n"
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
- "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
- "addiu %[dst_width], %[dst_width], -24 \n"
- "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
- "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
- "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
- "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
- "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
- "prepend $t1, $t2, 8 \n" // |4|3|1|0|
- "prepend $t3, $t4, 24 \n" // |15|13|12|11|
- "prepend $t5, $t6, 8 \n" // |20|19|17|16|
- "prepend $t7, $t8, 24 \n" // |31|29|28|27|
- "sw $t1, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t3, 8(%[dst]) \n"
- "sw $t5, 12(%[dst]) \n"
- "sw $t9, 16(%[dst]) \n"
- "sw $t7, 20(%[dst]) \n"
- "bnez %[dst_width], 1b \n"
- " addiu %[dst], %[dst], 24 \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst),
- [dst_width] "+r" (dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6","t7", "t8", "t9"
- );
-}
-
-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t3, 3 \n" // 0x00030003
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
- "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t2, $t2, $t4 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "sll $t5, $t0, 1 \n"
- "add $t0, $t5, $t0 \n"
- "shra_r.ph $t2, $t2, 2 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shll.ph $t4, $t2, 1 \n"
- "addq.ph $t4, $t4, $t2 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.w $t0, $t0, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "srl $t1, $t6, 16 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [src_stride] "+r" (src_stride),
- [d] "+r" (d),
- [dst_width] "+r" (dst_width)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6"
- );
-}
-
-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t2, 3 \n" // 0x00030003
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
- "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t4, $t4, $t3 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shra_r.ph $t4, $t4, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.ph $t6, $t6, 1 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "srl $t1, $t6, 16 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [src_stride] "+r" (src_stride),
- [d] "+r" (d),
- [dst_width] "+r" (dst_width)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6"
- );
-}
-
-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t6, $t6 \n" // |26|27|24|25|
- "srl $t0, $t0, 8 \n" // |X|2|3|0|
- "srl $t3, $t3, 16 \n" // |X|X|15|14|
- "srl $t5, $t5, 16 \n" // |X|X|23|22|
- "srl $t7, $t7, 16 \n" // |X|X|31|30|
- "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
- "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
- "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
- "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
- "prepend $t2, $t3, 24 \n" // |X|15|14|11|
- "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
- "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu %[dst_width], %[dst_width], -12 \n"
- "addiu $t8,%[dst_width], -12 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t4, 4(%[dst]) \n"
- "sw $t6, 8(%[dst]) \n"
- "bgez $t8, 1b \n"
- " addiu %[dst], %[dst], 12 \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst),
- [dst_width] "+r" (dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4",
- "t5", "t6", "t7", "t8"
- );
-}
-
-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stride = src_stride;
- const uint8* t = src_ptr + stride;
- const int c = 0x2AAA;
-
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
- "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
- "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
- "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
- "srl $t4, $t4, 2 \n" // t4 / 4
- "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
- "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
- "addu $t6, $t5, $t6 \n"
- "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
- "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
- "addu $t0, $t0, $t2 \n"
- "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[t], %[t], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t4, -1(%[dst_ptr]) \n"
- "sb $t6, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst_ptr] "+r" (dst_ptr),
- [t] "+r" (t),
- [dst_width] "+r" (dst_width)
- : [c] "r" (c)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
- );
-}
-
-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- stride += stride;
- const uint8* s2 = src_ptr + stride;
- const int c1 = 0x1C71;
- const int c2 = 0x2AAA;
-
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- ".p2align 2 \n"
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
- "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
- "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
- "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
- "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
- "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
- "raddu.w.qb $t8, $t8 \n" // R5+R4
- "addu $t7, $t7, $t8 \n"
- "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
- "raddu.w.qb $t8, $t8 \n" // R7 + R6
- "addu $t6, $t6, $t8 \n"
- "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
- "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
- "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
- "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
- "addu $t7, $t7, $t8 \n"
- "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t2, $t2 \n"
- "raddu.w.qb $t4, $t4 \n"
- "addu $t0, $t0, $t2 \n"
- "addu $t0, $t0, $t4 \n"
- "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t7, $t7, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t6, -1(%[dst_ptr]) \n"
- "sb $t7, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst_ptr] "+r" (dst_ptr),
- [s1] "+r" (s1),
- [s2] "+r" (s2),
- [dst_width] "+r" (dst_width)
- : [c1] "r" (c1), [c2] "r" (c2)
- : "t0", "t1", "t2", "t3", "t4",
- "t5", "t6", "t7", "t8"
- );
-}
-
-#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
diff --git a/src/main/jni/libyuv/source/scale_neon.cc b/src/main/jni/libyuv/source/scale_neon.cc
deleted file mode 100644
index 1b8a5ba58..000000000
--- a/src/main/jni/libyuv/source/scale_neon.cc
+++ /dev/null
@@ -1,764 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-// NEON downscalers with interpolation.
-// Provided by Fritz Koenig
-
-// Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- // load even pixels into q0, odd into q1
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
-
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %0 \n"
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- MEMACCESS(1)
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1", "memory", "cc"
- );
-}
-
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride;
- const uint8* src_ptr2 = src_ptr + src_stride * 2;
- const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- MEMACCESS(3)
- "vld1.8 {q1}, [%3]! \n"
- MEMACCESS(4)
- "vld1.8 {q2}, [%4]! \n"
- MEMACCESS(5)
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- MEMACCESS(1)
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_ptr1), // %3
- "+r"(src_ptr2), // %4
- "+r"(src_ptr3) // %5
- :
- : "q0", "q1", "q2", "q3", "memory", "cc"
- );
-}
-
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "d0", "d1", "d2", "d3", "memory", "cc"
- );
-}
-
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
-
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
-
- // 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
-
- // (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
-
- MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
-
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
- );
-}
-
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
- // average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
-
- MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
- );
-}
-
-#define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
- { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
- { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
- { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
- { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
-
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {q3}, [%3] \n"
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n"
- MEMACCESS(1)
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
- );
-}
-
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride * 2;
-
- asm volatile (
- MEMACCESS(5)
- "vld1.16 {q13}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {q14}, [%6] \n"
- MEMACCESS(7)
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- MEMACCESS(4)
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
-
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
-
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q15 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
- MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride), // %3
- "+r"(src_ptr1) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
- );
-}
-
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.16 {q13}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
-
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
-
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q13 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
- MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
- );
-}
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- asm volatile (
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
-
- "99: \n"
- MEMACCESS(0)
- "vst1.8 {d1[7]}, [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
- :
- : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
- );
-}
-
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- // load even pixels into q0, odd into q1
- MEMACCESS(0)
- "vld2.32 {q0, q1}, [%0]! \n"
- MEMACCESS(0)
- "vld2.32 {q2, q3}, [%0]! \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- MEMACCESS(1)
- "vst1.8 {q3}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
- );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx, uint8* dst_argb, int dst_width) {
- asm volatile (
- "mov r12, %3, lsl #2 \n"
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.32 {d0[0]}, [%0], r12 \n"
- MEMACCESS(0)
- "vld1.32 {d0[1]}, [%0], r12 \n"
- MEMACCESS(0)
- "vld1.32 {d1[0]}, [%0], r12 \n"
- MEMACCESS(0)
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"(src_stepx) // %3
- : "memory", "cc", "r12", "q0"
- );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
- MEMACCESS(1)
- "vld1.8 {d1}, [%1], r12 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0], r12 \n"
- MEMACCESS(1)
- "vld1.8 {d3}, [%1], r12 \n"
- MEMACCESS(0)
- "vld1.8 {d4}, [%0], r12 \n"
- MEMACCESS(1)
- "vld1.8 {d5}, [%1], r12 \n"
- MEMACCESS(0)
- "vld1.8 {d6}, [%0], r12 \n"
- MEMACCESS(1)
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"(src_stepx) // %4
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
- );
-}
-
-#endif // __ARM_NEON__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/scale_neon64.cc b/src/main/jni/libyuv/source/scale_neon64.cc
deleted file mode 100644
index 44df55c6c..000000000
--- a/src/main/jni/libyuv/source/scale_neon64.cc
+++ /dev/null
@@ -1,789 +0,0 @@
-/*
- * Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-#ifdef HAS_SCALEROWDOWN2_NEON
-// Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- "1: \n"
- // load even pixels into v0, odd into v1
- MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1" // Clobber List
- );
-}
-#endif //HAS_SCALEROWDOWN2_NEON
-
-#ifdef HAS_SCALEROWDOWN2_NEON
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- MEMACCESS(1)
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif //HAS_SCALEROWDOWN2_NEON
-
-#ifdef HAS_SCALEROWDOWN4_NEON
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-3.8b}, [%0], #32 \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "st1 {v2.8b}, [%1], #8 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
-}
-#endif //HAS_SCALEROWDOWN4_NEON
-
-#ifdef HAS_SCALEROWDOWN4_NEON
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride;
- const uint8* src_ptr2 = src_ptr + src_stride * 2;
- const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- MEMACCESS(3)
- "ld1 {v1.16b}, [%3], #16 \n"
- MEMACCESS(4)
- "ld1 {v2.16b}, [%4], #16 \n"
- MEMACCESS(5)
- "ld1 {v3.16b}, [%5], #16 \n"
- "subs %2, %2, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- MEMACCESS(1)
- "st1 {v0.s}[0], [%1], #4 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_ptr1), // %3
- "+r"(src_ptr2), // %4
- "+r"(src_ptr3) // %5
- :
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
-}
-#endif //HAS_SCALEROWDOWN4_NEON
-
-#ifdef HAS_SCALEROWDOWN34_NEON
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
- "subs %2, %2, #24 \n"
- "mov v2.8b, v3.8b \n" // order v0, v1, v2
- MEMACCESS(1)
- "st3 {v0.8b-v2.8b}, [%1], #24 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
-}
-#endif //HAS_SCALEROWDOWN34_NEON
-
-#ifdef HAS_SCALEROWDOWN34_NEON
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
- MEMACCESS(3)
- "ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1
- "subs %2, %2, #24 \n"
-
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
-
- // 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
-
- // (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
-
- MEMACCESS(1)
- "st3 {v0.8b-v2.8b}, [%1], #24 \n"
-
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
- "v20", "memory", "cc"
- );
-}
-#endif //ScaleRowDown34_0_Box_NEON
-
-#ifdef HAS_SCALEROWDOWN34_NEON
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
- MEMACCESS(3)
- "ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1
- "subs %2, %2, #24 \n"
- // average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
-
- MEMACCESS(1)
- "st3 {v0.8b-v2.8b}, [%1], #24 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
- );
-}
-#endif //HAS_SCALEROWDOWN34_NEON
-
-#ifdef HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
- { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
- { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
- { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
- { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
-
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- MEMACCESS(3)
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "subs %2, %2, #12 \n"
- "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
- MEMACCESS(1)
- "st1 {v2.8b}, [%1], #8 \n"
- MEMACCESS(1)
- "st1 {v2.s}[2], [%1], #4 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
-}
-
-#endif //HAS_SCALEROWDOWN38_NEON
-
-#ifdef HAS_SCALEROWDOWN38_NEON
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride * 2;
-
- asm volatile (
- MEMACCESS(5)
- "ld1 {v29.8h}, [%5] \n"
- MEMACCESS(6)
- "ld1 {v30.16b}, [%6] \n"
- MEMACCESS(7)
- "ld1 {v31.8h}, [%7] \n"
- "add %3, %3, %0 \n"
- "1: \n"
-
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n"
- MEMACCESS(3)
- "ld4 {v4.8b-v7.8b}, [%3], #32 \n"
- MEMACCESS(4)
- "ld4 {v16.8b-v19.8b}, [%4], #32 \n"
- "subs %2, %2, #12 \n"
-
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
-
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
-
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
-
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
-
- // combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
-
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
-
- // combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
-
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
-
- // 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
-
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
-
- MEMACCESS(1)
- "st1 {v3.8b}, [%1], #8 \n"
- MEMACCESS(1)
- "st1 {v3.s}[2], [%1], #4 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride), // %3
- "+r"(src_ptr1) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
- "v30", "v31", "memory", "cc"
- );
-}
-#endif //HAS_SCALEROWDOWN38_NEON
-
-#ifdef HAS_SCALEROWDOWN38_NEON
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- MEMACCESS(4)
- "ld1 {v30.8h}, [%4] \n"
- MEMACCESS(5)
- "ld1 {v31.16b}, [%5] \n"
- "add %3, %3, %0 \n"
- "1: \n"
-
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n"
- MEMACCESS(3)
- "ld4 {v4.8b-v7.8b}, [%3], #32 \n"
- "subs %2, %2, #12 \n"
-
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
-
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
-
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
-
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
-
- // combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
-
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-
- // combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
-
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
-
- // 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
-
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
-
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
-
- MEMACCESS(1)
- "st1 {v3.8b}, [%1], #8 \n"
- MEMACCESS(1)
- "st1 {v3.s}[2], [%1], #4 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
- "v18", "v19", "v30", "v31", "memory", "cc"
- );
-}
-#endif //HAS_SCALEROWDOWN38_NEON
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- int y_fraction = 256 - source_y_fraction;
- asm volatile (
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %3, %3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %3, %3, #16 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "bgt 100b \n"
-
- "99: \n"
- MEMACCESS(0)
- "st1 {v0.b}[15], [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction),// %4
- "+r"(y_fraction) // %5
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
- );
-}
-
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- "1: \n"
- // load even pixels into q0, odd into q1
- MEMACCESS (0)
- "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
- MEMACCESS (0)
- "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS (1)
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- MEMACCESS (1)
- "st1 {v3.16b}, [%1], #16 \n"
- "bgt 1b \n"
- : "+r" (src_ptr), // %0
- "+r" (dst), // %1
- "+r" (dst_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS (0)
- "ld4 {v0.16b - v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- MEMACCESS (1)
- "ld4 {v16.16b - v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- MEMACCESS (2)
- "st4 {v0.8b - v3.8b}, [%2], #32 \n"
- "bgt 1b \n"
- : "+r" (src_ptr), // %0
- "+r" (src_stride), // %1
- "+r" (dst), // %2
- "+r" (dst_width) // %3
- :
- : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
- );
-}
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx, uint8* dst_argb, int dst_width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.s}[0], [%0], %3 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[1], [%0], %3 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[2], [%0], %3 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"(src_stepx * 4) // %3
- : "memory", "cc", "v0"
- );
-}
-#endif //HAS_SCALEARGBROWDOWNEVEN_NEON
-
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-// TODO, might be worth another optimization pass in future.
-// It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1], %4 \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0], %4 \n"
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1], %4 \n"
- MEMACCESS(0)
- "ld1 {v4.8b}, [%0], %4 \n"
- MEMACCESS(1)
- "ld1 {v5.8b}, [%1], %4 \n"
- MEMACCESS(0)
- "ld1 {v6.8b}, [%0], %4 \n"
- MEMACCESS(1)
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"(src_stepx * 4) // %4
- : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
-}
-#endif // HAS_SCALEARGBROWDOWNEVEN_NEON
-#endif // __aarch64__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/scale_posix.cc b/src/main/jni/libyuv/source/scale_posix.cc
deleted file mode 100644
index 352e66782..000000000
--- a/src/main/jni/libyuv/source/scale_posix.cc
+++ /dev/null
@@ -1,1315 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
-// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
- { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
- { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
- { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
- { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
- { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
- { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
- { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
- { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
- { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
- { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-static uvec8 kShuf38a =
- { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-static uvec8 kShuf38b =
- { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
- { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
- { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
- { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
- { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
- { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
- { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
- { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stridex3 = 0;
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0x8,%%xmm7 \n"
- "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
- MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
- MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pand %%xmm7,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(stridex3) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-#endif
- );
-}
-
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kShuf0), // %0
- "m"(kShuf1), // %1
- "m"(kShuf2) // %2
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
- "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS(1) " \n"
- "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
-
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS(1) " \n"
- "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
- "lea " MEMLEA(0xc,1) ",%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "m"(kShuf38a), // %3
- "m"(kShuf38b) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
- :
- : "m"(kShufAb0), // %0
- "m"(kShufAb1), // %1
- "m"(kShufAb2), // %2
- "m"(kScaleAb2) // %3
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "sub $0x6,%2 \n"
- "movd %%xmm1," MEMACCESS(1) " \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
- "lea " MEMLEA(0x6,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- :
- : "m"(kShufAc), // %0
- "m"(kShufAc3), // %1
- "m"(kScaleAc33) // %2
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "sub $0x6,%2 \n"
- "movd %%xmm6," MEMACCESS(1) " \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
- "lea " MEMLEA(0x6,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
- int tmp_height = 0;
- intptr_t tmp_src = 0;
- asm volatile (
- "pxor %%xmm4,%%xmm4 \n"
- "sub $0x1,%5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "mov %0,%3 \n"
- "add %6,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "mov %5,%2 \n"
- "test %2,%2 \n"
- "je 3f \n"
-
- LABELALIGN
- "2: \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "add %6,%0 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "sub $0x1,%2 \n"
- "jg 2b \n"
-
- LABELALIGN
- "3: \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x10,3) ",%0 \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_height), // %2
- "+r"(tmp_src), // %3
- "+r"(src_width), // %4
- "+rm"(src_height) // %5
- : "rm"((intptr_t)(src_stride)) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-
-// Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
- asm volatile (
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- BUNDLEALIGN
- MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k2 \n"
- "mov %w2," MEMACCESS(0) " \n"
- "lea " MEMLEA(0x2,0) ",%0 \n"
- "sub $0x2,%5 \n"
- "jge 2b \n"
-
- LABELALIGN
- "29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k2 \n"
- "mov %b2," MEMACCESS(0) " \n"
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+a"(temp_pixel), // %2
- "+r"(x0), // %3
- "+r"(x1), // %4
- "+rm"(dst_width) // %5
- : "rm"(x), // %6
- "rm"(dx) // %7
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "sub $0x20,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "jg 1b \n"
-
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12 = 0;
- asm volatile (
- "lea " MEMLEA3(0x00,1,4) ",%1 \n"
- "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
- LABELALIGN
- "1: \n"
- "movd " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
- "punpckldq %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
- MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
- MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
- "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width), // %3
- "+r"(src_stepx_x12) // %4
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride, int src_stepx,
- uint8* dst_argb, int dst_width) {
- intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12 = 0;
- intptr_t row1 = (intptr_t)(src_stride);
- asm volatile (
- "lea " MEMLEA3(0x00,1,4) ",%1 \n"
- "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
- "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
-
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
- MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
- BUNDLEALIGN
- MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
- "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
- "movq " MEMACCESS(5) ",%%xmm2 \n"
- BUNDLEALIGN
- MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
- MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
- MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
- "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+rm"(dst_width), // %3
- "+r"(src_stepx_x12), // %4
- "+r"(row1) // %5
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- intptr_t x0 = 0, x1 = 0;
- asm volatile (
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- LABELALIGN
- "40: \n"
- MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
- MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
- MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "sub $0x4,%4 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 40b \n"
-
- "49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- BUNDLEALIGN
- MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
- MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x8,2) ",%2 \n"
- "29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
- "movd %%xmm0," MEMACCESS(2) " \n"
- "99: \n"
- : "+a"(x0), // %0
- "+d"(x1), // %1
- "+r"(dst_argb), // %2
- "+r"(src_argb), // %3
- "+r"(dst_width) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "jg 1b \n"
-
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- intptr_t x0 = 0, x1 = 0;
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
- :
- : "m"(kShuffleColARGB), // %0
- "m"(kShuffleFractions) // %1
- );
-
- asm volatile (
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
- "psrlw $0x9,%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(0) " \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
-
- LABELALIGN
- "29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- BUNDLEALIGN
- MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0," MEMACCESS(0) " \n"
-
- LABELALIGN
- "99: \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+rm"(dst_width), // %2
- "+r"(x0), // %3
- "+r"(x1) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_X86(int num, int div) {
- asm volatile (
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx"
- );
- return num;
-}
-
-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
-int FixedDiv1_X86(int num, int div) {
- asm volatile (
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx"
- );
- return num;
-}
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/scale_win.cc b/src/main/jni/libyuv/source/scale_win.cc
deleted file mode 100644
index 840b9738d..000000000
--- a/src/main/jni/libyuv/source/scale_win.cc
+++ /dev/null
@@ -1,1320 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
- { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
- { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
- { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
- { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
- { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
- { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
- { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
- { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
- { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
- { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-static uvec8 kShuf38a =
- { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-static uvec8 kShuf38b =
- { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
- { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
- { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
- { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
- { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
- { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
- { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
- { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 4
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
- psrld xmm5, 24
- pslld xmm5, 16
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm0, 8
- packuswb xmm0, xmm0
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_ptr
- mov esi, [esp + 8 + 8] // src_stride
- mov edx, [esp + 8 + 12] // dst_ptr
- mov ecx, [esp + 8 + 16] // dst_width
- lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, [eax + esi * 2]
- movdqa xmm3, [eax + esi * 2 + 16]
- movdqa xmm4, [eax + edi]
- movdqa xmm5, [eax + edi + 16]
- lea eax, [eax + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
- psrlw xmm0, 8
- pand xmm2, xmm7
- pavgw xmm0, xmm2
- packuswb xmm0, xmm0
-
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm3, kShuf0
- movdqa xmm4, kShuf1
- movdqa xmm5, kShuf2
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm1
- palignr xmm1, xmm0, 8
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 24
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 kRound34
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShuf01
- movdqa xmm3, kShuf11
- movdqa xmm4, kShuf21
- movdqa xmm5, kMadd01
- movdqa xmm6, kMadd11
- movdqa xmm7, kRound34
-
- align 4
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- sub ecx, 24
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx + 24]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShuf01
- movdqa xmm3, kShuf11
- movdqa xmm4, kShuf21
- movdqa xmm5, kMadd01
- movdqa xmm6, kMadd11
- movdqa xmm7, kRound34
-
- align 4
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- sub ecx, 24
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx+24]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm4, kShuf38a
- movdqa xmm5, kShuf38b
-
- align 4
- xloop:
- movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
- lea eax, [eax + 32]
- pshufb xmm0, xmm4
- pshufb xmm1, xmm5
- paddusb xmm0, xmm1
-
- sub ecx, 12
- movq qword ptr [edx], xmm0 // write 12 pixels
- movhlps xmm1, xmm0
- movd [edx + 8], xmm1
- lea edx, [edx + 12]
- jg xloop
-
- ret
- }
-}
-
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShufAc
- movdqa xmm3, kShufAc3
- movdqa xmm4, kScaleAc33
- pxor xmm5, xmm5
-
- align 4
- xloop:
- movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
- movdqa xmm6, [eax + esi]
- movhlps xmm1, xmm0
- movhlps xmm7, xmm6
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
- movdqa xmm6, [eax + esi * 2]
- lea eax, [eax + 16]
- movhlps xmm7, xmm6
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
-
- movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- pshufb xmm6, xmm2
-
- movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- pshufb xmm7, xmm3
- paddusw xmm6, xmm7
-
- pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
- packuswb xmm6, xmm6
-
- sub ecx, 6
- movd [edx], xmm6 // write 6 pixels
- psrlq xmm6, 16
- movd [edx + 2], xmm6
- lea edx, [edx + 6]
- jg xloop
-
- pop esi
- ret
- }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShufAb0
- movdqa xmm3, kShufAb1
- movdqa xmm4, kShufAb2
- movdqa xmm5, kScaleAb2
-
- align 4
- xloop:
- movdqa xmm0, [eax] // average 2 rows into xmm0
- pavgb xmm0, [eax + esi]
- lea eax, [eax + 16]
-
- movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
- pshufb xmm1, xmm2
- movdqa xmm6, xmm0
- pshufb xmm6, xmm3
- paddusw xmm1, xmm6
- pshufb xmm0, xmm4
- paddusw xmm1, xmm0
-
- pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
- packuswb xmm1, xmm1
-
- sub ecx, 6
- movd [edx], xmm1 // write 6 pixels
- psrlq xmm1, 16
- movd [edx + 2], xmm1
- lea edx, [edx + 6]
- jg xloop
-
- pop esi
- ret
- }
-}
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
-__declspec(naked) __declspec(align(16))
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov esi, [esp + 16 + 4] // src_ptr
- mov edx, [esp + 16 + 8] // src_stride
- mov edi, [esp + 16 + 12] // dst_ptr
- mov ecx, [esp + 16 + 16] // dst_width
- mov ebx, [esp + 16 + 20] // height
- pxor xmm4, xmm4
- dec ebx
-
- align 4
- xloop:
- // first row
- movdqa xmm0, [esi]
- lea eax, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- lea esi, [esi + 16]
- mov ebp, ebx
- test ebp, ebp
- je ydone
-
- // sum remaining rows
- align 4
- yloop:
- movdqa xmm2, [eax] // read 16 pixels
- lea eax, [eax + edx] // advance to next row
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- paddusw xmm0, xmm2 // sum 16 words
- paddusw xmm1, xmm3
- sub ebp, 1
- jg yloop
-
- align 4
- ydone:
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm1
- lea edi, [edi + 32]
-
- sub ecx, 16
- jg xloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// Bilinear column filtering. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-// TODO(fbarchard): Switch the following:
-// xor ebx, ebx
-// mov bx, word ptr [esi + eax] // 2 source x0 pixels
-// To
-// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
-// when drmemory bug fixed.
-// https://code.google.com/p/drmemory/issues/detail?id=1396
-
-__declspec(naked) __declspec(align(16))
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- __asm {
- push ebx
- push esi
- push edi
- mov edi, [esp + 12 + 4] // dst_ptr
- mov esi, [esp + 12 + 8] // src_ptr
- mov ecx, [esp + 12 + 12] // dst_width
- movd xmm2, [esp + 12 + 16] // x
- movd xmm3, [esp + 12 + 20] // dx
- mov eax, 0x04040000 // shuffle to line up fractions with pixel.
- movd xmm5, eax
- pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
- psrlw xmm6, 9
- pextrw eax, xmm2, 1 // get x0 integer. preroll
- sub ecx, 2
- jl xloop29
-
- movdqa xmm0, xmm2 // x1 = x0 + dx
- paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
-
- // 2 Pixel loop.
- align 4
- xloop2:
- movdqa xmm1, xmm2 // x0, x1 fractions.
- paddd xmm2, xmm3 // x += dx
- movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
- movd xmm0, ebx
- psrlw xmm1, 9 // 7 bit fractions.
- movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
- movd xmm4, ebx
- pshufb xmm1, xmm5 // 0011
- punpcklwd xmm0, xmm4
- pxor xmm1, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm0, xmm0 // 8 bits, 2 pixels.
- movd ebx, xmm0
- mov [edi], bx
- lea edi, [edi + 2]
- sub ecx, 2 // 2 pixels
- jge xloop2
-
- align 4
- xloop29:
-
- add ecx, 2 - 1
- jl xloop99
-
- // 1 pixel remainder
- movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
- movd xmm0, ebx
- psrlw xmm2, 9 // 7 bit fractions.
- pshufb xmm2, xmm5 // 0011
- pxor xmm2, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm2 // 16 bit
- psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm0, xmm0 // 8 bits
- movd ebx, xmm0
- mov [edi], bl
-
- align 4
- xloop99:
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// Reads 16 pixels, duplicates them and writes 32 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- __asm {
- mov edx, [esp + 4] // dst_ptr
- mov eax, [esp + 8] // src_ptr
- mov ecx, [esp + 12] // dst_width
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0
- punpckhbw xmm1, xmm1
- sub ecx, 32
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- jg wloop
-
- ret
- }
-}
-
-// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- // src_stride ignored
- mov edx, [esp + 12] // dst_argb
- mov ecx, [esp + 16] // dst_width
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- shufps xmm0, xmm1, 0xdd
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 8x1 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- // src_stride ignored
- mov edx, [esp + 12] // dst_argb
- mov ecx, [esp + 16] // dst_width
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 8x2 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // dst_width
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- __asm {
- push ebx
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- // src_stride ignored
- mov ebx, [esp + 8 + 12] // src_stepx
- mov edx, [esp + 8 + 16] // dst_argb
- mov ecx, [esp + 8 + 20] // dst_width
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- align 4
- wloop:
- movd xmm0, [eax]
- movd xmm1, [eax + ebx]
- punpckldq xmm0, xmm1
- movd xmm2, [eax + ebx * 2]
- movd xmm3, [eax + edi]
- lea eax, [eax + ebx * 4]
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop edi
- pop ebx
- ret
- }
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov esi, [esp + 12 + 8] // src_stride
- mov ebx, [esp + 12 + 12] // src_stepx
- mov edx, [esp + 12 + 16] // dst_argb
- mov ecx, [esp + 12 + 20] // dst_width
- lea esi, [eax + esi] // row1 pointer
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- align 4
- wloop:
- movq xmm0, qword ptr [eax] // row0 4 pairs
- movhps xmm0, qword ptr [eax + ebx]
- movq xmm1, qword ptr [eax + ebx * 2]
- movhps xmm1, qword ptr [eax + edi]
- lea eax, [eax + ebx * 4]
- movq xmm2, qword ptr [esi] // row1 4 pairs
- movhps xmm2, qword ptr [esi + ebx]
- movq xmm3, qword ptr [esi + ebx * 2]
- movhps xmm3, qword ptr [esi + edi]
- lea esi, [esi + ebx * 4]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// Column scaling unfiltered. SSE2 version.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- __asm {
- push edi
- push esi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
- movd xmm2, [esp + 8 + 16] // x
- movd xmm3, [esp + 8 + 20] // dx
-
- pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
- pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
- paddd xmm2, xmm0
- paddd xmm3, xmm3 // 0, 0, 0, dx * 2
- pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
- paddd xmm2, xmm0 // x3 x2 x1 x0
- paddd xmm3, xmm3 // 0, 0, 0, dx * 4
- pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
-
- pextrw eax, xmm2, 1 // get x0 integer.
- pextrw edx, xmm2, 3 // get x1 integer.
-
- cmp ecx, 0
- jle xloop99
- sub ecx, 4
- jl xloop49
-
- // 4 Pixel loop.
- align 4
- xloop4:
- movd xmm0, [esi + eax * 4] // 1 source x0 pixels
- movd xmm1, [esi + edx * 4] // 1 source x1 pixels
- pextrw eax, xmm2, 5 // get x2 integer.
- pextrw edx, xmm2, 7 // get x3 integer.
- paddd xmm2, xmm3 // x += dx
- punpckldq xmm0, xmm1 // x0 x1
-
- movd xmm1, [esi + eax * 4] // 1 source x2 pixels
- movd xmm4, [esi + edx * 4] // 1 source x3 pixels
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- punpckldq xmm1, xmm4 // x2 x3
- punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
- sub ecx, 4 // 4 pixels
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- jge xloop4
-
- align 4
- xloop49:
- test ecx, 2
- je xloop29
-
- // 2 Pixels.
- movd xmm0, [esi + eax * 4] // 1 source x0 pixels
- movd xmm1, [esi + edx * 4] // 1 source x1 pixels
- pextrw eax, xmm2, 5 // get x2 integer.
- punpckldq xmm0, xmm1 // x0 x1
-
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
-
- xloop29:
- test ecx, 1
- je xloop99
-
- // 1 Pixels.
- movd xmm0, [esi + eax * 4] // 1 source x2 pixels
- movd dword ptr [edi], xmm0
- align 4
- xloop99:
-
- pop esi
- pop edi
- ret
- }
-}
-
-// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-__declspec(naked) __declspec(align(16))
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
- movd xmm2, [esp + 8 + 16] // x
- movd xmm3, [esp + 8 + 20] // dx
- movdqa xmm4, kShuffleColARGB
- movdqa xmm5, kShuffleFractions
- pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
- psrlw xmm6, 9
- pextrw eax, xmm2, 1 // get x0 integer. preroll
- sub ecx, 2
- jl xloop29
-
- movdqa xmm0, xmm2 // x1 = x0 + dx
- paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
-
- // 2 Pixel loop.
- align 4
- xloop2:
- movdqa xmm1, xmm2 // x0, x1 fractions.
- paddd xmm2, xmm3 // x += dx
- movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- psrlw xmm1, 9 // 7 bit fractions.
- movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
- pshufb xmm1, xmm5 // 0000000011111111
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm1, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
- packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 2 // 2 pixels
- jge xloop2
-
- align 4
- xloop29:
-
- add ecx, 2 - 1
- jl xloop99
-
- // 1 pixel remainder
- psrlw xmm2, 9 // 7 bit fractions.
- movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- pshufb xmm2, xmm5 // 00000000
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm2, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
- movd [edi], xmm0
-
- align 4
- xloop99:
-
- pop edi
- pop esi
- ret
- }
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- __asm {
- mov edx, [esp + 4] // dst_argb
- mov eax, [esp + 8] // src_argb
- mov ecx, [esp + 12] // dst_width
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpckldq xmm0, xmm0
- punpckhdq xmm1, xmm1
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- jg wloop
-
- ret
- }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
-int FixedDiv_X86(int num, int div) {
- __asm {
- mov eax, [esp + 4] // num
- cdq // extend num to 64 bits
- shld edx, eax, 16 // 32.16
- shl eax, 16
- idiv dword ptr [esp + 8]
- ret
- }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
-int FixedDiv1_X86(int num, int div) {
- __asm {
- mov eax, [esp + 4] // num
- mov ecx, [esp + 8] // denom
- cdq // extend num to 64 bits
- shld edx, eax, 16 // 32.16
- shl eax, 16
- sub eax, 0x00010001
- sbb edx, 0
- sub ecx, 1
- idiv ecx
- ret
- }
-}
-
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/src/main/jni/libyuv/source/video_common.cc b/src/main/jni/libyuv/source/video_common.cc
deleted file mode 100644
index efbedf46e..000000000
--- a/src/main/jni/libyuv/source/video_common.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
-
-struct FourCCAliasEntry {
- uint32 alias;
- uint32 canonical;
-};
-
-static const struct FourCCAliasEntry kFourCCAliases[] = {
- {FOURCC_IYUV, FOURCC_I420},
- {FOURCC_YU16, FOURCC_I422},
- {FOURCC_YU24, FOURCC_I444},
- {FOURCC_YUYV, FOURCC_YUY2},
- {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
- {FOURCC_HDYC, FOURCC_UYVY},
- {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
- {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
- {FOURCC_DMB1, FOURCC_MJPG},
- {FOURCC_BA81, FOURCC_BGGR},
- {FOURCC_RGB3, FOURCC_RAW },
- {FOURCC_BGR3, FOURCC_24BG},
- {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
- {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
- {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
- {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
- {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
-};
-// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
-// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
-
-LIBYUV_API
-uint32 CanonicalFourCC(uint32 fourcc) {
- int i;
- for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
- if (kFourCCAliases[i].alias == fourcc) {
- return kFourCCAliases[i].canonical;
- }
- }
- // Not an alias, so return it as-is.
- return fourcc;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
diff --git a/src/main/jni/libyuv/source/x86inc.asm b/src/main/jni/libyuv/source/x86inc.asm
deleted file mode 100644
index cb5c32df3..000000000
--- a/src/main/jni/libyuv/source/x86inc.asm
+++ /dev/null
@@ -1,1136 +0,0 @@
-;*****************************************************************************
-;* x86inc.asm: x264asm abstraction layer
-;*****************************************************************************
-;* Copyright (C) 2005-2012 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;* Anton Mitrofanov <BugMaster@narod.ru>
-;* Jason Garrett-Glaser <darkshikari@gmail.com>
-;* Henrik Gramner <hengar-6@student.ltu.se>
-;*
-;* Permission to use, copy, modify, and/or distribute this software for any
-;* purpose with or without fee is hereby granted, provided that the above
-;* copyright notice and this permission notice appear in all copies.
-;*
-;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-;*****************************************************************************
-
-; This is a header file for the x264ASM assembly language, which uses
-; NASM/YASM syntax combined with a large number of macros to provide easy
-; abstraction between different calling conventions (x86_32, win64, linux64).
-; It also has various other useful features to simplify writing the kind of
-; DSP functions that are most often used in x264.
-
-; Unlike the rest of x264, this file is available under an ISC license, as it
-; has significant usefulness outside of x264 and we want it to be available
-; to the largest audience possible. Of course, if you modify it for your own
-; purposes to add a new feature, we strongly encourage contributing a patch
-; as this feature might be useful for others as well. Send patches or ideas
-; to x264-devel@videolan.org .
-
-; Local changes for libyuv:
-; remove %define program_name and references in labels
-; rename cpus to uppercase
-
-%define WIN64 0
-%define UNIX64 0
-%if ARCH_X86_64
- %ifidn __OUTPUT_FORMAT__,win32
- %define WIN64 1
- %elifidn __OUTPUT_FORMAT__,win64
- %define WIN64 1
- %else
- %define UNIX64 1
- %endif
-%endif
-
-%ifdef PREFIX
- %define mangle(x) _ %+ x
-%else
- %define mangle(x) x
-%endif
-
-; Name of the .rodata section.
-; Kludge: Something on OS X fails to align .rodata even given an align attribute,
-; so use a different read-only section.
-%macro SECTION_RODATA 0-1 16
- %ifidn __OUTPUT_FORMAT__,macho64
- SECTION .text align=%1
- %elifidn __OUTPUT_FORMAT__,macho
- SECTION .text align=%1
- fakegot:
- %elifidn __OUTPUT_FORMAT__,aout
- section .text
- %else
- SECTION .rodata align=%1
- %endif
-%endmacro
-
-; aout does not support align=
-%macro SECTION_TEXT 0-1 16
- %ifidn __OUTPUT_FORMAT__,aout
- SECTION .text
- %else
- SECTION .text align=%1
- %endif
-%endmacro
-
-%if WIN64
- %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
- %undef PIC
-%endif
-%ifdef PIC
- default rel
-%endif
-
-; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
-CPU amdnop
-
-; Macros to eliminate most code duplication between x86_32 and x86_64:
-; Currently this works only for leaf functions which load all their arguments
-; into registers at the start, and make no other use of the stack. Luckily that
-; covers most of x264's asm.
-
-; PROLOGUE:
-; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used. pushes callee-saved regs if needed.
-; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = list of names to define to registers
-; PROLOGUE can also be invoked by adding the same options to cglobal
-
-; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
-
-; TODO Some functions can use some args directly from the stack. If they're the
-; last args then you can just not declare them, but if they're in the middle
-; we need more flexible macro.
-
-; RET:
-; Pops anything that was pushed by PROLOGUE, and returns.
-
-; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
-
-; registers:
-; rN and rNq are the native-size register holding function argument N
-; rNd, rNw, rNb are dword, word, and byte size
-; rNh is the high 8 bits of the word size
-; rNm is the original location of arg N (a register or on the stack), dword
-; rNmp is native size
-
-%macro DECLARE_REG 2-3
- %define r%1q %2
- %define r%1d %2d
- %define r%1w %2w
- %define r%1b %2b
- %define r%1h %2h
- %if %0 == 2
- %define r%1m %2d
- %define r%1mp %2
- %elif ARCH_X86_64 ; memory
- %define r%1m [rsp + stack_offset + %3]
- %define r%1mp qword r %+ %1m
- %else
- %define r%1m [esp + stack_offset + %3]
- %define r%1mp dword r %+ %1m
- %endif
- %define r%1 %2
-%endmacro
-
-%macro DECLARE_REG_SIZE 3
- %define r%1q r%1
- %define e%1q r%1
- %define r%1d e%1
- %define e%1d e%1
- %define r%1w %1
- %define e%1w %1
- %define r%1h %3
- %define e%1h %3
- %define r%1b %2
- %define e%1b %2
-%if ARCH_X86_64 == 0
- %define r%1 e%1
-%endif
-%endmacro
-
-DECLARE_REG_SIZE ax, al, ah
-DECLARE_REG_SIZE bx, bl, bh
-DECLARE_REG_SIZE cx, cl, ch
-DECLARE_REG_SIZE dx, dl, dh
-DECLARE_REG_SIZE si, sil, null
-DECLARE_REG_SIZE di, dil, null
-DECLARE_REG_SIZE bp, bpl, null
-
-; t# defines for when per-arch register allocation is more complex than just function arguments
-
-%macro DECLARE_REG_TMP 1-*
- %assign %%i 0
- %rep %0
- CAT_XDEFINE t, %%i, r%1
- %assign %%i %%i+1
- %rotate 1
- %endrep
-%endmacro
-
-%macro DECLARE_REG_TMP_SIZE 0-*
- %rep %0
- %define t%1q t%1 %+ q
- %define t%1d t%1 %+ d
- %define t%1w t%1 %+ w
- %define t%1h t%1 %+ h
- %define t%1b t%1 %+ b
- %rotate 1
- %endrep
-%endmacro
-
-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
-
-%if ARCH_X86_64
- %define gprsize 8
-%else
- %define gprsize 4
-%endif
-
-%macro PUSH 1
- push %1
- %assign stack_offset stack_offset+gprsize
-%endmacro
-
-%macro POP 1
- pop %1
- %assign stack_offset stack_offset-gprsize
-%endmacro
-
-%macro PUSH_IF_USED 1-*
- %rep %0
- %if %1 < regs_used
- PUSH r%1
- %endif
- %rotate 1
- %endrep
-%endmacro
-
-%macro POP_IF_USED 1-*
- %rep %0
- %if %1 < regs_used
- pop r%1
- %endif
- %rotate 1
- %endrep
-%endmacro
-
-%macro LOAD_IF_USED 1-*
- %rep %0
- %if %1 < num_args
- mov r%1, r %+ %1 %+ mp
- %endif
- %rotate 1
- %endrep
-%endmacro
-
-%macro SUB 2
- sub %1, %2
- %ifidn %1, rsp
- %assign stack_offset stack_offset+(%2)
- %endif
-%endmacro
-
-%macro ADD 2
- add %1, %2
- %ifidn %1, rsp
- %assign stack_offset stack_offset-(%2)
- %endif
-%endmacro
-
-%macro movifnidn 2
- %ifnidn %1, %2
- mov %1, %2
- %endif
-%endmacro
-
-%macro movsxdifnidn 2
- %ifnidn %1, %2
- movsxd %1, %2
- %endif
-%endmacro
-
-%macro ASSERT 1
- %if (%1) == 0
- %error assert failed
- %endif
-%endmacro
-
-%macro DEFINE_ARGS 0-*
- %ifdef n_arg_names
- %assign %%i 0
- %rep n_arg_names
- CAT_UNDEF arg_name %+ %%i, q
- CAT_UNDEF arg_name %+ %%i, d
- CAT_UNDEF arg_name %+ %%i, w
- CAT_UNDEF arg_name %+ %%i, h
- CAT_UNDEF arg_name %+ %%i, b
- CAT_UNDEF arg_name %+ %%i, m
- CAT_UNDEF arg_name %+ %%i, mp
- CAT_UNDEF arg_name, %%i
- %assign %%i %%i+1
- %endrep
- %endif
-
- %xdefine %%stack_offset stack_offset
- %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
- %assign %%i 0
- %rep %0
- %xdefine %1q r %+ %%i %+ q
- %xdefine %1d r %+ %%i %+ d
- %xdefine %1w r %+ %%i %+ w
- %xdefine %1h r %+ %%i %+ h
- %xdefine %1b r %+ %%i %+ b
- %xdefine %1m r %+ %%i %+ m
- %xdefine %1mp r %+ %%i %+ mp
- CAT_XDEFINE arg_name, %%i, %1
- %assign %%i %%i+1
- %rotate 1
- %endrep
- %xdefine stack_offset %%stack_offset
- %assign n_arg_names %0
-%endmacro
-
-%if WIN64 ; Windows x64 ;=================================================
-
-DECLARE_REG 0, rcx
-DECLARE_REG 1, rdx
-DECLARE_REG 2, R8
-DECLARE_REG 3, R9
-DECLARE_REG 4, R10, 40
-DECLARE_REG 5, R11, 48
-DECLARE_REG 6, rax, 56
-DECLARE_REG 7, rdi, 64
-DECLARE_REG 8, rsi, 72
-DECLARE_REG 9, rbx, 80
-DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
-
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
- %assign num_args %1
- %assign regs_used %2
- ASSERT regs_used >= num_args
- ASSERT regs_used <= 15
- PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
- %if mmsize == 8
- %assign xmm_regs_used 0
- %else
- WIN64_SPILL_XMM %3
- %endif
- LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
- DEFINE_ARGS %4
-%endmacro
-
-%macro WIN64_SPILL_XMM 1
- %assign xmm_regs_used %1
- ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 6
- SUB rsp, (xmm_regs_used-6)*16+16
- %assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
- %assign %%i %%i-1
- movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
- %endrep
- %endif
-%endmacro
-
-%macro WIN64_RESTORE_XMM_INTERNAL 1
- %if xmm_regs_used > 6
- %assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
- %assign %%i %%i-1
- movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
- %endrep
- add %1, (xmm_regs_used-6)*16+16
- %endif
-%endmacro
-
-%macro WIN64_RESTORE_XMM 1
- WIN64_RESTORE_XMM_INTERNAL %1
- %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
- %assign xmm_regs_used 0
-%endmacro
-
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
-
-%macro RET 0
- WIN64_RESTORE_XMM_INTERNAL rsp
- POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-%if mmsize == 32
- vzeroupper
-%endif
- ret
-%endmacro
-
-%elif ARCH_X86_64 ; *nix x64 ;=============================================
-
-DECLARE_REG 0, rdi
-DECLARE_REG 1, rsi
-DECLARE_REG 2, rdx
-DECLARE_REG 3, rcx
-DECLARE_REG 4, R8
-DECLARE_REG 5, R9
-DECLARE_REG 6, rax, 8
-DECLARE_REG 7, R10, 16
-DECLARE_REG 8, R11, 24
-DECLARE_REG 9, rbx, 32
-DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
-
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- %assign num_args %1
- %assign regs_used %2
- ASSERT regs_used >= num_args
- ASSERT regs_used <= 15
- PUSH_IF_USED 9, 10, 11, 12, 13, 14
- LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
- DEFINE_ARGS %4
-%endmacro
-
-%define has_epilogue regs_used > 9 || mmsize == 32
-
-%macro RET 0
- POP_IF_USED 14, 13, 12, 11, 10, 9
-%if mmsize == 32
- vzeroupper
-%endif
- ret
-%endmacro
-
-%else ; X86_32 ;==============================================================
-
-DECLARE_REG 0, eax, 4
-DECLARE_REG 1, ecx, 8
-DECLARE_REG 2, edx, 12
-DECLARE_REG 3, ebx, 16
-DECLARE_REG 4, esi, 20
-DECLARE_REG 5, edi, 24
-DECLARE_REG 6, ebp, 28
-%define rsp esp
-
-%macro DECLARE_ARG 1-*
- %rep %0
- %define r%1m [esp + stack_offset + 4*%1 + 4]
- %define r%1mp dword r%1m
- %rotate 1
- %endrep
-%endmacro
-
-DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
-
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- %assign num_args %1
- %assign regs_used %2
- %if regs_used > 7
- %assign regs_used 7
- %endif
- ASSERT regs_used >= num_args
- PUSH_IF_USED 3, 4, 5, 6
- LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
- DEFINE_ARGS %4
-%endmacro
-
-%define has_epilogue regs_used > 3 || mmsize == 32
-
-%macro RET 0
- POP_IF_USED 6, 5, 4, 3
-%if mmsize == 32
- vzeroupper
-%endif
- ret
-%endmacro
-
-%endif ;======================================================================
-
-%if WIN64 == 0
-%macro WIN64_SPILL_XMM 1
-%endmacro
-%macro WIN64_RESTORE_XMM 1
-%endmacro
-%endif
-
-%macro REP_RET 0
- %if has_epilogue
- RET
- %else
- rep ret
- %endif
-%endmacro
-
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
- %if has_epilogue
- call %1
- RET
- %elif %2
- jmp %1
- %endif
-%endmacro
-
-;=============================================================================
-; arch-independent part
-;=============================================================================
-
-%assign function_align 16
-
-; Begin a function.
-; Applies any symbol mangling needed for C linkage, and sets up a define such that
-; subsequent uses of the function name automatically refer to the mangled version.
-; Appends cpuflags to the function name if cpuflags has been specified.
-%macro cglobal 1-2+ ; name, [PROLOGUE args]
-%if %0 == 1
- cglobal_internal %1 %+ SUFFIX
-%else
- cglobal_internal %1 %+ SUFFIX, %2
-%endif
-%endmacro
-%macro cglobal_internal 1-2+
- %ifndef cglobaled_%1
- %xdefine %1 mangle(%1)
- %xdefine %1.skip_prologue %1 %+ .skip_prologue
- CAT_XDEFINE cglobaled_, %1, 1
- %endif
- %xdefine current_function %1
- %ifidn __OUTPUT_FORMAT__,elf
- global %1:function hidden
- %else
- global %1
- %endif
- align function_align
- %1:
- RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
- %assign stack_offset 0
- %if %0 > 1
- PROLOGUE %2
- %endif
-%endmacro
-
-%macro cextern 1
- %xdefine %1 mangle(%1)
- CAT_XDEFINE cglobaled_, %1, 1
- extern %1
-%endmacro
-
-; like cextern, but without the prefix
-%macro cextern_naked 1
- %xdefine %1 mangle(%1)
- CAT_XDEFINE cglobaled_, %1, 1
- extern %1
-%endmacro
-
-%macro const 2+
- %xdefine %1 mangle(%1)
- global %1
- %1: %2
-%endmacro
-
-; This is needed for ELF, otherwise the GNU linker assumes the stack is
-; executable by default.
-%ifidn __OUTPUT_FORMAT__,elf
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
-%endif
-%ifidn __OUTPUT_FORMAT__,elf32
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
-%ifidn __OUTPUT_FORMAT__,elf64
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
-
-; cpuflags
-
-%assign cpuflags_MMX (1<<0)
-%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX
-%assign cpuflags_3dnow (1<<2) | cpuflags_MMX
-%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
-%assign cpuflags_SSE (1<<4) | cpuflags_MMX2
-%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE
-%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
-%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2
-%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3
-%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3
-%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4
-%assign cpuflags_AVX (1<<11)| cpuflags_SSE42
-%assign cpuflags_xop (1<<12)| cpuflags_AVX
-%assign cpuflags_fma4 (1<<13)| cpuflags_AVX
-%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX
-%assign cpuflags_fma3 (1<<15)| cpuflags_AVX
-
-%assign cpuflags_cache32 (1<<16)
-%assign cpuflags_cache64 (1<<17)
-%assign cpuflags_slowctz (1<<18)
-%assign cpuflags_lzcnt (1<<19)
-%assign cpuflags_misalign (1<<20)
-%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<22)
-%assign cpuflags_bmi1 (1<<23)
-%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
-%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
-
-%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
-%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
-
-; Takes up to 2 cpuflags from the above list.
-; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
-; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
-%macro INIT_CPUFLAGS 0-2
- %if %0 >= 1
- %xdefine cpuname %1
- %assign cpuflags cpuflags_%1
- %if %0 >= 2
- %xdefine cpuname %1_%2
- %assign cpuflags cpuflags | cpuflags_%2
- %endif
- %xdefine SUFFIX _ %+ cpuname
- %if cpuflag(AVX)
- %assign AVX_enabled 1
- %endif
- %if mmsize == 16 && notcpuflag(SSE2)
- %define mova movaps
- %define movu movups
- %define movnta movntps
- %endif
- %if cpuflag(aligned)
- %define movu mova
- %elifidn %1, SSE3
- %define movu lddqu
- %endif
- %else
- %xdefine SUFFIX
- %undef cpuname
- %undef cpuflags
- %endif
-%endmacro
-
-; merge MMX and SSE*
-
-%macro CAT_XDEFINE 3
- %xdefine %1%2 %3
-%endmacro
-
-%macro CAT_UNDEF 2
- %undef %1%2
-%endmacro
-
-%macro INIT_MMX 0-1+
- %assign AVX_enabled 0
- %define RESET_MM_PERMUTATION INIT_MMX %1
- %define mmsize 8
- %define num_mmregs 8
- %define mova movq
- %define movu movq
- %define movh movd
- %define movnta movntq
- %assign %%i 0
- %rep 8
- CAT_XDEFINE m, %%i, mm %+ %%i
- CAT_XDEFINE nmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
- %rep 8
- CAT_UNDEF m, %%i
- CAT_UNDEF nmm, %%i
- %assign %%i %%i+1
- %endrep
- INIT_CPUFLAGS %1
-%endmacro
-
-%macro INIT_XMM 0-1+
- %assign AVX_enabled 0
- %define RESET_MM_PERMUTATION INIT_XMM %1
- %define mmsize 16
- %define num_mmregs 8
- %if ARCH_X86_64
- %define num_mmregs 16
- %endif
- %define mova movdqa
- %define movu movdqu
- %define movh movq
- %define movnta movntdq
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, xmm %+ %%i
- CAT_XDEFINE nxmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
- INIT_CPUFLAGS %1
-%endmacro
-
-%macro INIT_YMM 0-1+
- %assign AVX_enabled 1
- %define RESET_MM_PERMUTATION INIT_YMM %1
- %define mmsize 32
- %define num_mmregs 8
- %if ARCH_X86_64
- %define num_mmregs 16
- %endif
- %define mova vmovaps
- %define movu vmovups
- %undef movh
- %define movnta vmovntps
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, ymm %+ %%i
- CAT_XDEFINE nymm, %%i, %%i
- %assign %%i %%i+1
- %endrep
- INIT_CPUFLAGS %1
-%endmacro
-
-INIT_XMM
-
-; I often want to use macros that permute their arguments. e.g. there's no
-; efficient way to implement butterfly or transpose or dct without swapping some
-; arguments.
-;
-; I would like to not have to manually keep track of the permutations:
-; If I insert a permutation in the middle of a function, it should automatically
-; change everything that follows. For more complex macros I may also have multiple
-; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
-;
-; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
-; permutes its arguments. It's equivalent to exchanging the contents of the
-; registers, except that this way you exchange the register names instead, so it
-; doesn't cost any cycles.
-
-%macro PERMUTE 2-* ; takes a list of pairs to swap
-%rep %0/2
- %xdefine tmp%2 m%2
- %xdefine ntmp%2 nm%2
- %rotate 2
-%endrep
-%rep %0/2
- %xdefine m%1 tmp%2
- %xdefine nm%1 ntmp%2
- %undef tmp%2
- %undef ntmp%2
- %rotate 2
-%endrep
-%endmacro
-
-%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
-%rep %0-1
-%ifdef m%1
- %xdefine tmp m%1
- %xdefine m%1 m%2
- %xdefine m%2 tmp
- CAT_XDEFINE n, m%1, %1
- CAT_XDEFINE n, m%2, %2
-%else
- ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
- ; Be careful using this mode in nested macros though, as in some cases there may be
- ; other copies of m# that have already been dereferenced and don't get updated correctly.
- %xdefine %%n1 n %+ %1
- %xdefine %%n2 n %+ %2
- %xdefine tmp m %+ %%n1
- CAT_XDEFINE m, %%n1, m %+ %%n2
- CAT_XDEFINE m, %%n2, tmp
- CAT_XDEFINE n, m %+ %%n1, %%n1
- CAT_XDEFINE n, m %+ %%n2, %%n2
-%endif
- %undef tmp
- %rotate 1
-%endrep
-%endmacro
-
-; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
-; calls to that function will automatically load the permutation, so values can
-; be returned in mmregs.
-%macro SAVE_MM_PERMUTATION 0-1
- %if %0
- %xdefine %%f %1_m
- %else
- %xdefine %%f current_function %+ _m
- %endif
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE %%f, %%i, m %+ %%i
- %assign %%i %%i+1
- %endrep
-%endmacro
-
-%macro LOAD_MM_PERMUTATION 1 ; name to load from
- %ifdef %1_m0
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, %1_m %+ %%i
- CAT_XDEFINE n, m %+ %%i, %%i
- %assign %%i %%i+1
- %endrep
- %endif
-%endmacro
-
-; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
-%macro call 1
- call_internal %1, %1 %+ SUFFIX
-%endmacro
-%macro call_internal 2
- %xdefine %%i %1
- %ifndef cglobaled_%1
- %ifdef cglobaled_%2
- %xdefine %%i %2
- %endif
- %endif
- call %%i
- LOAD_MM_PERMUTATION %%i
-%endmacro
-
-; Substitutions that reduce instruction size but are functionally equivalent
-%macro add 2
- %ifnum %2
- %if %2==128
- sub %1, -128
- %else
- add %1, %2
- %endif
- %else
- add %1, %2
- %endif
-%endmacro
-
-%macro sub 2
- %ifnum %2
- %if %2==128
- add %1, -128
- %else
- sub %1, %2
- %endif
- %else
- sub %1, %2
- %endif
-%endmacro
-
-;=============================================================================
-; AVX abstraction layer
-;=============================================================================
-
-%assign i 0
-%rep 16
- %if i < 8
- CAT_XDEFINE sizeofmm, i, 8
- %endif
- CAT_XDEFINE sizeofxmm, i, 16
- CAT_XDEFINE sizeofymm, i, 32
-%assign i i+1
-%endrep
-%undef i
-
-%macro CHECK_AVX_INSTR_EMU 3-*
- %xdefine %%opcode %1
- %xdefine %%dst %2
- %rep %0-2
- %ifidn %%dst, %3
- %error non-AVX emulation of ``%%opcode'' is not supported
- %endif
- %rotate 1
- %endrep
-%endmacro
-
-;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == number of operands given
-;%5+: operands
-%macro RUN_AVX_INSTR 6-7+
- %ifid %6
- %define %%sizeofreg sizeof%6
- %elifid %5
- %define %%sizeofreg sizeof%5
- %else
- %define %%sizeofreg mmsize
- %endif
- %if %%sizeofreg==32
- %if %4>=3
- v%1 %5, %6, %7
- %else
- v%1 %5, %6
- %endif
- %else
- %if %%sizeofreg==8
- %define %%regmov movq
- %elif %2
- %define %%regmov movaps
- %else
- %define %%regmov movdqa
- %endif
-
- %if %4>=3+%3
- %ifnidn %5, %6
- %if AVX_enabled && %%sizeofreg==16
- v%1 %5, %6, %7
- %else
- CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
- %%regmov %5, %6
- %1 %5, %7
- %endif
- %else
- %1 %5, %7
- %endif
- %elif %4>=3
- %1 %5, %6, %7
- %else
- %1 %5, %6
- %endif
- %endif
-%endmacro
-
-; 3arg AVX ops with a memory arg can only have it in src2,
-; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
-; So, if the op is symmetric and the wrong one is memory, swap them.
-%macro RUN_AVX_INSTR1 8
- %assign %%swap 0
- %if AVX_enabled
- %ifnid %6
- %assign %%swap 1
- %endif
- %elifnidn %5, %6
- %ifnid %7
- %assign %%swap 1
- %endif
- %endif
- %if %%swap && %3 == 0 && %8 == 1
- RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
- %else
- RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
- %endif
-%endmacro
-
-;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 4
- %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
- %ifidn %3, fnord
- RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
- %elifidn %4, fnord
- RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
- %elifidn %5, fnord
- RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
- %else
- RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
- %endif
- %endmacro
-%endmacro
-
-AVX_INSTR addpd, 1, 0, 1
-AVX_INSTR addps, 1, 0, 1
-AVX_INSTR addsd, 1, 0, 1
-AVX_INSTR addss, 1, 0, 1
-AVX_INSTR addsubpd, 1, 0, 0
-AVX_INSTR addsubps, 1, 0, 0
-AVX_INSTR andpd, 1, 0, 1
-AVX_INSTR andps, 1, 0, 1
-AVX_INSTR andnpd, 1, 0, 0
-AVX_INSTR andnps, 1, 0, 0
-AVX_INSTR blendpd, 1, 0, 0
-AVX_INSTR blendps, 1, 0, 0
-AVX_INSTR blendvpd, 1, 0, 0
-AVX_INSTR blendvps, 1, 0, 0
-AVX_INSTR cmppd, 1, 0, 0
-AVX_INSTR cmpps, 1, 0, 0
-AVX_INSTR cmpsd, 1, 0, 0
-AVX_INSTR cmpss, 1, 0, 0
-AVX_INSTR cvtdq2ps, 1, 0, 0
-AVX_INSTR cvtps2dq, 1, 0, 0
-AVX_INSTR divpd, 1, 0, 0
-AVX_INSTR divps, 1, 0, 0
-AVX_INSTR divsd, 1, 0, 0
-AVX_INSTR divss, 1, 0, 0
-AVX_INSTR dppd, 1, 1, 0
-AVX_INSTR dpps, 1, 1, 0
-AVX_INSTR haddpd, 1, 0, 0
-AVX_INSTR haddps, 1, 0, 0
-AVX_INSTR hsubpd, 1, 0, 0
-AVX_INSTR hsubps, 1, 0, 0
-AVX_INSTR maxpd, 1, 0, 1
-AVX_INSTR maxps, 1, 0, 1
-AVX_INSTR maxsd, 1, 0, 1
-AVX_INSTR maxss, 1, 0, 1
-AVX_INSTR minpd, 1, 0, 1
-AVX_INSTR minps, 1, 0, 1
-AVX_INSTR minsd, 1, 0, 1
-AVX_INSTR minss, 1, 0, 1
-AVX_INSTR movhlps, 1, 0, 0
-AVX_INSTR movlhps, 1, 0, 0
-AVX_INSTR movsd, 1, 0, 0
-AVX_INSTR movss, 1, 0, 0
-AVX_INSTR mpsadbw, 0, 1, 0
-AVX_INSTR mulpd, 1, 0, 1
-AVX_INSTR mulps, 1, 0, 1
-AVX_INSTR mulsd, 1, 0, 1
-AVX_INSTR mulss, 1, 0, 1
-AVX_INSTR orpd, 1, 0, 1
-AVX_INSTR orps, 1, 0, 1
-AVX_INSTR pabsb, 0, 0, 0
-AVX_INSTR pabsw, 0, 0, 0
-AVX_INSTR pabsd, 0, 0, 0
-AVX_INSTR packsswb, 0, 0, 0
-AVX_INSTR packssdw, 0, 0, 0
-AVX_INSTR packuswb, 0, 0, 0
-AVX_INSTR packusdw, 0, 0, 0
-AVX_INSTR paddb, 0, 0, 1
-AVX_INSTR paddw, 0, 0, 1
-AVX_INSTR paddd, 0, 0, 1
-AVX_INSTR paddq, 0, 0, 1
-AVX_INSTR paddsb, 0, 0, 1
-AVX_INSTR paddsw, 0, 0, 1
-AVX_INSTR paddusb, 0, 0, 1
-AVX_INSTR paddusw, 0, 0, 1
-AVX_INSTR palignr, 0, 1, 0
-AVX_INSTR pand, 0, 0, 1
-AVX_INSTR pandn, 0, 0, 0
-AVX_INSTR pavgb, 0, 0, 1
-AVX_INSTR pavgw, 0, 0, 1
-AVX_INSTR pblendvb, 0, 0, 0
-AVX_INSTR pblendw, 0, 1, 0
-AVX_INSTR pcmpestri, 0, 0, 0
-AVX_INSTR pcmpestrm, 0, 0, 0
-AVX_INSTR pcmpistri, 0, 0, 0
-AVX_INSTR pcmpistrm, 0, 0, 0
-AVX_INSTR pcmpeqb, 0, 0, 1
-AVX_INSTR pcmpeqw, 0, 0, 1
-AVX_INSTR pcmpeqd, 0, 0, 1
-AVX_INSTR pcmpeqq, 0, 0, 1
-AVX_INSTR pcmpgtb, 0, 0, 0
-AVX_INSTR pcmpgtw, 0, 0, 0
-AVX_INSTR pcmpgtd, 0, 0, 0
-AVX_INSTR pcmpgtq, 0, 0, 0
-AVX_INSTR phaddw, 0, 0, 0
-AVX_INSTR phaddd, 0, 0, 0
-AVX_INSTR phaddsw, 0, 0, 0
-AVX_INSTR phsubw, 0, 0, 0
-AVX_INSTR phsubd, 0, 0, 0
-AVX_INSTR phsubsw, 0, 0, 0
-AVX_INSTR pmaddwd, 0, 0, 1
-AVX_INSTR pmaddubsw, 0, 0, 0
-AVX_INSTR pmaxsb, 0, 0, 1
-AVX_INSTR pmaxsw, 0, 0, 1
-AVX_INSTR pmaxsd, 0, 0, 1
-AVX_INSTR pmaxub, 0, 0, 1
-AVX_INSTR pmaxuw, 0, 0, 1
-AVX_INSTR pmaxud, 0, 0, 1
-AVX_INSTR pminsb, 0, 0, 1
-AVX_INSTR pminsw, 0, 0, 1
-AVX_INSTR pminsd, 0, 0, 1
-AVX_INSTR pminub, 0, 0, 1
-AVX_INSTR pminuw, 0, 0, 1
-AVX_INSTR pminud, 0, 0, 1
-AVX_INSTR pmovmskb, 0, 0, 0
-AVX_INSTR pmulhuw, 0, 0, 1
-AVX_INSTR pmulhrsw, 0, 0, 1
-AVX_INSTR pmulhw, 0, 0, 1
-AVX_INSTR pmullw, 0, 0, 1
-AVX_INSTR pmulld, 0, 0, 1
-AVX_INSTR pmuludq, 0, 0, 1
-AVX_INSTR pmuldq, 0, 0, 1
-AVX_INSTR por, 0, 0, 1
-AVX_INSTR psadbw, 0, 0, 1
-AVX_INSTR pshufb, 0, 0, 0
-AVX_INSTR pshufd, 0, 1, 0
-AVX_INSTR pshufhw, 0, 1, 0
-AVX_INSTR pshuflw, 0, 1, 0
-AVX_INSTR psignb, 0, 0, 0
-AVX_INSTR psignw, 0, 0, 0
-AVX_INSTR psignd, 0, 0, 0
-AVX_INSTR psllw, 0, 0, 0
-AVX_INSTR pslld, 0, 0, 0
-AVX_INSTR psllq, 0, 0, 0
-AVX_INSTR pslldq, 0, 0, 0
-AVX_INSTR psraw, 0, 0, 0
-AVX_INSTR psrad, 0, 0, 0
-AVX_INSTR psrlw, 0, 0, 0
-AVX_INSTR psrld, 0, 0, 0
-AVX_INSTR psrlq, 0, 0, 0
-AVX_INSTR psrldq, 0, 0, 0
-AVX_INSTR psubb, 0, 0, 0
-AVX_INSTR psubw, 0, 0, 0
-AVX_INSTR psubd, 0, 0, 0
-AVX_INSTR psubq, 0, 0, 0
-AVX_INSTR psubsb, 0, 0, 0
-AVX_INSTR psubsw, 0, 0, 0
-AVX_INSTR psubusb, 0, 0, 0
-AVX_INSTR psubusw, 0, 0, 0
-AVX_INSTR ptest, 0, 0, 0
-AVX_INSTR punpckhbw, 0, 0, 0
-AVX_INSTR punpckhwd, 0, 0, 0
-AVX_INSTR punpckhdq, 0, 0, 0
-AVX_INSTR punpckhqdq, 0, 0, 0
-AVX_INSTR punpcklbw, 0, 0, 0
-AVX_INSTR punpcklwd, 0, 0, 0
-AVX_INSTR punpckldq, 0, 0, 0
-AVX_INSTR punpcklqdq, 0, 0, 0
-AVX_INSTR pxor, 0, 0, 1
-AVX_INSTR shufps, 1, 1, 0
-AVX_INSTR subpd, 1, 0, 0
-AVX_INSTR subps, 1, 0, 0
-AVX_INSTR subsd, 1, 0, 0
-AVX_INSTR subss, 1, 0, 0
-AVX_INSTR unpckhpd, 1, 0, 0
-AVX_INSTR unpckhps, 1, 0, 0
-AVX_INSTR unpcklpd, 1, 0, 0
-AVX_INSTR unpcklps, 1, 0, 0
-AVX_INSTR xorpd, 1, 0, 1
-AVX_INSTR xorps, 1, 0, 1
-
-; 3DNow instructions, for sharing code between AVX, SSE and 3DN
-AVX_INSTR pfadd, 1, 0, 1
-AVX_INSTR pfsub, 1, 0, 0
-AVX_INSTR pfmul, 1, 0, 1
-
-; base-4 constants for shuffles
-%assign i 0
-%rep 256
- %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
- %if j < 10
- CAT_XDEFINE q000, j, i
- %elif j < 100
- CAT_XDEFINE q00, j, i
- %elif j < 1000
- CAT_XDEFINE q0, j, i
- %else
- CAT_XDEFINE q, j, i
- %endif
-%assign i i+1
-%endrep
-%undef i
-%undef j
-
-%macro FMA_INSTR 3
- %macro %1 4-7 %1, %2, %3
- %if cpuflag(xop)
- v%5 %1, %2, %3, %4
- %else
- %6 %1, %2, %3
- %7 %1, %4
- %endif
- %endmacro
-%endmacro
-
-FMA_INSTR pmacsdd, pmulld, paddd
-FMA_INSTR pmacsww, pmullw, paddw
-FMA_INSTR pmadcswd, pmaddwd, paddd
-
-; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
-; This lets us use tzcnt without bumping the yasm version requirement yet.
-%define tzcnt rep bsf