Discussion:
[FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file
Lauri Kasanen
2018-11-26 12:24:15 UTC
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). Can anyone test BE?

Signed-off-by: Lauri Kasanen <***@gmx.com>
---
 libswscale/ppc/Makefile               |   1 +
 libswscale/ppc/swscale_altivec.c      | 291 ++--------------------------------
 libswscale/ppc/swscale_ppc_template.c | 217 +++++++++++++++++++++++++
 libswscale/ppc/swscale_vsx.c          | 164 +++++++++++++++++++
 libswscale/swscale_internal.h         |   1 +
 5 files changed, 393 insertions(+), 281 deletions(-)
create mode 100644 libswscale/ppc/swscale_ppc_template.c
create mode 100644 libswscale/ppc/swscale_vsx.c

diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile
index d1b596e..0a31a30 100644
--- a/libswscale/ppc/Makefile
+++ b/libswscale/ppc/Makefile
@@ -1,3 +1,4 @@
OBJS += ppc/swscale_altivec.o \
ppc/yuv2rgb_altivec.o \
ppc/yuv2yuv_altivec.o \
+ ppc/swscale_vsx.o
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 8c6056d..1d2b2fa 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -31,21 +31,14 @@
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"

-#if HAVE_ALTIVEC
+#if HAVE_ALTIVEC && HAVE_BIGENDIAN
#define vzero vec_splat_s32(0)

-#if HAVE_BIGENDIAN
#define GET_LS(a,b,c,s) {\
vector signed short l2 = vec_ld(((b) << 1) + 16, s);\
ls = vec_perm(a, l2, c);\
a = l2;\
}
-#else
-#define GET_LS(a,b,c,s) {\
- ls = a;\
- a = vec_vsx_ld(((b) << 1) + 16, s);\
- }
-#endif

#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
vector signed short ls;\
@@ -59,7 +52,6 @@
d2 = vec_add(d2, vf2);\
} while (0)

-#if HAVE_BIGENDIAN
#define LOAD_FILTER(vf,f) {\
vector unsigned char perm0 = vec_lvsl(joffset, f);\
vf = vec_ld(joffset, f);\
@@ -69,89 +61,7 @@
p = vec_lvsl(xoffset, s);\
ll1 = vec_ld(xoffset, s);\
}
-#else
-#define LOAD_FILTER(vf,f) {\
- vf = vec_vsx_ld(joffset, f);\
-}
-#define LOAD_L1(ll1,s,p){\
- ll1 = vec_vsx_ld(xoffset, s);\
-}
-#endif
-
-static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t *dest,
- const uint8_t *dither, int offset, int x)
-{
- register int i, j;
- LOCAL_ALIGNED(16, int, val, [16]);
- vector signed int vo1, vo2, vo3, vo4;
- vector unsigned short vs1, vs2;
- vector unsigned char vf;
- vector unsigned int altivec_vectorShiftInt19 =
- vec_add(vec_splat_u32(10), vec_splat_u32(9));
-
- for (i = 0; i < 16; i++)
- val[i] = dither[(x + i + offset) & 7] << 12;
-
- vo1 = vec_ld(0, val);
- vo2 = vec_ld(16, val);
- vo3 = vec_ld(32, val);
- vo4 = vec_ld(48, val);
-
- for (j = 0; j < filterSize; j++) {
- unsigned int joffset=j<<1;
- unsigned int xoffset=x<<1;
- vector unsigned char perm;
- vector signed short l1,vLumFilter;
- LOAD_FILTER(vLumFilter,filter);
- vLumFilter = vec_splat(vLumFilter, 0);
- LOAD_L1(l1,src[j],perm);
- yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
- yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
- }
-
- vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
- vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
- vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
- vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
- vs1 = vec_packsu(vo1, vo2);
- vs2 = vec_packsu(vo3, vo4);
- vf = vec_packsu(vs1, vs2);
- VEC_ST(vf, 0, dest);
-}
-
-
-static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset, int x)
-{
- int i, j;
-
- for (i = x; i < dstW; i++) {
- int t = dither[(i + offset) & 7] << 12;
- for (j = 0; j < filterSize; j++)
- t += src[j][i] * filter[j];
- dest[i] = av_clip_uint8(t >> 19);
- }
-}
-
-static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset)
-{
- int dst_u = -(uintptr_t)dest & 15;
- int i;
-
- yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
-
- for (i = dst_u; i < dstW - 15; i += 16)
- yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
- offset, i);
-
- yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
-}

-#if HAVE_BIGENDIAN
// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

// The neat trick: We only care for half the elements,
@@ -187,191 +97,12 @@ static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
vf = vec_perm(vf0, vf1, per);\
}
-#else /* else of #if HAVE_BIGENDIAN */
-#define GET_VF4(a, vf, f) {\
- vf = (vector signed short)vec_vsx_ld(a << 3, f);\
- vf = vec_mergeh(vf, (vector signed short)vzero);\
-}
-#define FIRST_LOAD(sv, pos, s, per) {}
-#define UPDATE_PTR(s0, d0, s1, d1) {}
-#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
- vf = vec_vsx_ld(pos + a, s);\
-}
-#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
-#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
- vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
-}
-#endif /* end of #if HAVE_BIGENDIAN */

-static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
- const uint8_t *src, const int16_t *filter,
- const int32_t *filterPos, int filterSize)
-{
- register int i;
- LOCAL_ALIGNED(16, int, tempo, [4]);
+#define FUNC(name) name ## _altivec
+#include "swscale_ppc_template.c"
+#undef FUNC

- if (filterSize % 4) {
- for (i = 0; i < dstW; i++) {
- register int j;
- register int srcPos = filterPos[i];
- register int val = 0;
- for (j = 0; j < filterSize; j++)
- val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
- dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
- }
- } else
- switch (filterSize) {
- case 4:
- for (i = 0; i < dstW; i++) {
- register int srcPos = filterPos[i];
-
- vector unsigned char src_vF = unaligned_load(srcPos, src);
- vector signed short src_v, filter_v;
- vector signed int val_vEven, val_s;
- src_v = // vec_unpackh sign-extends...
- (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
- // now put our elements in the even slots
- src_v = vec_mergeh(src_v, (vector signed short)vzero);
- GET_VF4(i, filter_v, filter);
- val_vEven = vec_mule(src_v, filter_v);
- val_s = vec_sums(val_vEven, vzero);
- vec_st(val_s, 0, tempo);
- dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
- }
- break;
- case 8:
- for (i = 0; i < dstW; i++) {
- register int srcPos = filterPos[i];
- vector unsigned char src_vF, src_v0, src_v1;
- vector unsigned char permS;
- vector signed short src_v, filter_v;
- vector signed int val_v, val_s;
- FIRST_LOAD(src_v0, srcPos, src, permS);
- LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
- src_v = // vec_unpackh sign-extends...
- (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
- filter_v = vec_ld(i << 4, filter);
- val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
- val_s = vec_sums(val_v, vzero);
- vec_st(val_s, 0, tempo);
- dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
- }
- break;
-
- case 16:
- for (i = 0; i < dstW; i++) {
- register int srcPos = filterPos[i];
-
- vector unsigned char src_vF = unaligned_load(srcPos, src);
- vector signed short src_vA = // vec_unpackh sign-extends...
- (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
- vector signed short src_vB = // vec_unpackh sign-extends...
- (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
- vector signed short filter_v0 = vec_ld(i << 5, filter);
- vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
-
- vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
- vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
-
- vector signed int val_s = vec_sums(val_v, vzero);
-
- VEC_ST(val_s, 0, tempo);
- dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
- }
- break;
-
- default:
- for (i = 0; i < dstW; i++) {
- register int j, offset = i * 2 * filterSize;
- register int srcPos = filterPos[i];
-
- vector signed int val_s, val_v = (vector signed int)vzero;
- vector signed short filter_v0R;
- vector unsigned char permF, src_v0, permS;
- FIRST_LOAD(filter_v0R, offset, filter, permF);
- FIRST_LOAD(src_v0, srcPos, src, permS);
-
- for (j = 0; j < filterSize - 15; j += 16) {
- vector unsigned char src_v1, src_vF;
- vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1;
- LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
- vector signed short src_vA = // vec_unpackh sign-extends...
- (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
- vector signed short src_vB = // vec_unpackh sign-extends...
- (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
- GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
- GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
-
- vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
- val_v = vec_msums(src_vB, filter_v1, val_acc);
- UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
- }
-
- if (j < filterSize - 7) {
- // loading src_v0 is useless, it's already done above
- vector unsigned char src_v1, src_vF;
- vector signed short src_v, filter_v1R, filter_v;
- LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
- src_v = // vec_unpackh sign-extends...
- (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
- GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
- val_v = vec_msums(src_v, filter_v, val_v);
- }
- val_s = vec_sums(val_v, vzero);
-
- VEC_ST(val_s, 0, tempo);
- dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
- }
- }
-}
-
-static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset, int start)
-{
- int i;
- for (i = start; i < dstW; i++) {
- int val = (src[i] + dither[(i + offset) & 7]) >> 7;
- dest[i] = av_clip_uint8(val);
- }
-}
-
-static void yuv2plane1_8_altivec(const int16_t *src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset)
-{
- const int dst_u = -(uintptr_t)dest & 15;
- int i, j;
- LOCAL_ALIGNED(16, int16_t, val, [16]);
- const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
- vector int16_t vi, vileft, ditherleft, ditherright;
- vector uint8_t vd;
-
- for (j = 0; j < 16; j++) {
- val[j] = dither[(dst_u + offset + j) & 7];
- }
-
- ditherleft = vec_ld(0, val);
- ditherright = vec_ld(0, &val[8]);
-
- yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
-
- for (i = dst_u; i < dstW - 15; i += 16) {
-
- vi = vec_vsx_ld(0, &src[i]);
- vi = vec_adds(ditherleft, vi);
- vileft = vec_sra(vi, shifts);
-
- vi = vec_vsx_ld(0, &src[i + 8]);
- vi = vec_adds(ditherright, vi);
- vi = vec_sra(vi, shifts);
-
- vd = vec_packsu(vileft, vi);
- vec_st(vd, 0, &dest[i]);
- }
-
- yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
-}
-
-#endif /* HAVE_ALTIVEC */
+#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */

av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
{
@@ -381,8 +112,9 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;

+#if HAVE_BIGENDIAN
if (c->srcBpc == 8 && c->dstBpc <= 14) {
- c->hyScale = c->hcScale = hScale_altivec_real;
+ c->hyScale = c->hcScale = hScale_real_altivec;
}
if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
@@ -390,6 +122,7 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
!c->needAlpha) {
c->yuv2planeX = yuv2planeX_altivec;
}
+#endif

/* The following list of supported dstFormat values should
* match what's found in the body of ff_yuv2packedX_altivec() */
@@ -414,12 +147,8 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
c->yuv2packedX = ff_yuv2rgb24_X_altivec;
break;
}
-
- switch (c->dstBpc) {
- case 8:
- c->yuv2plane1 = yuv2plane1_8_altivec;
- break;
- }
}
#endif /* HAVE_ALTIVEC */
+
+ ff_sws_init_swscale_vsx(c);
}
diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c
new file mode 100644
index 0000000..00e4b99
--- /dev/null
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -0,0 +1,217 @@
+/*
+ * AltiVec-enhanced yuv2yuvX
+ *
+ * Copyright (C) 2004 Romain Dolbeau <***@dolbeau.org>
+ * based on the equivalent C code in swscale.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest,
+ const uint8_t *dither, int offset, int x)
+{
+ register int i, j;
+ LOCAL_ALIGNED(16, int, val, [16]);
+ vector signed int vo1, vo2, vo3, vo4;
+ vector unsigned short vs1, vs2;
+ vector unsigned char vf;
+ vector unsigned int altivec_vectorShiftInt19 =
+ vec_add(vec_splat_u32(10), vec_splat_u32(9));
+
+ for (i = 0; i < 16; i++)
+ val[i] = dither[(x + i + offset) & 7] << 12;
+
+ vo1 = vec_ld(0, val);
+ vo2 = vec_ld(16, val);
+ vo3 = vec_ld(32, val);
+ vo4 = vec_ld(48, val);
+
+ for (j = 0; j < filterSize; j++) {
+ unsigned int joffset=j<<1;
+ unsigned int xoffset=x<<1;
+ vector unsigned char perm;
+ vector signed short l1,vLumFilter;
+ LOAD_FILTER(vLumFilter,filter);
+ vLumFilter = vec_splat(vLumFilter, 0);
+ LOAD_L1(l1,src[j],perm);
+ yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
+ yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
+ }
+
+ vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
+ vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
+ vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
+ vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
+ vs1 = vec_packsu(vo1, vo2);
+ vs2 = vec_packsu(vo3, vo4);
+ vf = vec_packsu(vs1, vs2);
+ VEC_ST(vf, 0, dest);
+}
+
+
+static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset, int x)
+{
+ int i, j;
+
+ for (i = x; i < dstW; i++) {
+ int t = dither[(i + offset) & 7] << 12;
+ for (j = 0; j < filterSize; j++)
+ t += src[j][i] * filter[j];
+ dest[i] = av_clip_uint8(t >> 19);
+ }
+}
+
+static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ int dst_u = -(uintptr_t)dest & 15;
+ int i;
+
+ yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
+
+ for (i = dst_u; i < dstW - 15; i += 16)
+ FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+ offset, i);
+
+ yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
+}
+
+static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ register int i;
+ LOCAL_ALIGNED(16, int, tempo, [4]);
+
+ if (filterSize % 4) {
+ for (i = 0; i < dstW; i++) {
+ register int j;
+ register int srcPos = filterPos[i];
+ register int val = 0;
+ for (j = 0; j < filterSize; j++)
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+ }
+ } else
+ switch (filterSize) {
+ case 4:
+ for (i = 0; i < dstW; i++) {
+ register int srcPos = filterPos[i];
+
+ vector unsigned char src_vF = unaligned_load(srcPos, src);
+ vector signed short src_v, filter_v;
+ vector signed int val_vEven, val_s;
+ src_v = // vec_unpackh sign-extends...
+ (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
+ // now put our elements in the even slots
+ src_v = vec_mergeh(src_v, (vector signed short)vzero);
+ GET_VF4(i, filter_v, filter);
+ val_vEven = vec_mule(src_v, filter_v);
+ val_s = vec_sums(val_vEven, vzero);
+ vec_st(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ break;
+ case 8:
+ for (i = 0; i < dstW; i++) {
+ register int srcPos = filterPos[i];
+ vector unsigned char src_vF, src_v0, src_v1;
+ vector unsigned char permS;
+ vector signed short src_v, filter_v;
+ vector signed int val_v, val_s;
+ FIRST_LOAD(src_v0, srcPos, src, permS);
+ LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
+ src_v = // vec_unpackh sign-extends...
+ (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
+ filter_v = vec_ld(i << 4, filter);
+ val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
+ val_s = vec_sums(val_v, vzero);
+ vec_st(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ break;
+
+ case 16:
+ for (i = 0; i < dstW; i++) {
+ register int srcPos = filterPos[i];
+
+ vector unsigned char src_vF = unaligned_load(srcPos, src);
+ vector signed short src_vA = // vec_unpackh sign-extends...
+ (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
+ vector signed short src_vB = // vec_unpackh sign-extends...
+ (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
+ vector signed short filter_v0 = vec_ld(i << 5, filter);
+ vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
+
+ vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
+ vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
+
+ vector signed int val_s = vec_sums(val_v, vzero);
+
+ VEC_ST(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ break;
+
+ default:
+ for (i = 0; i < dstW; i++) {
+ register int j, offset = i * 2 * filterSize;
+ register int srcPos = filterPos[i];
+
+ vector signed int val_s, val_v = (vector signed int)vzero;
+ vector signed short filter_v0R;
+ vector unsigned char permF, src_v0, permS;
+ FIRST_LOAD(filter_v0R, offset, filter, permF);
+ FIRST_LOAD(src_v0, srcPos, src, permS);
+
+ for (j = 0; j < filterSize - 15; j += 16) {
+ vector unsigned char src_v1, src_vF;
+ vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1;
+ LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
+ vector signed short src_vA = // vec_unpackh sign-extends...
+ (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
+ vector signed short src_vB = // vec_unpackh sign-extends...
+ (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
+ GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
+ GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
+
+ vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
+ val_v = vec_msums(src_vB, filter_v1, val_acc);
+ UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
+ }
+
+ if (j < filterSize - 7) {
+ // loading src_v0 is useless, it's already done above
+ vector unsigned char src_v1, src_vF;
+ vector signed short src_v, filter_v1R, filter_v;
+ LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
+ src_v = // vec_unpackh sign-extends...
+ (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
+ GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
+ val_v = vec_msums(src_v, filter_v, val_v);
+ }
+ val_s = vec_sums(val_v, vzero);
+
+ VEC_ST(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ }
+}
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
new file mode 100644
index 0000000..853b587
--- /dev/null
+++ b/libswscale/ppc/swscale_vsx.c
@@ -0,0 +1,164 @@
+/*
+ * AltiVec-enhanced yuv2yuvX
+ *
+ * Copyright (C) 2004 Romain Dolbeau <***@dolbeau.org>
+ * based on the equivalent C code in swscale.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "yuv2rgb_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#if HAVE_VSX
+#define vzero vec_splat_s32(0)
+
+#if !HAVE_BIGENDIAN
+#define GET_LS(a,b,c,s) {\
+ ls = a;\
+ a = vec_vsx_ld(((b) << 1) + 16, s);\
+ }
+
+#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
+ vector signed short ls;\
+ GET_LS(l1, x, perm, src);\
+ vector signed int i1 = vec_mule(filter, ls);\
+ vector signed int i2 = vec_mulo(filter, ls);\
+ vector signed int vf1, vf2;\
+ vf1 = vec_mergeh(i1, i2);\
+ vf2 = vec_mergel(i1, i2);\
+ d1 = vec_add(d1, vf1);\
+ d2 = vec_add(d2, vf2);\
+ } while (0)
+
+#define LOAD_FILTER(vf,f) {\
+ vf = vec_vsx_ld(joffset, f);\
+}
+#define LOAD_L1(ll1,s,p){\
+ ll1 = vec_vsx_ld(xoffset, s);\
+}
+
+// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
+
+// The neat trick: We only care for half the elements,
+// high or low depending on (i<<3)%16 (it's 0 or 8 here),
+// and we're going to use vec_mule, so we choose
+// carefully how to "unpack" the elements into the even slots.
+#define GET_VF4(a, vf, f) {\
+ vf = (vector signed short)vec_vsx_ld(a << 3, f);\
+ vf = vec_mergeh(vf, (vector signed short)vzero);\
+}
+#define FIRST_LOAD(sv, pos, s, per) {}
+#define UPDATE_PTR(s0, d0, s1, d1) {}
+#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
+ vf = vec_vsx_ld(pos + a, s);\
+}
+#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
+#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
+ vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
+}
+
+#define FUNC(name) name ## _vsx
+#include "swscale_ppc_template.c"
+#undef FUNC
+
+#endif /* !HAVE_BIGENDIAN */
+
+static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset, int start)
+{
+ int i;
+ for (i = start; i < dstW; i++) {
+ int val = (src[i] + dither[(i + offset) & 7]) >> 7;
+ dest[i] = av_clip_uint8(val);
+ }
+}
+
+static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ const int dst_u = -(uintptr_t)dest & 15;
+ int i, j;
+ LOCAL_ALIGNED(16, int16_t, val, [16]);
+ const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
+ vector int16_t vi, vileft, ditherleft, ditherright;
+ vector uint8_t vd;
+
+ for (j = 0; j < 16; j++) {
+ val[j] = dither[(dst_u + offset + j) & 7];
+ }
+
+ ditherleft = vec_ld(0, val);
+ ditherright = vec_ld(0, &val[8]);
+
+ yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
+
+ for (i = dst_u; i < dstW - 15; i += 16) {
+
+ vi = vec_vsx_ld(0, &src[i]);
+ vi = vec_adds(ditherleft, vi);
+ vileft = vec_sra(vi, shifts);
+
+ vi = vec_vsx_ld(0, &src[i + 8]);
+ vi = vec_adds(ditherright, vi);
+ vi = vec_sra(vi, shifts);
+
+ vd = vec_packsu(vileft, vi);
+ vec_st(vd, 0, &dest[i]);
+ }
+
+ yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
+}
+
+#endif /* HAVE_VSX */
+
+av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
+{
+#if HAVE_VSX
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ if (!(av_get_cpu_flags() & AV_CPU_FLAG_VSX))
+ return;
+
+#if !HAVE_BIGENDIAN
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
+ c->hyScale = c->hcScale = hScale_real_vsx;
+ }
+ if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
+ dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
+ dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
+ !c->needAlpha) {
+ c->yuv2planeX = yuv2planeX_vsx;
+ }
+#endif
+
+ if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
+ switch (c->dstBpc) {
+ case 8:
+ c->yuv2plane1 = yuv2plane1_8_vsx;
+ break;
+ }
+ }
+#endif /* HAVE_VSX */
+}
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 4fa5938..a59d127 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -868,6 +868,7 @@ void ff_sws_init_output_funcs(SwsContext *c,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX);
void ff_sws_init_swscale_ppc(SwsContext *c);
+void ff_sws_init_swscale_vsx(SwsContext *c);
void ff_sws_init_swscale_x86(SwsContext *c);
void ff_sws_init_swscale_aarch64(SwsContext *c);
void ff_sws_init_swscale_arm(SwsContext *c);
--
2.6.2
Lauri Kasanen
2018-11-30 06:59:45 UTC
On Mon, 26 Nov 2018 14:24:15 +0200
Post by Lauri Kasanen
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). Can anyone test BE?
Ping.

- Lauri
Lauri Kasanen
2018-11-30 12:05:26 UTC
On Fri, 30 Nov 2018 12:30:58 +0300
Post by Lauri Kasanen
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). Can anyone test BE?
Ping.
FATE becomes green as much as possible, I haven't performed any benchmarking though.
Thanks for testing. This patch is not expected to change performance;
it just moves functions around and puts them under proper VSX
guards.
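
For anyone skimming the diff, the mechanism is the usual "template .c" trick:
each arch file defines FUNC() and its load macros, then #includes the shared
body so the same code compiles once per variant. Below is a minimal,
self-contained sketch of the same token-pasting pattern; only FUNC and the
_altivec/_vsx suffixes are taken from the patch, the rest is simplified
placeholder code, and TEMPLATE_BODY merely stands in for the real #include of
swscale_ppc_template.c:

#include <stdio.h>

/* Stand-in for the shared template file: the body is written once,
 * in terms of FUNC(). */
#define TEMPLATE_BODY \
    static int FUNC(hscale)(int v) { return v >> 7; }

/* What swscale_altivec.c does (big-endian AltiVec build). */
#define FUNC(name) name ## _altivec
TEMPLATE_BODY
#undef FUNC

/* What swscale_vsx.c does (little-endian VSX build). */
#define FUNC(name) name ## _vsx
TEMPLATE_BODY
#undef FUNC

int main(void)
{
    /* Both instantiations coexist; the ff_sws_init_* functions then pick
     * which one to install at runtime based on the CPU flags. */
    printf("%d %d\n", hscale_altivec(1 << 10), hscale_vsx(1 << 10));
    return 0;
}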

- Lauri

PS: Your mail did not make it to the list, was it meant for me only?
Lauri Kasanen
2018-12-03 07:24:47 UTC
On Fri, 30 Nov 2018 14:05:26 +0200
Post by Lauri Kasanen
On Fri, 30 Nov 2018 12:30:58 +0300
Post by Lauri Kasanen
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). Can anyone test BE?
Ping.
FATE becomes green as much as possible, I haven't performed any benchmarking though.
Thanks for testing. This patch is not expected to change performance,
it's just moving functions around and putting them under proper VSX
guards.
Could this patch be applied? Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".

- Lauri
Michael Niedermayer
2018-12-04 01:58:59 UTC
Post by Lauri Kasanen
On Fri, 30 Nov 2018 14:05:26 +0200
Post by Lauri Kasanen
On Fri, 30 Nov 2018 12:30:58 +0300
Post by Lauri Kasanen
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). Can anyone test BE?
Ping.
FATE becomes green as much as possible, I haven't performed any benchmarking though.
Thanks for testing. This patch is not expected to change performance,
it's just moving functions around and putting them under proper VSX
guards.
Could this patch be applied?
will apply

thx
Post by Lauri Kasanen
Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".
- Lauri
_______________________________________________
ffmpeg-devel mailing list
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

You can kill me, but you cannot change the truth.
Michael Niedermayer
2018-12-04 02:21:30 UTC
Post by Lauri Kasanen
On Fri, 30 Nov 2018 14:05:26 +0200
[...]
Post by Lauri Kasanen
Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".
This IIUC has not been tested on BE yet

my ppc emulation setup is a bit broken, and my ppc hw i've not tried using
in years and it was not in good shape the last time i used it.
So i can't just quickly test this ...
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The real ebay dictionary, page 1
"Used only once" - "Some unspecified defect prevented a second use"
"In good condition" - "Can be repaird by experienced expert"
"As is" - "You wouldnt want it even if you were payed for it, if you knew ..."
Lauri Kasanen
2018-12-04 07:10:17 UTC
On Tue, 4 Dec 2018 03:21:30 +0100
Post by Michael Niedermayer
Post by Lauri Kasanen
Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".
This IIUC has not been tested on BE yet
my ppc emulation setup is a bit broken and my ppc hw ive not tried using
since years and it was not in good shape last i used it.
So i cant just quickly test this ...
Raptor offers free POWER9 VMs to open source projects. Since you're the
leader of ffmpeg, if you asked, I'm sure they'd give one or two for
ffmpeg build and fate testing.

Ref
https://mobile.twitter.com/RaptorCompSys/status/1067018060777832449?p=v
https://mobile.twitter.com/RaptorCompSys/status/1067029086273486848?p=v

"We offer free access to cloud VPS for libre software projects in
partnership with @Integricloud, would that help?"

"Contact ***@integricloud.com and tell them what you want to use a
VPS or two for. They will generally grant access to the resources."

(I'm developing on a POWER8 VM intended for devs, but ordered a
Blackbird from the cyber monday sale ;))

- Lauri
Dominik 'Rathann' Mierzejewski
2018-12-04 10:00:34 UTC
Post by Lauri Kasanen
On Tue, 4 Dec 2018 03:21:30 +0100
Post by Michael Niedermayer
Post by Lauri Kasanen
Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".
This IIUC has not been tested on BE yet
my ppc emulation setup is a bit broken and my ppc hw ive not tried using
since years and it was not in good shape last i used it.
So i cant just quickly test this ...
Raptor offers free POWER9 VMs to open source projects. Since you're the
leader of ffmpeg, if you asked, I'm sure they'd give one or two for
ffmpeg build and fate testing.
Ref
https://mobile.twitter.com/RaptorCompSys/status/1067018060777832449?p=v
https://mobile.twitter.com/RaptorCompSys/status/1067029086273486848?p=v
"We offer free access to cloud VPS for libre software projects in
VPS or two for. They will generally grant access to the resources."
(I'm developing on a POWER8 VM intended for devs, but ordered a
Blackbird from the cyber monday sale ;))
Red Hat also offers Power 8 VMs, both BE and LE, to open source projects:
http://research.redhat.com/powerlinux-openpower-development-hosting/

Regards,
Dominik
--
Fedora https://getfedora.org | RPMFusion http://rpmfusion.org
There should be a science of discontent. People need hard times and
oppression to develop psychic muscles.
-- from "Collected Sayings of Muad'Dib" by the Princess Irulan
Michael Niedermayer
2018-12-04 13:27:22 UTC
Post by Dominik 'Rathann' Mierzejewski
Post by Lauri Kasanen
On Tue, 4 Dec 2018 03:21:30 +0100
Post by Michael Niedermayer
Post by Lauri Kasanen
Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".
This IIUC has not been tested on BE yet
my ppc emulation setup is a bit broken and my ppc hw ive not tried using
since years and it was not in good shape last i used it.
So i cant just quickly test this ...
Raptor offers free POWER9 VMs to open source projects. Since you're the
leader of ffmpeg, if you asked, I'm sure they'd give one or two for
ffmpeg build and fate testing.
Ref
https://mobile.twitter.com/RaptorCompSys/status/1067018060777832449?p=v
https://mobile.twitter.com/RaptorCompSys/status/1067029086273486848?p=v
"We offer free access to cloud VPS for libre software projects in
VPS or two for. They will generally grant access to the resources."
(I'm developing on a POWER8 VM intended for devs, but ordered a
Blackbird from the cyber monday sale ;))
http://research.redhat.com/powerlinux-openpower-development-hosting/
these are more suggestions than i expected :)
but i just got cross build working again and i also just eliminated a
mysterious ld.so related segfault
ATM i am re-running fate with a freshly rebuilt qemu
(the past one had an issue with altivec)

so maybe i'll be able to build and test ppc BE locally soon ... or maybe not ;)

[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Breaking DRM is a little like attempting to break through a door even
though the window is wide open and the only thing in the house is a bunch
of things you dont want and which you would get tomorrow for free anyway
Michael Niedermayer
2018-12-06 20:47:18 UTC
Post by Michael Niedermayer
Post by Dominik 'Rathann' Mierzejewski
Post by Lauri Kasanen
On Tue, 4 Dec 2018 03:21:30 +0100
Post by Michael Niedermayer
Post by Lauri Kasanen
Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".
This IIUC has not been tested on BE yet
my ppc emulation setup is a bit broken and my ppc hw ive not tried using
since years and it was not in good shape last i used it.
So i cant just quickly test this ...
Raptor offers free POWER9 VMs to open source projects. Since you're the
leader of ffmpeg, if you asked, I'm sure they'd give one or two for
ffmpeg build and fate testing.
Ref
https://mobile.twitter.com/RaptorCompSys/status/1067018060777832449?p=v
https://mobile.twitter.com/RaptorCompSys/status/1067029086273486848?p=v
"We offer free access to cloud VPS for libre software projects in
VPS or two for. They will generally grant access to the resources."
(I'm developing on a POWER8 VM intended for devs, but ordered a
Blackbird from the cyber monday sale ;))
http://research.redhat.com/powerlinux-openpower-development-hosting/
these are more suggestions than i expected :)
but i just got cross build working again and i also just eliminated a
mysterious ld.so related segfault
ATM iam re rerunning fate with a freshly rebuilt qemu
(the past one had an issue with altivec)
i have the ppc cross build with qemu partly working,
but it appears gcc or something is just buggy.
for example, this:
#include <math.h>
#include <stdio.h>

int main(void) {
    float f = 0.0/0.0;          /* NaN */
    printf("%f\n", floor(f));   /* floor(NaN) is NaN, so this should print "nan" */
    return 0;
}

produces:
510423550381407695195061911147652317184.000000

but if built with -O1 it produces this:
nan

that's with powerpc-linux-gnu-gcc-4.8 (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4

also,
gcc -O1 behaves differently from literally listing all the options that -O1
is documented to turn on

the remaining issues are:
make: *** [fate-eval] Error 1
make: *** [fate-filter-tremolo] Error 1
make: *** [fate-filter-refcmp-psnr-rgb] Error 1
make: *** [fate-parseutils] Error 1

where eval and parseutils appear to be issues with non-finite numbers;
not sure if i am missing something ...

[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The real ebay dictionary, page 1
"Used only once" - "Some unspecified defect prevented a second use"
"In good condition" - "Can be repaird by experienced expert"
"As is" - "You wouldnt want it even if you were payed for it, if you knew ..."
Lauri Kasanen
2018-12-11 07:10:02 UTC
On Thu, 6 Dec 2018 21:47:18 +0100
Post by Michael Niedermayer
Post by Michael Niedermayer
Post by Michael Niedermayer
Post by Lauri Kasanen
Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".
This IIUC has not been tested on BE yet
my ppc emulation setup is a bit broken and my ppc hw ive not tried using
since years and it was not in good shape last i used it.
So i cant just quickly test this ...
these are more suggestions than i expected :)
but i just got cross build working again and i also just eliminated a
mysterious ld.so related segfault
ATM iam re rerunning fate with a freshly rebuilt qemu
(the past one had an issue with altivec)
i have cross build with ppc and qemu partly working
but it appears gcc or something is just buggy
Hi,

Carl Eugen Hoyos reported that it builds fine on BE, with the guards
in the right place so BE is unaffected. How are things on your side?

- Lauri

Carl Eugen Hoyos
2018-12-06 21:38:50 UTC
Post by Lauri Kasanen
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation"
applied). Can anyone test BE?
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
new file mode 100644
index 0000000..853b587
--- /dev/null
+++ b/libswscale/ppc/swscale_vsx.c
@@ -0,0 +1,164 @@
+/*
+ * AltiVec-enhanced yuv2yuvX
+ *
+ * based on the equivalent C code in swscale.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "yuv2rgb_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#if HAVE_VSX
+#define vzero vec_splat_s32(0)
+
+#if !HAVE_BIGENDIAN
I changed these two #if's and the ones at the bottom of the file into "#if 1"
and tested on Power7 Linux BE; fate passes (and fails if I break the
function).
The problem is that HAVE_VSX is 0 on that system. How can this be fixed?
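
(For context, the interaction being described, boiled down from the patch; this
is only an illustration of the guard structure, not a proposed fix. With
HAVE_VSX set to 0 at configure time, the runtime AV_CPU_FLAG_VSX check below
is compiled out entirely, so none of the *_vsx function pointers can ever be
installed on such a build.)

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libswscale/swscale_internal.h"

av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
{
#if HAVE_VSX                                      /* compile-time: set by configure */
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_VSX))  /* run-time: CPU detection */
        return;
    /* ... install the *_vsx scaler/output function pointers ... */
#endif
}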

Carl Eugen