diff --git a/xs/CMakeLists.txt b/xs/CMakeLists.txt index 73b7cae37a..9de06a6628 100644 --- a/xs/CMakeLists.txt +++ b/xs/CMakeLists.txt @@ -351,13 +351,18 @@ if(RASTERIZER_USE_SYSTEM_LIBPNG) target_compile_definitions(rasterizer PRIVATE ${PNG_DEFINITIONS}) else() add_subdirectory( ${LIBDIR}/png/zlib) - add_subdirectory( ${LIBDIR}/png/libpng ) - target_include_directories( png_static PRIVATE + set(ZLIB_INCLUDE_DIR ${LIBDIR}/png/zlib ${CMAKE_CURRENT_BINARY_DIR}/src/png/zlib ) + add_subdirectory( ${LIBDIR}/png/libpng ) + + target_include_directories(rasterizer PRIVATE + ${LIBDIR}/png/libpng + ${CMAKE_CURRENT_BINARY_DIR}/src/png/libpng + ) target_link_libraries(rasterizer PRIVATE png_static zlibstatic) endif() diff --git a/xs/src/png/libpng/arm/arm_init.c b/xs/src/png/libpng/arm/arm_init.c new file mode 100644 index 0000000000..02df812e77 --- /dev/null +++ b/xs/src/png/libpng/arm/arm_init.c @@ -0,0 +1,135 @@ + +/* arm_init.c - NEON optimised filter functions + * + * Copyright (c) 2014,2016 Glenn Randers-Pehrson + * Written by Mans Rullgard, 2011. + * Last changed in libpng 1.6.22 [May 26, 2016] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ +/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are + * called. + */ +#define _POSIX_SOURCE 1 + +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_ARM_NEON_OPT > 0 +#ifdef PNG_ARM_NEON_CHECK_SUPPORTED /* Do run-time checks */ +/* WARNING: it is strongly recommended that you do not build libpng with + * run-time checks for CPU features if at all possible. In the case of the ARM + * NEON instructions there is no processor-specific way of detecting the + * presence of the required support, therefore run-time detection is extremely + * OS specific. + * + * You may set the macro PNG_ARM_NEON_FILE to the file name of file containing + * a fragment of C source code which defines the png_have_neon function. There + * are a number of implementations in contrib/arm-neon, but the only one that + * has partial support is contrib/arm-neon/linux.c - a generic Linux + * implementation which reads /proc/cpufino. + */ +#ifndef PNG_ARM_NEON_FILE +# ifdef __linux__ +# define PNG_ARM_NEON_FILE "contrib/arm-neon/linux.c" +# endif +#endif + +#ifdef PNG_ARM_NEON_FILE + +#include /* for sig_atomic_t */ +static int png_have_neon(png_structp png_ptr); +#include PNG_ARM_NEON_FILE + +#else /* PNG_ARM_NEON_FILE */ +# error "PNG_ARM_NEON_FILE undefined: no support for run-time ARM NEON checks" +#endif /* PNG_ARM_NEON_FILE */ +#endif /* PNG_ARM_NEON_CHECK_SUPPORTED */ + +#ifndef PNG_ALIGNED_MEMORY_SUPPORTED +# error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED" +#endif + +void +png_init_filter_functions_neon(png_structp pp, unsigned int bpp) +{ + /* The switch statement is compiled in for ARM_NEON_API, the call to + * png_have_neon is compiled in for ARM_NEON_CHECK. If both are defined + * the check is only performed if the API has not set the NEON option on + * or off explicitly. In this case the check controls what happens. + * + * If the CHECK is not compiled in and the option is UNSET the behavior prior + * to 1.6.7 was to use the NEON code - this was a bug caused by having the + * wrong order of the 'ON' and 'default' cases. UNSET now defaults to OFF, + * as documented in png.h + */ + png_debug(1, "in png_init_filter_functions_neon"); +#ifdef PNG_ARM_NEON_API_SUPPORTED + switch ((pp->options >> PNG_ARM_NEON) & 3) + { + case PNG_OPTION_UNSET: + /* Allow the run-time check to execute if it has been enabled - + * thus both API and CHECK can be turned on. If it isn't supported + * this case will fall through to the 'default' below, which just + * returns. + */ +#endif /* PNG_ARM_NEON_API_SUPPORTED */ +#ifdef PNG_ARM_NEON_CHECK_SUPPORTED + { + static volatile sig_atomic_t no_neon = -1; /* not checked */ + + if (no_neon < 0) + no_neon = !png_have_neon(pp); + + if (no_neon) + return; + } +#ifdef PNG_ARM_NEON_API_SUPPORTED + break; +#endif +#endif /* PNG_ARM_NEON_CHECK_SUPPORTED */ + +#ifdef PNG_ARM_NEON_API_SUPPORTED + default: /* OFF or INVALID */ + return; + + case PNG_OPTION_ON: + /* Option turned on */ + break; + } +#endif + + /* IMPORTANT: any new external functions used here must be declared using + * PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the + * 'prefix' option to configure works: + * + * ./configure --with-libpng-prefix=foobar_ + * + * Verify you have got this right by running the above command, doing a build + * and examining pngprefix.h; it must contain a #define for every external + * function you add. (Notice that this happens automatically for the + * initialization function.) + */ + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon; + + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = + png_read_filter_row_paeth3_neon; + } + + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_neon; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = + png_read_filter_row_paeth4_neon; + } +} +#endif /* PNG_ARM_NEON_OPT > 0 */ +#endif /* READ */ diff --git a/xs/src/png/libpng/arm/filter_neon.S b/xs/src/png/libpng/arm/filter_neon.S new file mode 100644 index 0000000000..000764cd21 --- /dev/null +++ b/xs/src/png/libpng/arm/filter_neon.S @@ -0,0 +1,253 @@ + +/* filter_neon.S - NEON optimised filter functions + * + * Copyright (c) 2014,2017 Glenn Randers-Pehrson + * Written by Mans Rullgard, 2011. + * Last changed in libpng 1.6.31 [July 27, 2017] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +/* This is required to get the symbol renames, which are #defines, and the + * definitions (or not) of PNG_ARM_NEON_OPT and PNG_ARM_NEON_IMPLEMENTATION. + */ +#define PNG_VERSION_INFO_ONLY +#include "../pngpriv.h" + +#if (defined(__linux__) || defined(__FreeBSD__)) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ +#endif + +#ifdef PNG_READ_SUPPORTED + +/* Assembler NEON support - only works for 32-bit ARM (i.e. it does not work for + * ARM64). The code in arm/filter_neon_intrinsics.c supports ARM64, however it + * only works if -mfpu=neon is specified on the GCC command line. See pngpriv.h + * for the logic which sets PNG_USE_ARM_NEON_ASM: + */ +#if PNG_ARM_NEON_IMPLEMENTATION == 2 /* hand-coded assembler */ + +#if PNG_ARM_NEON_OPT > 0 + +#ifdef __ELF__ +# define ELF +#else +# define ELF @ +#endif + + .arch armv7-a + .fpu neon + +.macro func name, export=0 + .macro endfunc +ELF .size \name, . - \name + .endfunc + .purgem endfunc + .endm + .text + + /* Explicitly specifying alignment here because some versions of + * GAS don't align code correctly. This is harmless in correctly + * written versions of GAS. + */ + .align 2 + + .if \export + .global \name + .endif +ELF .type \name, STT_FUNC + .func \name +\name: +.endm + +func png_read_filter_row_sub4_neon, export=1 + ldr r3, [r0, #4] @ rowbytes + vmov.i8 d3, #0 +1: + vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] + vadd.u8 d0, d3, d4 + vadd.u8 d1, d0, d5 + vadd.u8 d2, d1, d6 + vadd.u8 d3, d2, d7 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! + subs r3, r3, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_sub3_neon, export=1 + ldr r3, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + mov r0, r1 + mov r2, #3 + mov r12, #12 + vld1.8 {q11}, [r0], r12 +1: + vext.8 d5, d22, d23, #3 + vadd.u8 d0, d3, d22 + vext.8 d6, d22, d23, #6 + vadd.u8 d1, d0, d5 + vext.8 d7, d23, d23, #1 + vld1.8 {q11}, [r0], r12 + vst1.32 {d0[0]}, [r1,:32], r2 + vadd.u8 d2, d1, d6 + vst1.32 {d1[0]}, [r1], r2 + vadd.u8 d3, d2, d7 + vst1.32 {d2[0]}, [r1], r2 + vst1.32 {d3[0]}, [r1], r2 + subs r3, r3, #12 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_up_neon, export=1 + ldr r3, [r0, #4] @ rowbytes +1: + vld1.8 {q0}, [r1,:128] + vld1.8 {q1}, [r2,:128]! + vadd.u8 q0, q0, q1 + vst1.8 {q0}, [r1,:128]! + subs r3, r3, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_avg4_neon, export=1 + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 +1: + vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] + vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]! + vhadd.u8 d0, d3, d16 + vadd.u8 d0, d0, d4 + vhadd.u8 d1, d0, d17 + vadd.u8 d1, d1, d5 + vhadd.u8 d2, d1, d18 + vadd.u8 d2, d2, d6 + vhadd.u8 d3, d2, d19 + vadd.u8 d3, d3, d7 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! + subs r12, r12, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_avg3_neon, export=1 + push {r4,lr} + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + mov r0, r1 + mov r4, #3 + mov lr, #12 + vld1.8 {q11}, [r0], lr +1: + vld1.8 {q10}, [r2], lr + vext.8 d5, d22, d23, #3 + vhadd.u8 d0, d3, d20 + vext.8 d17, d20, d21, #3 + vadd.u8 d0, d0, d22 + vext.8 d6, d22, d23, #6 + vhadd.u8 d1, d0, d17 + vext.8 d18, d20, d21, #6 + vadd.u8 d1, d1, d5 + vext.8 d7, d23, d23, #1 + vld1.8 {q11}, [r0], lr + vst1.32 {d0[0]}, [r1,:32], r4 + vhadd.u8 d2, d1, d18 + vst1.32 {d1[0]}, [r1], r4 + vext.8 d19, d21, d21, #1 + vadd.u8 d2, d2, d6 + vhadd.u8 d3, d2, d19 + vst1.32 {d2[0]}, [r1], r4 + vadd.u8 d3, d3, d7 + vst1.32 {d3[0]}, [r1], r4 + subs r12, r12, #12 + bgt 1b + + pop {r4,pc} +endfunc + +.macro paeth rx, ra, rb, rc + vaddl.u8 q12, \ra, \rb @ a + b + vaddl.u8 q15, \rc, \rc @ 2*c + vabdl.u8 q13, \rb, \rc @ pa + vabdl.u8 q14, \ra, \rc @ pb + vabd.u16 q15, q12, q15 @ pc + vcle.u16 q12, q13, q14 @ pa <= pb + vcle.u16 q13, q13, q15 @ pa <= pc + vcle.u16 q14, q14, q15 @ pb <= pc + vand q12, q12, q13 @ pa <= pb && pa <= pc + vmovn.u16 d28, q14 + vmovn.u16 \rx, q12 + vbsl d28, \rb, \rc + vbsl \rx, \ra, d28 +.endm + +func png_read_filter_row_paeth4_neon, export=1 + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + vmov.i8 d20, #0 +1: + vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] + vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]! + paeth d0, d3, d16, d20 + vadd.u8 d0, d0, d4 + paeth d1, d0, d17, d16 + vadd.u8 d1, d1, d5 + paeth d2, d1, d18, d17 + vadd.u8 d2, d2, d6 + paeth d3, d2, d19, d18 + vmov d20, d19 + vadd.u8 d3, d3, d7 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! + subs r12, r12, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_paeth3_neon, export=1 + push {r4,lr} + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + vmov.i8 d4, #0 + mov r0, r1 + mov r4, #3 + mov lr, #12 + vld1.8 {q11}, [r0], lr +1: + vld1.8 {q10}, [r2], lr + paeth d0, d3, d20, d4 + vext.8 d5, d22, d23, #3 + vadd.u8 d0, d0, d22 + vext.8 d17, d20, d21, #3 + paeth d1, d0, d17, d20 + vst1.32 {d0[0]}, [r1,:32], r4 + vext.8 d6, d22, d23, #6 + vadd.u8 d1, d1, d5 + vext.8 d18, d20, d21, #6 + paeth d2, d1, d18, d17 + vext.8 d7, d23, d23, #1 + vld1.8 {q11}, [r0], lr + vst1.32 {d1[0]}, [r1], r4 + vadd.u8 d2, d2, d6 + vext.8 d19, d21, d21, #1 + paeth d3, d2, d19, d18 + vst1.32 {d2[0]}, [r1], r4 + vmov d4, d19 + vadd.u8 d3, d3, d7 + vst1.32 {d3[0]}, [r1], r4 + subs r12, r12, #12 + bgt 1b + + pop {r4,pc} +endfunc +#endif /* PNG_ARM_NEON_OPT > 0 */ +#endif /* PNG_ARM_NEON_IMPLEMENTATION == 2 (assembler) */ +#endif /* READ */ diff --git a/xs/src/png/libpng/arm/filter_neon_intrinsics.c b/xs/src/png/libpng/arm/filter_neon_intrinsics.c new file mode 100644 index 0000000000..ea7e356bcc --- /dev/null +++ b/xs/src/png/libpng/arm/filter_neon_intrinsics.c @@ -0,0 +1,387 @@ + +/* filter_neon_intrinsics.c - NEON optimised filter functions + * + * Copyright (c) 2014,2016 Glenn Randers-Pehrson + * Written by James Yu , October 2013. + * Based on filter_neon.S, written by Mans Rullgard, 2011. + * + * Last changed in libpng 1.6.22 [May 26, 2016] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +/* This code requires -mfpu=neon on the command line: */ +#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ + +#include + +/* libpng row pointers are not necessarily aligned to any particular boundary, + * however this code will only work with appropriate alignment. arm/arm_init.c + * checks for this (and will not compile unless it is done). This code uses + * variants of png_aligncast to avoid compiler warnings. + */ +#define png_ptr(type,pointer) png_aligncast(type *,pointer) +#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) + +/* The following relies on a variable 'temp_pointer' being declared with type + * 'type'. This is written this way just to hide the GCC strict aliasing + * warning; note that the code is safe because there never is an alias between + * the input and output pointers. + */ +#define png_ldr(type,pointer)\ + (temp_pointer = png_ptr(type,pointer), *temp_pointer) + +#if PNG_ARM_NEON_OPT > 0 + +void +png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_bytep rp = row; + png_bytep rp_stop = row + row_info->rowbytes; + png_const_bytep pp = prev_row; + + png_debug(1, "in png_read_filter_row_up_neon"); + + for (; rp < rp_stop; rp += 16, pp += 16) + { + uint8x16_t qrp, qpp; + + qrp = vld1q_u8(rp); + qpp = vld1q_u8(pp); + qrp = vaddq_u8(qrp, qpp); + vst1q_u8(rp, qrp); + } +} + +void +png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_bytep rp = row; + png_bytep rp_stop = row + row_info->rowbytes; + + uint8x16_t vtmp = vld1q_u8(rp); + uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp); + uint8x8x2_t vrp = *vrpt; + + uint8x8x4_t vdest; + vdest.val[3] = vdup_n_u8(0); + + png_debug(1, "in png_read_filter_row_sub3_neon"); + + for (; rp < rp_stop;) + { + uint8x8_t vtmp1, vtmp2; + uint32x2_t *temp_pointer; + + vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); + vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); + vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6); + vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); + + vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); + vdest.val[2] = vadd_u8(vdest.val[1], vtmp2); + vdest.val[3] = vadd_u8(vdest.val[2], vtmp1); + + vtmp = vld1q_u8(rp + 12); + vrpt = png_ptr(uint8x8x2_t, &vtmp); + vrp = *vrpt; + + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); + rp += 3; + } + + PNG_UNUSED(prev_row) +} + +void +png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_bytep rp = row; + png_bytep rp_stop = row + row_info->rowbytes; + + uint8x8x4_t vdest; + vdest.val[3] = vdup_n_u8(0); + + png_debug(1, "in png_read_filter_row_sub4_neon"); + + for (; rp < rp_stop; rp += 16) + { + uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp)); + uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp); + uint8x8x4_t vrp = *vrpt; + uint32x2x4_t *temp_pointer; + + vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); + vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]); + vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]); + vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]); + vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); + } + + PNG_UNUSED(prev_row) +} + +void +png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_bytep rp = row; + png_const_bytep pp = prev_row; + png_bytep rp_stop = row + row_info->rowbytes; + + uint8x16_t vtmp; + uint8x8x2_t *vrpt; + uint8x8x2_t vrp; + uint8x8x4_t vdest; + vdest.val[3] = vdup_n_u8(0); + + vtmp = vld1q_u8(rp); + vrpt = png_ptr(uint8x8x2_t,&vtmp); + vrp = *vrpt; + + png_debug(1, "in png_read_filter_row_avg3_neon"); + + for (; rp < rp_stop; pp += 12) + { + uint8x8_t vtmp1, vtmp2, vtmp3; + + uint8x8x2_t *vppt; + uint8x8x2_t vpp; + + uint32x2_t *temp_pointer; + + vtmp = vld1q_u8(pp); + vppt = png_ptr(uint8x8x2_t,&vtmp); + vpp = *vppt; + + vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); + vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); + vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); + + vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); + vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6); + vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); + vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); + + vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6); + vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); + + vtmp = vld1q_u8(rp + 12); + vrpt = png_ptr(uint8x8x2_t,&vtmp); + vrp = *vrpt; + + vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2); + vdest.val[2] = vadd_u8(vdest.val[2], vtmp3); + + vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); + + vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2); + vdest.val[3] = vadd_u8(vdest.val[3], vtmp1); + + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); + rp += 3; + } +} + +void +png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_bytep rp = row; + png_bytep rp_stop = row + row_info->rowbytes; + png_const_bytep pp = prev_row; + + uint8x8x4_t vdest; + vdest.val[3] = vdup_n_u8(0); + + png_debug(1, "in png_read_filter_row_avg4_neon"); + + for (; rp < rp_stop; rp += 16, pp += 16) + { + uint32x2x4_t vtmp; + uint8x8x4_t *vrpt, *vppt; + uint8x8x4_t vrp, vpp; + uint32x2x4_t *temp_pointer; + + vtmp = vld4_u32(png_ptr(uint32_t,rp)); + vrpt = png_ptr(uint8x8x4_t,&vtmp); + vrp = *vrpt; + vtmp = vld4_u32(png_ptrc(uint32_t,pp)); + vppt = png_ptr(uint8x8x4_t,&vtmp); + vpp = *vppt; + + vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); + vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); + vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]); + vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); + vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]); + vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]); + vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]); + vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]); + + vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); + } +} + +static uint8x8_t +paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c) +{ + uint8x8_t d, e; + uint16x8_t p1, pa, pb, pc; + + p1 = vaddl_u8(a, b); /* a + b */ + pc = vaddl_u8(c, c); /* c * 2 */ + pa = vabdl_u8(b, c); /* pa */ + pb = vabdl_u8(a, c); /* pb */ + pc = vabdq_u16(p1, pc); /* pc */ + + p1 = vcleq_u16(pa, pb); /* pa <= pb */ + pa = vcleq_u16(pa, pc); /* pa <= pc */ + pb = vcleq_u16(pb, pc); /* pb <= pc */ + + p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */ + + d = vmovn_u16(pb); + e = vmovn_u16(p1); + + d = vbsl_u8(d, b, c); + e = vbsl_u8(e, a, d); + + return e; +} + +void +png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_bytep rp = row; + png_const_bytep pp = prev_row; + png_bytep rp_stop = row + row_info->rowbytes; + + uint8x16_t vtmp; + uint8x8x2_t *vrpt; + uint8x8x2_t vrp; + uint8x8_t vlast = vdup_n_u8(0); + uint8x8x4_t vdest; + vdest.val[3] = vdup_n_u8(0); + + vtmp = vld1q_u8(rp); + vrpt = png_ptr(uint8x8x2_t,&vtmp); + vrp = *vrpt; + + png_debug(1, "in png_read_filter_row_paeth3_neon"); + + for (; rp < rp_stop; pp += 12) + { + uint8x8x2_t *vppt; + uint8x8x2_t vpp; + uint8x8_t vtmp1, vtmp2, vtmp3; + uint32x2_t *temp_pointer; + + vtmp = vld1q_u8(pp); + vppt = png_ptr(uint8x8x2_t,&vtmp); + vpp = *vppt; + + vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); + vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); + + vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); + vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); + vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); + vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); + + vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6); + vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6); + vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2); + vdest.val[2] = vadd_u8(vdest.val[2], vtmp1); + + vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); + vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); + + vtmp = vld1q_u8(rp + 12); + vrpt = png_ptr(uint8x8x2_t,&vtmp); + vrp = *vrpt; + + vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3); + vdest.val[3] = vadd_u8(vdest.val[3], vtmp1); + + vlast = vtmp2; + + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); + rp += 3; + vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); + rp += 3; + } +} + +void +png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_bytep rp = row; + png_bytep rp_stop = row + row_info->rowbytes; + png_const_bytep pp = prev_row; + + uint8x8_t vlast = vdup_n_u8(0); + uint8x8x4_t vdest; + vdest.val[3] = vdup_n_u8(0); + + png_debug(1, "in png_read_filter_row_paeth4_neon"); + + for (; rp < rp_stop; rp += 16, pp += 16) + { + uint32x2x4_t vtmp; + uint8x8x4_t *vrpt, *vppt; + uint8x8x4_t vrp, vpp; + uint32x2x4_t *temp_pointer; + + vtmp = vld4_u32(png_ptr(uint32_t,rp)); + vrpt = png_ptr(uint8x8x4_t,&vtmp); + vrp = *vrpt; + vtmp = vld4_u32(png_ptrc(uint32_t,pp)); + vppt = png_ptr(uint8x8x4_t,&vtmp); + vpp = *vppt; + + vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); + vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); + vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); + vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); + vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]); + vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]); + vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]); + vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]); + + vlast = vpp.val[3]; + + vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); + } +} + +#endif /* PNG_ARM_NEON_OPT > 0 */ +#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */ +#endif /* READ */ diff --git a/xs/src/png/libpng/intel/filter_sse2_intrinsics.c b/xs/src/png/libpng/intel/filter_sse2_intrinsics.c new file mode 100644 index 0000000000..5e8553fbb9 --- /dev/null +++ b/xs/src/png/libpng/intel/filter_sse2_intrinsics.c @@ -0,0 +1,406 @@ + +/* filter_sse2_intrinsics.c - SSE2 optimized filter functions + * + * Copyright (c) 2016-2017 Glenn Randers-Pehrson + * Written by Mike Klein and Matt Sarett + * Derived from arm/filter_neon_intrinsics.c + * + * Last changed in libpng 1.6.31 [July 27, 2017] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_INTEL_SSE_IMPLEMENTATION > 0 + +#include + +/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). + * They're positioned like this: + * prev: c b + * row: a d + * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be + * whichever of a, b, or c is closest to p=a+b-c. + */ + +static __m128i load4(const void* p) { + return _mm_cvtsi32_si128(*(const int*)p); +} + +static void store4(void* p, __m128i v) { + *(int*)p = _mm_cvtsi128_si32(v); +} + +static __m128i load3(const void* p) { + /* We'll load 2 bytes, then 1 byte, + * then mask them together, and finally load into SSE. + */ + const png_uint_16* p01 = (png_const_uint_16p)p; + const png_byte* p2 = (const png_byte*)(p01+1); + + png_uint_32 v012 = (png_uint_32)(*p01) + | (png_uint_32)(*p2) << 16; + return load4(&v012); +} + +static void store3(void* p, __m128i v) { + /* We'll pull from SSE as a 32-bit int, then write + * its bottom two bytes, then its third byte. + */ + png_uint_32 v012; + png_uint_16* p01; + png_byte* p2; + + store4(&v012, v); + + p01 = (png_uint_16p)p; + p2 = (png_byte*)(p01+1); + *p01 = (png_uint_16)v012; + *p2 = (png_byte)(v012 >> 16); +} + +void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Sub filter predicts each pixel as the previous pixel, a. + * There is no pixel to the left of the first pixel. It's encoded directly. + * That works with our main loop if we just say that left pixel was zero. + */ + png_size_t rb; + + __m128i a, d = _mm_setzero_si128(); + + png_debug(1, "in png_read_filter_row_sub3_sse2"); + + rb = row_info->rowbytes; + while (rb >= 4) { + a = d; d = load4(row); + d = _mm_add_epi8(d, a); + store3(row, d); + + row += 3; + rb -= 3; + } + if (rb > 0) { + a = d; d = load3(row); + d = _mm_add_epi8(d, a); + store3(row, d); + + row += 3; + rb -= 3; + } + PNG_UNUSED(prev) +} + +void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Sub filter predicts each pixel as the previous pixel, a. + * There is no pixel to the left of the first pixel. It's encoded directly. + * That works with our main loop if we just say that left pixel was zero. + */ + png_size_t rb; + + __m128i a, d = _mm_setzero_si128(); + + png_debug(1, "in png_read_filter_row_sub4_sse2"); + + rb = row_info->rowbytes+4; + while (rb > 4) { + a = d; d = load4(row); + d = _mm_add_epi8(d, a); + store4(row, d); + + row += 4; + rb -= 4; + } + PNG_UNUSED(prev) +} + +void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Avg filter predicts each pixel as the (truncated) average of a and b. + * There's no pixel to the left of the first pixel. Luckily, it's + * predicted to be half of the pixel above it. So again, this works + * perfectly with our loop if we make sure a starts at zero. + */ + + png_size_t rb; + + const __m128i zero = _mm_setzero_si128(); + + __m128i b; + __m128i a, d = zero; + + png_debug(1, "in png_read_filter_row_avg3_sse2"); + rb = row_info->rowbytes; + while (rb >= 4) { + __m128i avg; + b = load4(prev); + a = d; d = load4(row ); + + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ + avg = _mm_avg_epu8(a,b); + /* ...but we can fix it up by subtracting off 1 if it rounded up. */ + avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), + _mm_set1_epi8(1))); + d = _mm_add_epi8(d, avg); + store3(row, d); + + prev += 3; + row += 3; + rb -= 3; + } + if (rb > 0) { + __m128i avg; + b = load3(prev); + a = d; d = load3(row ); + + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ + avg = _mm_avg_epu8(a,b); + /* ...but we can fix it up by subtracting off 1 if it rounded up. */ + avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), + _mm_set1_epi8(1))); + + d = _mm_add_epi8(d, avg); + store3(row, d); + + prev += 3; + row += 3; + rb -= 3; + } +} + +void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Avg filter predicts each pixel as the (truncated) average of a and b. + * There's no pixel to the left of the first pixel. Luckily, it's + * predicted to be half of the pixel above it. So again, this works + * perfectly with our loop if we make sure a starts at zero. + */ + png_size_t rb; + const __m128i zero = _mm_setzero_si128(); + __m128i b; + __m128i a, d = zero; + + png_debug(1, "in png_read_filter_row_avg4_sse2"); + + rb = row_info->rowbytes+4; + while (rb > 4) { + __m128i avg; + b = load4(prev); + a = d; d = load4(row ); + + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ + avg = _mm_avg_epu8(a,b); + /* ...but we can fix it up by subtracting off 1 if it rounded up. */ + avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), + _mm_set1_epi8(1))); + + d = _mm_add_epi8(d, avg); + store4(row, d); + + prev += 4; + row += 4; + rb -= 4; + } +} + +/* Returns |x| for 16-bit lanes. */ +static __m128i abs_i16(__m128i x) { +#if PNG_INTEL_SSE_IMPLEMENTATION >= 2 + return _mm_abs_epi16(x); +#else + /* Read this all as, return x<0 ? -x : x. + * To negate two's complement, you flip all the bits then add 1. + */ + __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); + + /* Flip negative lanes. */ + x = _mm_xor_si128(x, is_negative); + + /* +1 to negative lanes, else +0. */ + x = _mm_sub_epi16(x, is_negative); + return x; +#endif +} + +/* Bytewise c ? t : e. */ +static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { +#if PNG_INTEL_SSE_IMPLEMENTATION >= 3 + return _mm_blendv_epi8(e,t,c); +#else + return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); +#endif +} + +void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* Paeth tries to predict pixel d using the pixel to the left of it, a, + * and two pixels from the previous row, b and c: + * prev: c b + * row: a d + * The Paeth function predicts d to be whichever of a, b, or c is nearest to + * p=a+b-c. + * + * The first pixel has no left context, and so uses an Up filter, p = b. + * This works naturally with our main loop's p = a+b-c if we force a and c + * to zero. + * Here we zero b and d, which become c and a respectively at the start of + * the loop. + */ + png_size_t rb; + const __m128i zero = _mm_setzero_si128(); + __m128i c, b = zero, + a, d = zero; + + png_debug(1, "in png_read_filter_row_paeth3_sse2"); + + rb = row_info->rowbytes; + while (rb >= 4) { + /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. + */ + __m128i pa,pb,pc,smallest,nearest; + c = b; b = _mm_unpacklo_epi8(load4(prev), zero); + a = d; d = _mm_unpacklo_epi8(load4(row ), zero); + + /* (p-a) == (a+b-c - a) == (b-c) */ + + pa = _mm_sub_epi16(b,c); + + /* (p-b) == (a+b-c - b) == (a-c) */ + pb = _mm_sub_epi16(a,c); + + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ + pc = _mm_add_epi16(pa,pb); + + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ + + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + + /* Paeth breaks ties favoring a over b over c. */ + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + if_then_else(_mm_cmpeq_epi16(smallest, pb), b, + c)); + + /* Note `_epi8`: we need addition to wrap modulo 255. */ + d = _mm_add_epi8(d, nearest); + store3(row, _mm_packus_epi16(d,d)); + + prev += 3; + row += 3; + rb -= 3; + } + if (rb > 0) { + /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. + */ + __m128i pa,pb,pc,smallest,nearest; + c = b; b = _mm_unpacklo_epi8(load3(prev), zero); + a = d; d = _mm_unpacklo_epi8(load3(row ), zero); + + /* (p-a) == (a+b-c - a) == (b-c) */ + pa = _mm_sub_epi16(b,c); + + /* (p-b) == (a+b-c - b) == (a-c) */ + pb = _mm_sub_epi16(a,c); + + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ + pc = _mm_add_epi16(pa,pb); + + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ + + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + + /* Paeth breaks ties favoring a over b over c. */ + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + if_then_else(_mm_cmpeq_epi16(smallest, pb), b, + c)); + + /* Note `_epi8`: we need addition to wrap modulo 255. */ + d = _mm_add_epi8(d, nearest); + store3(row, _mm_packus_epi16(d,d)); + + prev += 3; + row += 3; + rb -= 3; + } +} + +void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* Paeth tries to predict pixel d using the pixel to the left of it, a, + * and two pixels from the previous row, b and c: + * prev: c b + * row: a d + * The Paeth function predicts d to be whichever of a, b, or c is nearest to + * p=a+b-c. + * + * The first pixel has no left context, and so uses an Up filter, p = b. + * This works naturally with our main loop's p = a+b-c if we force a and c + * to zero. + * Here we zero b and d, which become c and a respectively at the start of + * the loop. + */ + png_size_t rb; + const __m128i zero = _mm_setzero_si128(); + __m128i pa,pb,pc,smallest,nearest; + __m128i c, b = zero, + a, d = zero; + + png_debug(1, "in png_read_filter_row_paeth4_sse2"); + + rb = row_info->rowbytes+4; + while (rb > 4) { + /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. + */ + c = b; b = _mm_unpacklo_epi8(load4(prev), zero); + a = d; d = _mm_unpacklo_epi8(load4(row ), zero); + + /* (p-a) == (a+b-c - a) == (b-c) */ + pa = _mm_sub_epi16(b,c); + + /* (p-b) == (a+b-c - b) == (a-c) */ + pb = _mm_sub_epi16(a,c); + + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ + pc = _mm_add_epi16(pa,pb); + + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ + + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + + /* Paeth breaks ties favoring a over b over c. */ + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + if_then_else(_mm_cmpeq_epi16(smallest, pb), b, + c)); + + /* Note `_epi8`: we need addition to wrap modulo 255. */ + d = _mm_add_epi8(d, nearest); + store4(row, _mm_packus_epi16(d,d)); + + prev += 4; + row += 4; + rb -= 4; + } +} + +#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ +#endif /* READ */ diff --git a/xs/src/png/libpng/intel/intel_init.c b/xs/src/png/libpng/intel/intel_init.c new file mode 100644 index 0000000000..8f08baf8c5 --- /dev/null +++ b/xs/src/png/libpng/intel/intel_init.c @@ -0,0 +1,53 @@ + +/* intel_init.c - SSE2 optimized filter functions + * + * Copyright (c) 2016-2017 Glenn Randers-Pehrson + * Written by Mike Klein and Matt Sarett, Google, Inc. + * Derived from arm/arm_init.c + * + * Last changed in libpng 1.6.29 [March 16, 2017] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED +#if PNG_INTEL_SSE_IMPLEMENTATION > 0 + +void +png_init_filter_functions_sse2(png_structp pp, unsigned int bpp) +{ + /* The techniques used to implement each of these filters in SSE operate on + * one pixel at a time. + * So they generally speed up 3bpp images about 3x, 4bpp images about 4x. + * They can scale up to 6 and 8 bpp images and down to 2 bpp images, + * but they'd not likely have any benefit for 1bpp images. + * Most of these can be implemented using only MMX and 64-bit registers, + * but they end up a bit slower than using the equally-ubiquitous SSE2. + */ + png_debug(1, "in png_init_filter_functions_sse2"); + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_sse2; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = + png_read_filter_row_paeth3_sse2; + } + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_sse2; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_sse2; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = + png_read_filter_row_paeth4_sse2; + } + + /* No need optimize PNG_FILTER_VALUE_UP. The compiler should + * autovectorize. + */ +} + +#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ +#endif /* PNG_READ_SUPPORTED */ diff --git a/xs/src/png/libpng/mips/filter_msa_intrinsics.c b/xs/src/png/libpng/mips/filter_msa_intrinsics.c new file mode 100644 index 0000000000..943bb3d052 --- /dev/null +++ b/xs/src/png/libpng/mips/filter_msa_intrinsics.c @@ -0,0 +1,807 @@ + +/* filter_msa_intrinsics.c - MSA optimised filter functions + * + * Copyright (c) 2016 Glenn Randers-Pehrson + * Written by Mandar Sahastrabuddhe, August 2016. + * Last changed in libpng 1.6.25 [September 1, 2016] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ +#include +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +/* This code requires -mfpu=msa on the command line: */ +#if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ + +#include + +/* libpng row pointers are not necessarily aligned to any particular boundary, + * however this code will only work with appropriate alignment. mips/mips_init.c + * checks for this (and will not compile unless it is done). This code uses + * variants of png_aligncast to avoid compiler warnings. + */ +#define png_ptr(type,pointer) png_aligncast(type *,pointer) +#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) + +/* The following relies on a variable 'temp_pointer' being declared with type + * 'type'. This is written this way just to hide the GCC strict aliasing + * warning; note that the code is safe because there never is an alias between + * the input and output pointers. + */ +#define png_ldr(type,pointer)\ + (temp_pointer = png_ptr(type,pointer), *temp_pointer) + +#if PNG_MIPS_MSA_OPT > 0 + +#ifdef CLANG_BUILD + #define MSA_SRLI_B(a, b) __msa_srli_b((v16i8) a, b) + + #define LW(psrc) \ + ( { \ + uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + asm volatile ( \ + "lw %[val_m], %[psrc_lw_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_lw_m] "m" (*psrc_lw_m) \ + ); \ + \ + val_m; \ + } ) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + asm volatile ( \ + "sh %[val_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m" (*pdst_sh_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + asm volatile ( \ + "sw %[val_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m" (*pdst_sw_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #if (__mips == 64) + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint64_t val_m = (val); \ + \ + asm volatile ( \ + "sd %[val_m], %[pdst_sd_m] \n\t" \ + \ + : [pdst_sd_m] "=m" (*pdst_sd_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + #else + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } + #endif +#else + #define MSA_SRLI_B(a, b) (a >> b) + +#if (__mips_isa_rev >= 6) + #define LW(psrc) \ + ( { \ + uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + asm volatile ( \ + "lw %[val_m], %[psrc_lw_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_lw_m] "m" (*psrc_lw_m) \ + ); \ + \ + val_m; \ + } ) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + asm volatile ( \ + "sh %[val_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m" (*pdst_sh_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + asm volatile ( \ + "sw %[val_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m" (*pdst_sw_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #if (__mips == 64) + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint64_t val_m = (val); \ + \ + asm volatile ( \ + "sd %[val_m], %[pdst_sd_m] \n\t" \ + \ + : [pdst_sd_m] "=m" (*pdst_sd_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + #else + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } + #endif +#else // !(__mips_isa_rev >= 6) + #define LW(psrc) \ + ( { \ + uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + asm volatile ( \ + "ulw %[val_m], %[psrc_lw_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_lw_m] "m" (*psrc_lw_m) \ + ); \ + \ + val_m; \ + } ) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + asm volatile ( \ + "ush %[val_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m" (*pdst_sh_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + asm volatile ( \ + "usw %[val_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m" (*pdst_sw_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } + + #define SW_ZERO(pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + \ + asm volatile ( \ + "usw $0, %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : \ + ); \ + } +#endif // (__mips_isa_rev >= 6) +#endif + +#define LD_B(RTYPE, psrc) *((RTYPE *) (psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ +{ \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ +} +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ +} +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *) (pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ +{ \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ +} +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +{ \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ +} +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ADD2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + ADD2(in0, in1, in2, in3, out0, out1); \ + out2 = in4 + in5; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ + out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \ +} +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \ + out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \ +} +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ +{ \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \ + out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \ +} +#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__) + +#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val) \ +{ \ + v16i8 zero_m = { 0 }; \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val); \ +} +#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__) + +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \ + out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \ +} +#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__) + +#define ADD_ABS_H3(RTYPE, in0, in1, in2, out0, out1, out2) \ +{ \ + RTYPE zero = {0}; \ + \ + out0 = __msa_add_a_h((v8i16) zero, in0); \ + out1 = __msa_add_a_h((v8i16) zero, in1); \ + out2 = __msa_add_a_h((v8i16) zero, in2); \ +} +#define ADD_ABS_H3_SH(...) ADD_ABS_H3(v8i16, __VA_ARGS__) + +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \ + out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \ +} +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) + +#define CMP_AND_SELECT(inp0, inp1, inp2, inp3, inp4, inp5, out0) \ +{ \ + v8i16 _sel_h0, _sel_h1; \ + v16u8 _sel_b0, _sel_b1; \ + _sel_h0 = (v8i16) __msa_clt_u_h((v8u16) inp1, (v8u16) inp0); \ + _sel_b0 = (v16u8) __msa_pckev_b((v16i8) _sel_h0, (v16i8) _sel_h0); \ + inp0 = (v8i16) __msa_bmnz_v((v16u8) inp0, (v16u8) inp1, (v16u8) _sel_h0); \ + inp4 = (v16u8) __msa_bmnz_v(inp3, inp4, _sel_b0); \ + _sel_h1 = (v8i16) __msa_clt_u_h((v8u16) inp2, (v8u16) inp0); \ + _sel_b1 = (v16u8) __msa_pckev_b((v16i8) _sel_h1, (v16i8) _sel_h1); \ + inp4 = (v16u8) __msa_bmnz_v(inp4, inp5, _sel_b1); \ + out0 += inp4; \ +} + +void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i, cnt, cnt16, cnt32; + png_size_t istop = row_info->rowbytes; + png_bytep rp = row; + png_const_bytep pp = prev_row; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (i = 0; i < (istop >> 6); i++) + { + LD_UB4(rp, 16, src0, src1, src2, src3); + LD_UB4(pp, 16, src4, src5, src6, src7); + pp += 64; + + ADD4(src0, src4, src1, src5, src2, src6, src3, src7, + src0, src1, src2, src3); + + ST_UB4(src0, src1, src2, src3, rp, 16); + rp += 64; + } + + if (istop & 0x3F) + { + cnt32 = istop & 0x20; + cnt16 = istop & 0x10; + cnt = istop & 0xF; + + if(cnt32) + { + if (cnt16 && cnt) + { + LD_UB4(rp, 16, src0, src1, src2, src3); + LD_UB4(pp, 16, src4, src5, src6, src7); + + ADD4(src0, src4, src1, src5, src2, src6, src3, src7, + src0, src1, src2, src3); + + ST_UB4(src0, src1, src2, src3, rp, 16); + rp += 64; + } + else if (cnt16 || cnt) + { + LD_UB2(rp, 16, src0, src1); + LD_UB2(pp, 16, src4, src5); + pp += 32; + src2 = LD_UB(rp + 32); + src6 = LD_UB(pp); + + ADD3(src0, src4, src1, src5, src2, src6, src0, src1, src2); + + ST_UB2(src0, src1, rp, 16); + rp += 32; + ST_UB(src2, rp); + rp += 16; + } + else + { + LD_UB2(rp, 16, src0, src1); + LD_UB2(pp, 16, src4, src5); + + ADD2(src0, src4, src1, src5, src0, src1); + + ST_UB2(src0, src1, rp, 16); + rp += 32; + } + } + else if (cnt16 && cnt) + { + LD_UB2(rp, 16, src0, src1); + LD_UB2(pp, 16, src4, src5); + + ADD2(src0, src4, src1, src5, src0, src1); + + ST_UB2(src0, src1, rp, 16); + rp += 32; + } + else if (cnt16 || cnt) + { + src0 = LD_UB(rp); + src4 = LD_UB(pp); + pp += 16; + + src0 += src4; + + ST_UB(src0, rp); + rp += 16; + } + } +} + +void png_read_filter_row_sub4_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t count; + png_size_t istop = row_info->rowbytes; + png_bytep src = row; + png_bytep nxt = row + 4; + int32_t inp0; + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1; + v16u8 zero = { 0 }; + + istop -= 4; + + inp0 = LW(src); + src += 4; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + + for (count = 0; count < istop; count += 16) + { + src1 = LD_UB(src); + src += 16; + + src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 4); + src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 8); + src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 12); + src1 += src0; + src2 += src1; + src3 += src2; + src4 += src3; + src0 = src4; + ILVEV_W2_UB(src1, src2, src3, src4, dst0, dst1); + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + + ST_UB(dst0, nxt); + nxt += 16; + } +} + +void png_read_filter_row_sub3_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t count; + png_size_t istop = row_info->rowbytes; + png_bytep src = row; + png_bytep nxt = row + 3; + int64_t out0; + int32_t inp0, out1; + v16u8 src0, src1, src2, src3, src4, dst0, dst1; + v16u8 zero = { 0 }; + v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; + + istop -= 3; + + inp0 = LW(src); + src += 3; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + + for (count = 0; count < istop; count += 12) + { + src1 = LD_UB(src); + src += 12; + + src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 3); + src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 6); + src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 9); + src1 += src0; + src2 += src1; + src3 += src2; + src4 += src3; + src0 = src4; + VSHF_B2_UB(src1, src2, src3, src4, mask0, mask0, dst0, dst1); + dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); + out0 = __msa_copy_s_d((v2i64) dst0, 0); + out1 = __msa_copy_s_w((v4i32) dst0, 2); + + SD(out0, nxt); + nxt += 8; + SW(out1, nxt); + nxt += 4; + } +} + +void png_read_filter_row_avg4_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_bytep src = row; + png_bytep nxt = row; + png_const_bytep pp = prev_row; + png_size_t istop = row_info->rowbytes - 4; + int32_t inp0, inp1, out0; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; + v16u8 zero = { 0 }; + + inp0 = LW(pp); + pp += 4; + inp1 = LW(src); + src += 4; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + src0 = (v16u8) MSA_SRLI_B(src0, 1); + src1 += src0; + out0 = __msa_copy_s_w((v4i32) src1, 0); + SW(out0, nxt); + nxt += 4; + + for (i = 0; i < istop; i += 16) + { + src2 = LD_UB(pp); + pp += 16; + src6 = LD_UB(src); + src += 16; + + SLDI_B2_0_UB(src2, src6, src3, src7, 4); + SLDI_B2_0_UB(src2, src6, src4, src8, 8); + SLDI_B2_0_UB(src2, src6, src5, src9, 12); + src2 = __msa_ave_u_b(src2, src1); + src6 += src2; + src3 = __msa_ave_u_b(src3, src6); + src7 += src3; + src4 = __msa_ave_u_b(src4, src7); + src8 += src4; + src5 = __msa_ave_u_b(src5, src8); + src9 += src5; + src1 = src9; + ILVEV_W2_UB(src6, src7, src8, src9, dst0, dst1); + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + + ST_UB(dst0, nxt); + nxt += 16; + } +} + +void png_read_filter_row_avg3_msa(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_bytep src = row; + png_bytep nxt = row; + png_const_bytep pp = prev_row; + png_size_t istop = row_info->rowbytes - 3; + int64_t out0; + int32_t inp0, inp1, out1; + int16_t out2; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; + v16u8 zero = { 0 }; + v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; + + inp0 = LW(pp); + pp += 3; + inp1 = LW(src); + src += 3; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + src0 = (v16u8) MSA_SRLI_B(src0, 1); + src1 += src0; + out2 = __msa_copy_s_h((v8i16) src1, 0); + SH(out2, nxt); + nxt += 2; + nxt[0] = src1[2]; + nxt++; + + for (i = 0; i < istop; i += 12) + { + src2 = LD_UB(pp); + pp += 12; + src6 = LD_UB(src); + src += 12; + + SLDI_B2_0_UB(src2, src6, src3, src7, 3); + SLDI_B2_0_UB(src2, src6, src4, src8, 6); + SLDI_B2_0_UB(src2, src6, src5, src9, 9); + src2 = __msa_ave_u_b(src2, src1); + src6 += src2; + src3 = __msa_ave_u_b(src3, src6); + src7 += src3; + src4 = __msa_ave_u_b(src4, src7); + src8 += src4; + src5 = __msa_ave_u_b(src5, src8); + src9 += src5; + src1 = src9; + VSHF_B2_UB(src6, src7, src8, src9, mask0, mask0, dst0, dst1); + dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); + out0 = __msa_copy_s_d((v2i64) dst0, 0); + out1 = __msa_copy_s_w((v4i32) dst0, 2); + + SD(out0, nxt); + nxt += 8; + SW(out1, nxt); + nxt += 4; + } +} + +void png_read_filter_row_paeth4_msa(png_row_infop row_info, + png_bytep row, + png_const_bytep prev_row) +{ + int32_t count, rp_end; + png_bytep nxt; + png_const_bytep prev_nxt; + int32_t inp0, inp1, res0; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 src10, src11, src12, src13, dst0, dst1; + v8i16 vec0, vec1, vec2; + v16u8 zero = { 0 }; + + nxt = row; + prev_nxt = prev_row; + + inp0 = LW(nxt); + inp1 = LW(prev_nxt); + prev_nxt += 4; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + + src1 += src0; + res0 = __msa_copy_s_w((v4i32) src1, 0); + + SW(res0, nxt); + nxt += 4; + + /* Remainder */ + rp_end = row_info->rowbytes - 4; + + for (count = 0; count < rp_end; count += 16) + { + src2 = LD_UB(prev_nxt); + prev_nxt += 16; + src6 = LD_UB(prev_row); + prev_row += 16; + src10 = LD_UB(nxt); + + SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 4); + SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 8); + SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 12); + ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10); + ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11); + ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12); + ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13); + src1 = src13; + ILVEV_W2_UB(src10, src11, src12, src1, dst0, dst1); + dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); + + ST_UB(dst0, nxt); + nxt += 16; + } +} + +void png_read_filter_row_paeth3_msa(png_row_infop row_info, + png_bytep row, + png_const_bytep prev_row) +{ + int32_t count, rp_end; + png_bytep nxt; + png_const_bytep prev_nxt; + int64_t out0; + int32_t inp0, inp1, out1; + int16_t out2; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; + v16u8 src10, src11, src12, src13; + v8i16 vec0, vec1, vec2; + v16u8 zero = { 0 }; + v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; + + nxt = row; + prev_nxt = prev_row; + + inp0 = LW(nxt); + inp1 = LW(prev_nxt); + prev_nxt += 3; + src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); + src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); + + src1 += src0; + out2 = __msa_copy_s_h((v8i16) src1, 0); + + SH(out2, nxt); + nxt += 2; + nxt[0] = src1[2]; + nxt++; + + /* Remainder */ + rp_end = row_info->rowbytes - 3; + + for (count = 0; count < rp_end; count += 12) + { + src2 = LD_UB(prev_nxt); + prev_nxt += 12; + src6 = LD_UB(prev_row); + prev_row += 12; + src10 = LD_UB(nxt); + + SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 3); + SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 6); + SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 9); + ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10); + ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11); + ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12); + ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1); + HSUB_UB2_SH(vec0, vec1, vec0, vec1); + vec2 = vec0 + vec1; + ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); + CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13); + src1 = src13; + VSHF_B2_UB(src10, src11, src12, src13, mask0, mask0, dst0, dst1); + dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); + out0 = __msa_copy_s_d((v2i64) dst0, 0); + out1 = __msa_copy_s_w((v4i32) dst0, 2); + + SD(out0, nxt); + nxt += 8; + SW(out1, nxt); + nxt += 4; + } +} + +#endif /* PNG_MIPS_MSA_OPT > 0 */ +#endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 (intrinsics) */ +#endif /* READ */ diff --git a/xs/src/png/libpng/mips/mips_init.c b/xs/src/png/libpng/mips/mips_init.c new file mode 100644 index 0000000000..0bfb7a32e9 --- /dev/null +++ b/xs/src/png/libpng/mips/mips_init.c @@ -0,0 +1,129 @@ + +/* mips_init.c - MSA optimised filter functions + * + * Copyright (c) 2016 Glenn Randers-Pehrson + * Written by Mandar Sahastrabuddhe, 2016. + * Last changed in libpng 1.6.25 [September 1, 2016] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ +/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are + * called. + */ +#define _POSIX_SOURCE 1 + +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_MIPS_MSA_OPT > 0 +#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do run-time checks */ +/* WARNING: it is strongly recommended that you do not build libpng with + * run-time checks for CPU features if at all possible. In the case of the MIPS + * MSA instructions there is no processor-specific way of detecting the + * presence of the required support, therefore run-time detection is extremely + * OS specific. + * + * You may set the macro PNG_MIPS_MSA_FILE to the file name of file containing + * a fragment of C source code which defines the png_have_msa function. There + * are a number of implementations in contrib/mips-msa, but the only one that + * has partial support is contrib/mips-msa/linux.c - a generic Linux + * implementation which reads /proc/cpufino. + */ +#ifndef PNG_MIPS_MSA_FILE +# ifdef __linux__ +# define PNG_MIPS_MSA_FILE "contrib/mips-msa/linux.c" +# endif +#endif + +#ifdef PNG_MIPS_MSA_FILE + +#include /* for sig_atomic_t */ +static int png_have_msa(png_structp png_ptr); +#include PNG_MIPS_MSA_FILE + +#else /* PNG_MIPS_MSA_FILE */ +# error "PNG_MIPS_MSA_FILE undefined: no support for run-time MIPS MSA checks" +#endif /* PNG_MIPS_MSA_FILE */ +#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */ + +#ifndef PNG_ALIGNED_MEMORY_SUPPORTED +# error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED" +#endif + +void +png_init_filter_functions_msa(png_structp pp, unsigned int bpp) +{ + /* The switch statement is compiled in for MIPS_MSA_API, the call to + * png_have_msa is compiled in for MIPS_MSA_CHECK. If both are defined + * the check is only performed if the API has not set the MSA option on + * or off explicitly. In this case the check controls what happens. + */ + +#ifdef PNG_MIPS_MSA_API_SUPPORTED + switch ((pp->options >> PNG_MIPS_MSA) & 3) + { + case PNG_OPTION_UNSET: + /* Allow the run-time check to execute if it has been enabled - + * thus both API and CHECK can be turned on. If it isn't supported + * this case will fall through to the 'default' below, which just + * returns. + */ +#endif /* PNG_MIPS_MSA_API_SUPPORTED */ +#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED + { + static volatile sig_atomic_t no_msa = -1; /* not checked */ + + if (no_msa < 0) + no_msa = !png_have_msa(pp); + + if (no_msa) + return; + } +#ifdef PNG_MIPS_MSA_API_SUPPORTED + break; +#endif +#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */ + +#ifdef PNG_MIPS_MSA_API_SUPPORTED + default: /* OFF or INVALID */ + return; + + case PNG_OPTION_ON: + /* Option turned on */ + break; + } +#endif + + /* IMPORTANT: any new external functions used here must be declared using + * PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the + * 'prefix' option to configure works: + * + * ./configure --with-libpng-prefix=foobar_ + * + * Verify you have got this right by running the above command, doing a build + * and examining pngprefix.h; it must contain a #define for every external + * function you add. (Notice that this happens automatically for the + * initialization function.) + */ + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_msa; + + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_msa; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_msa; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_msa; + } + + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_msa; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa; + } +} +#endif /* PNG_MIPS_MSA_OPT > 0 */ +#endif /* READ */ diff --git a/xs/src/png/libpng/pngusr.dfa b/xs/src/png/libpng/pngusr.dfa new file mode 100644 index 0000000000..83067c38c0 --- /dev/null +++ b/xs/src/png/libpng/pngusr.dfa @@ -0,0 +1,14 @@ +# pngusr.dfa +# +# Build time configuration of libpng +# +# Enter build configuration options in this file +# +# Security settings: by default these limits are unset, you can change them +# here by entering the appropriate values as #defines preceded by '@' (to cause, +# them to be passed through to the build of pnglibconf.h), for example: +# +# @# define PNG_USER_WIDTH_MAX 65535 +# @# define PNG_USER_HEIGHT_MAX 65535 +# @# define PNG_USER_CHUNK_CACHE_MAX 256 +# @# define PNG_USER_CHUNK_MALLOC_MAX 640000 diff --git a/xs/src/png/libpng/powerpc/filter_vsx_intrinsics.c b/xs/src/png/libpng/powerpc/filter_vsx_intrinsics.c new file mode 100644 index 0000000000..e3de496bdc --- /dev/null +++ b/xs/src/png/libpng/powerpc/filter_vsx_intrinsics.c @@ -0,0 +1,767 @@ +/* filter_vsx_intrinsics.c - PowerPC optimised filter functions + * + * Copyright (c) 2017 Glenn Randers-Pehrson + * Written by Vadim Barkov, 2017. + * Last changed in libpng 1.6.29 [March 16, 2017] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ +#include +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +/* This code requires -maltivec and -mvsx on the command line: */ +#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ + +#include + +#if PNG_POWERPC_VSX_OPT > 0 + +#ifndef __VSX__ +# error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag." +#endif + +#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data) +#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data) + + +/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). + * They're positioned like this: + * prev: c b + * row: a d + * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be + * whichever of a, b, or c is closest to p=a+b-c. + * ( this is taken from ../intel/filter_sse2_intrinsics.c ) + */ + +#define vsx_declare_common_vars(row_info,row,prev_row,offset) \ + png_byte i;\ + png_bytep rp = row + offset;\ + png_const_bytep pp = prev_row;\ + png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\ + png_size_t istop;\ + if(unaligned_top == 16)\ + unaligned_top = 0;\ + istop = row_info->rowbytes;\ + if((unaligned_top < istop))\ + istop -= unaligned_top;\ + else{\ + unaligned_top = istop;\ + istop = 0;\ + } + +void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vsx_declare_common_vars(row_info,row,prev_row,0) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + rp_vec = vec_add(rp_vec,pp_vec); + + vec_st(rp_vec,0,rp); + + pp += 16; + rp += 16; + istop -= 16; + } + + if(istop > 0) + { + /* If byte count of row is not divisible by 16 + * we will process remaining part as usual + */ + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } +} + +} + +static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11}; + +static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16}; + +static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15}; + +static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16}; + +static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; +#ifdef __LITTLE_ENDIAN__ + +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6}; + +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16}; + +#elif defined(__BIG_ENDIAN__) + +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7}; + +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16}; + +#endif + +#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp) +#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp) + +#ifdef PNG_USE_ABS +# define vsx_abs(number) abs(number) +#else +# define vsx_abs(number) (number > 0) ? (number) : -(number) +#endif + +void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 4; + + vector unsigned char rp_vec; + vector unsigned char part_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + + PNG_UNUSED(pp) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + rp -= bpp; + + rp_vec = vec_ld(0,rp); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4); + rp_vec = vec_add(rp_vec,part_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff); + rp++; + } + +} + +void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 3; + + vector unsigned char rp_vec; + vector unsigned char part_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + + PNG_UNUSED(pp) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + rp -= bpp; + + rp_vec = vec_ld(0,rp); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3); + rp_vec = vec_add(rp_vec,part_vec); + + vec_st(rp_vec,0,rp); + rp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } +} + +void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 4; + + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char avg_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); + + rp++; + } + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + rp -= bpp; + pp -= bpp; + + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } +} + +void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 3; + + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char avg_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); + + rp++; + } + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + rp -= bpp; + pp -= bpp; + + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + vec_st(rp_vec,0,rp); + + rp += 15; + pp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + rp++; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } +} + +/* Bytewise c ? t : e. */ +#define if_then_else(c,t,e) vec_sel(e,t,c) + +#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\ + c = *(pp - bpp);\ + a = *(rp - bpp);\ + b = *pp++;\ + p = b - c;\ + pc = a - c;\ + pa = vsx_abs(p);\ + pb = vsx_abs(pc);\ + pc = vsx_abs(p + pc);\ + if (pb < pa) pa = pb, a = b;\ + if (pc < pa) a = c;\ + a += *rp;\ + *rp++ = (png_byte)a;\ + } + +void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 4; + + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned short a_vec,b_vec,c_vec,nearest_vec; + vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). + */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } + + for(i = 0; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + while( istop >= 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4))); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } +} + +void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + const png_byte bpp = 3; + + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned short a_vec,b_vec,c_vec,nearest_vec; + vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). + */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } + + for(i = 0; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + while( istop >= 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3))); + + vec_st(rp_vec,0,rp); + + rp += 15; + pp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } +} + +#endif /* PNG_POWERPC_VSX_OPT > 0 */ +#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */ +#endif /* READ */ diff --git a/xs/src/png/libpng/powerpc/powerpc_init.c b/xs/src/png/libpng/powerpc/powerpc_init.c new file mode 100644 index 0000000000..07016177c5 --- /dev/null +++ b/xs/src/png/libpng/powerpc/powerpc_init.c @@ -0,0 +1,125 @@ + +/* powerpc_init.c - POWERPC optimised filter functions + * + * Copyright (c) 2017 Glenn Randers-Pehrson + * Written by Vadim Barkov, 2017. + * Last changed in libpng 1.6.29 [March 16, 2017] + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ +/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are + * called. + */ +#define _POSIX_SOURCE 1 + +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_POWERPC_VSX_OPT > 0 +#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED /* Do run-time checks */ +/* WARNING: it is strongly recommended that you do not build libpng with + * run-time checks for CPU features if at all possible. In the case of the PowerPC + * VSX instructions there is no processor-specific way of detecting the + * presence of the required support, therefore run-time detection is extremely + * OS specific. + * + * You may set the macro PNG_POWERPC_VSX_FILE to the file name of file containing + * a fragment of C source code which defines the png_have_vsx function. There + * are a number of implementations in contrib/powerpc-vsx, but the only one that + * has partial support is contrib/powerpc-vsx/linux.c - a generic Linux + * implementation which reads /proc/cpufino. + */ +#ifndef PNG_POWERPC_VSX_FILE +# ifdef __linux__ +# define PNG_POWERPC_VSX_FILE "contrib/powerpc-vsx/linux_aux.c" +# endif +#endif + +#ifdef PNG_POWERPC_VSX_FILE + +#include /* for sig_atomic_t */ +static int png_have_vsx(png_structp png_ptr); +#include PNG_POWERPC_VSX_FILE + +#else /* PNG_POWERPC_VSX_FILE */ +# error "PNG_POWERPC_VSX_FILE undefined: no support for run-time POWERPC VSX checks" +#endif /* PNG_POWERPC_VSX_FILE */ +#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */ + +void +png_init_filter_functions_vsx(png_structp pp, unsigned int bpp) +{ + /* The switch statement is compiled in for POWERPC_VSX_API, the call to + * png_have_vsx is compiled in for POWERPC_VSX_CHECK. If both are defined + * the check is only performed if the API has not set the PowerPC option on + * or off explicitly. In this case the check controls what happens. + */ + +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + switch ((pp->options >> PNG_POWERPC_VSX) & 3) + { + case PNG_OPTION_UNSET: + /* Allow the run-time check to execute if it has been enabled - + * thus both API and CHECK can be turned on. If it isn't supported + * this case will fall through to the 'default' below, which just + * returns. + */ +#endif /* PNG_POWERPC_VSX_API_SUPPORTED */ +#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED + { + static volatile sig_atomic_t no_vsx = -1; /* not checked */ + + if (no_vsx < 0) + no_vsx = !png_have_vsx(pp); + + if (no_vsx) + return; + } +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + break; +#endif +#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */ + +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + default: /* OFF or INVALID */ + return; + + case PNG_OPTION_ON: + /* Option turned on */ + break; + } +#endif + + /* IMPORTANT: any new internal functions used here must be declared using + * PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the + * 'prefix' option to configure works: + * + * ./configure --with-libpng-prefix=foobar_ + * + * Verify you have got this right by running the above command, doing a build + * and examining pngprefix.h; it must contain a #define for every external + * function you add. (Notice that this happens automatically for the + * initialization function.) + */ + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_vsx; + + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_vsx; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_vsx; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_vsx; + } + + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_vsx; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_vsx; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_vsx; + } +} +#endif /* PNG_POWERPC_VSX_OPT > 0 */ +#endif /* READ */ diff --git a/xs/src/png/libpng/scripts/checksym.awk b/xs/src/png/libpng/scripts/checksym.awk new file mode 100755 index 0000000000..fe3af55e05 --- /dev/null +++ b/xs/src/png/libpng/scripts/checksym.awk @@ -0,0 +1,173 @@ +#!/bin/awk -f +# Check a list of symbols against the master definition +# (official) list. Arguments: +# +# awk -f checksym.awk official-def list-to-check +# +# Output is a file in the current directory called 'symbols.new', +# the value of the awk variable "of" (which can be changed on the +# command line if required.) stdout holds error messages. Error +# code indicates success or failure. +# +# NOTE: this is a pure, old fashioned, awk script. It will +# work with any awk + +BEGIN{ + err=0 + master="" # master file + official[1] = "" # defined symbols from master file + symbol[1] = "" # defined symbols from png.h + removed[1] = "" # removed symbols from png.h + lasto = 0 # last ordinal value from png.h + mastero = 0 # highest ordinal in master file + symbolo = 0 # highest ordinal in png.h + missing = "error"# log an error on missing symbols + of="symbols.new" # default to a fixed name +} + +# Read existing definitions from the master file (the first +# file on the command line.) This must be a def file and it +# has definition lines (others are ignored) of the form: +# +# symbol @ordinal +# +master == "" { + master = FILENAME +} +FILENAME==master && NF==2 && $2~/^@/ && $1!~/^;/ { + o=0+substr($2,2) + if (o > 0) { + if (official[o] == "") { + official[o] = $1 + if (o > mastero) mastero = o + next + } else + print master ": duplicated symbol:", official[o] ":", $0 + } else + print master ": bad export line format:", $0 + err = 1 +} +FILENAME==master && $1==";missing" && NF==2{ + # This allows the master file to control how missing symbols + # are handled; symbols that aren't in either the master or + # the new file. Valid values are 'ignore', 'warning' and + # 'error' + missing = $2 +} +FILENAME==master { + next +} + +# Read new definitions, these are free form but the lines must +# just be symbol definitions. Lines will be commented out for +# 'removed' symbols, introduced in png.h using PNG_REMOVED rather +# than PNG_EXPORT. Use symbols.dfn or pngwin.dfn to generate the +# input file. +# +# symbol @ordinal # two fields, exported symbol +# ; symbol @ordinal # three fields, removed symbol +# ; @ordinal # two fields, the last ordinal +NF==2 && $1 == ";" && $2 ~ /^@[1-9][0-9]*$/ { # last ordinal + o=0+substr($2,2) + if (lasto == 0 || lasto == o) + lasto=o + else { + print "png.h: duplicated last ordinal:", lasto, o + err = 1 + } + next +} +NF==3 && $1 == ";" && $3 ~ /^@[1-9][0-9]*$/ { # removed symbol + o=0+substr($3,2) + if (removed[o] == "" || removed[o] == $2) { + removed[o] = $2 + if (o > symbolo) symbolo = o + } else { + print "png.h: duplicated removed symbol", o ": '" removed[o] "' != '" $2 "'" + err = 1 + } + next +} +NF==2 && $2 ~ /^@[1-9][0-9]*$/ { # exported symbol + o=0+substr($2,2) + if (symbol[o] == "" || symbol[o] == $1) { + symbol[o] = $1 + if (o > symbolo) symbolo = o + } else { + print "png.h: duplicated symbol", o ": '" symbol[o] "' != '" $1 "'" + err = 1 + } +} +{ + next # skip all other lines +} + +# At the end check for symbols marked as both duplicated and removed +END{ + if (symbolo > lasto) { + print "highest symbol ordinal in png.h,", symbolo ", exceeds last ordinal from png.h", lasto + err = 1 + } + if (mastero > lasto) { + print "highest symbol ordinal in", master ",", mastero ", exceeds last ordinal from png.h", lasto + err = 1 + } + unexported=0 + # Add a standard header to symbols.new: + print ";Version INSERT-VERSION-HERE" >of + print ";--------------------------------------------------------------" >of + print "; LIBPNG symbol list as a Win32 DEF file" >of + print "; Contains all the symbols that can be exported from libpng" >of + print ";--------------------------------------------------------------" >of + print "LIBRARY" >of + print "" >of + print "EXPORTS" >of + + for (o=1; o<=lasto; ++o) { + if (symbol[o] == "" && removed[o] == "") { + if (unexported == 0) unexported = o + if (official[o] == "") { + # missing in export list too, so ok + if (o < lasto) continue + } + } + if (unexported != 0) { + # Symbols in the .def but not in the new file are errors, but + # the 'unexported' symbols aren't in either. By default this + # is an error too (see the setting of 'missing' at the start), + # but this can be reset on the command line or by stuff in the + # file - see the comments above. + if (missing != "ignore") { + if (o-1 > unexported) + print "png.h:", missing ": missing symbols:", unexported "-" o-1 + else + print "png.h:", missing ": missing symbol:", unexported + if (missing != "warning") + err = 1 + } + unexported = 0 + } + if (symbol[o] != "" && removed[o] != "") { + print "png.h: symbol", o, "both exported as '" symbol[o] "' and removed as '" removed[o] "'" + err = 1 + } else if (symbol[o] != official[o]) { + # either the symbol is missing somewhere or it changed + err = 1 + if (symbol[o] == "") + print "png.h: symbol", o, "is exported as '" official[o] "' in", master + else if (official[o] == "") + print "png.h: exported symbol", o, "'" symbol[o] "' not present in", master + else + print "png.h: exported symbol", o, "'" symbol[o] "' exists as '" official[o] "' in", master + } + + # Finally generate symbols.new + if (symbol[o] != "") + print " " symbol[o], "@" o > of + } + + if (err != 0) { + print "*** A new list is in", of, "***" + exit 1 + } +} diff --git a/xs/src/png/libpng/scripts/def.c b/xs/src/png/libpng/scripts/def.c new file mode 100644 index 0000000000..0ffcbeb0c2 --- /dev/null +++ b/xs/src/png/libpng/scripts/def.c @@ -0,0 +1,29 @@ +/* def.c - define format of libpng.def + * + * Last changed in libpng version 1.6.16 [December 22, 2014] + * Copyright (c) 2011-2014 Glenn Randers-Pehrson + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +/* Write the export file header: */ +PNG_DFN ";--------------------------------------------------------------" +PNG_DFN "; LIBPNG module definition file for OS/2" +PNG_DFN ";--------------------------------------------------------------" +PNG_DFN "" +PNG_DFN "; If you give the library an explicit name one or other files" +PNG_DFN "; may need modifying to support the new name on one or more" +PNG_DFN "; systems." +PNG_DFN "LIBRARY" +PNG_DFN "OS2 DESCRIPTION "PNG image compression library"" +PNG_DFN "OS2 CODE PRELOAD MOVEABLE DISCARDABLE" +PNG_DFN "" +PNG_DFN "EXPORTS" +PNG_DFN ";Version 1.6.35beta02" + +#define PNG_EXPORTA(ordinal, type, name, args, attributes)\ + PNG_DFN "@" SYMBOL_PREFIX "@@" name "@" + +#include "../png.h" diff --git a/xs/src/png/libpng/scripts/dfn.awk b/xs/src/png/libpng/scripts/dfn.awk new file mode 100755 index 0000000000..346b9db7d5 --- /dev/null +++ b/xs/src/png/libpng/scripts/dfn.awk @@ -0,0 +1,203 @@ +#!/bin/awk -f +# scripts/dfn.awk - process a .dfn file +# +# last changed in libpng version 1.5.19 - August 21, 2014 +# +# Copyright (c) 2013-2014 Glenn Randers-Pehrson +# +# This code is released under the libpng license. +# For conditions of distribution and use, see the disclaimer +# and license in png.h + +# The output of this script is written to the file given by +# the variable 'out', which should be set on the command line. +# Error messages are printed to stdout and if any are printed +# the script will exit with error code 1. + +BEGIN{ + out="/dev/null" # as a flag + out_count=0 # count of output lines + err=0 # set if an error occurred + sort=0 # sort the output + array[""]="" +} + +# The output file must be specified before any input: +NR==1 && out == "/dev/null" { + print "out=output.file must be given on the command line" + # but continue without setting the error code; this allows the + # script to be checked easily +} + +# Output can be sorted; two lines are recognized +$1 == "PNG_DFN_START_SORT"{ + sort=0+$2 + next +} + +$1 ~ /^PNG_DFN_END_SORT/{ + # Do a very simple, slow, sort; notice that blank lines won't be + # output by this + for (entry in array) { + while (array[entry] != "") { + key = entry + value = array[key] + array[key] = "" + + for (alt in array) { + if (array[alt] != "" && alt < key) { + array[key] = value + value = array[alt] + key = alt + array[alt] = "" + } + } + + print value >out + } + } + sort=0 + next +} + +/^[^"]*PNG_DFN *".*"[^"]*$/{ + # A definition line, apparently correctly formatted; extract the + # definition then replace any doubled "" that remain with a single + # double quote. Notice that the original doubled double quotes + # may have been split by tokenization + # + # Sometimes GCC splits the PNG_DFN lines; we know this has happened + # if the quotes aren't closed and must read another line. In this + # case it is essential to reject lines that start with '#' because those + # are introduced #line directives. + orig=$0 + line=$0 + lineno=FNR + if (lineno == "") lineno=NR + + if (sub(/^[^"]*PNG_DFN *"/,"",line) != 1) { + print "line", lineno ": processing failed:" + print orig + err=1 + next + } else { + ++out_count + } + + # Now examine quotes within the value: + # + # @" - delete this and any following spaces + # "@ - delete this and any preceding spaces + # @' - replace this by a double quote + # + # This allows macro substitution by the C compiler thus: + # + # #define first_name John + # #define last_name Smith + # + # PNG_DFN"#define name @'@" first_name "@ @" last_name "@@'" + # + # Might get C preprocessed to: + # + # PNG_DFN "#define foo @'@" John "@ @" Smith "@@'" + # + # Which this script reduces to: + # + # #define name "John Smith" + # + while (1) { + # While there is an @" remove it and the next "@ + if (line ~ /@"/) { + if (line ~ /@".*"@/) { + # Do this special case first to avoid swallowing extra spaces + # before or after the @ stuff: + if (!sub(/@" *"@/, "", line)) { + # Ok, do it in pieces - there has to be a non-space between the + # two. NOTE: really weird things happen if a leading @" is + # lost - the code will error out below (I believe). + if (!sub(/@" */, "", line) || !sub(/ *"@/, "", line)) { + print "line", lineno, ": internal error:", orig + exit 1 + } + } + } + + # There is no matching "@. Assume a split line + else while (1) { + if (getline nextline) { + # If the line starts with '#' it is a preprocesor line directive + # from cc -E; skip it: + if (nextline !~ /^#/) { + line = line " " nextline + break + } + } else { + # This is end-of-input - probably a missing "@ on the first line: + print "line", lineno ": unbalanced @\" ... \"@ pair" + err=1 + next + } + } + + # Keep going until all the @" have gone + continue + } + + # Attempt to remove a trailing " (not preceded by '@') - if this can + # be done, stop now; if not assume a split line again + if (sub(/"[^"]*$/, "", line)) + break + + # Read another line + while (1) { + if (getline nextline) { + if (nextline !~ /^#/) { + line = line " " nextline + # Go back to stripping @" "@ pairs + break + } + } else { + print "line", lineno ": unterminated PNG_DFN string" + err=1 + next + } + } + } + + # Put any needed double quotes in (at the end, because these would otherwise + # interfere with the processing above.) + gsub(/@'/,"\"", line) + + # Remove any trailing spaces (not really required, but for + # editorial consistency + sub(/ *$/, "", line) + + # Remove trailing CR + sub(/ $/, "", line) + + if (sort) { + if (split(line, parts) < sort) { + print "line", lineno ": missing sort field:", line + err=1 + } else + array[parts[sort]] = line + } + + else + print line >out + next +} + +/PNG_DFN/{ + print "line", NR, "incorrectly formatted PNG_DFN line:" + print $0 + err = 1 +} + +END{ + if (out_count > 0 || err > 0) + exit err + + print "no definition lines found" + exit 1 +} diff --git a/xs/src/png/libpng/scripts/intprefix.c b/xs/src/png/libpng/scripts/intprefix.c new file mode 100644 index 0000000000..254f8e94b4 --- /dev/null +++ b/xs/src/png/libpng/scripts/intprefix.c @@ -0,0 +1,22 @@ + +/* intprefix.c - generate an unprefixed internal symbol list + * + * Last changed in libpng version 1.6.16 [December 22, 2014] + * Copyright (c) 2013-2014 Glenn Randers-Pehrson + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#define PNG_INTERNAL_DATA(type, name, array)\ + PNG_DFN "@" name "@" + +#define PNG_INTERNAL_FUNCTION(type, name, args, attributes)\ + PNG_DFN "@" name "@" + +#define PNG_INTERNAL_CALLBACK(type, name, args, attributes)\ + PNG_DFN "@" name "@" + +#define PNGPREFIX_H /* self generation */ +#include "../pngpriv.h" diff --git a/xs/src/png/libpng/scripts/options.awk b/xs/src/png/libpng/scripts/options.awk new file mode 100755 index 0000000000..fef5dfd78b --- /dev/null +++ b/xs/src/png/libpng/scripts/options.awk @@ -0,0 +1,898 @@ +#!/bin/awk -f +# scripts/options.awk - library build configuration control +# +# last changed in libpng version 1.6.11 - June 5, 2014 +# +# Copyright (c) 1998-2014 Glenn Randers-Pehrson +# +# This code is released under the libpng license. +# For conditions of distribution and use, see the disclaimer +# and license in png.h + +# The output of this script is written to the file given by +# the variable 'out'. The script is run twice, once with +# an intermediate output file, 'options.tmp' then again on +# that file to produce the final output: +# +# awk -f scripts/options.awk out=options.tmp scripts/options.dfa 1>&2 +# awk -f scripts/options.awk out=options.dfn options.tmp 1>&2 +# +# Some options may be specified on the command line: +# +# deb=1 Causes debugging to be output +# logunsupported=1 Causes all options to be recorded in the output +# everything=off Causes all options to be disabled by default +# everything=on Causes all options to be enabled by default +# +# If awk fails on your platform, try nawk instead. +# +# These options may also be specified in the original input file (and +# are copied to the preprocessed file). + +BEGIN{ + out="" # intermediate, preprocessed, file + pre=-1 # preprocess (first line) + version="libpng version unknown" # version information + version_file="" # where to find the version + err=0 # in-line exit sets this + # The following definitions prevent the C preprocessor noticing the lines + # that will be in the final output file. Some C preprocessors tokenise + # the lines, for example by inserting spaces around operators, and all + # C preprocessors notice lines that start with '#', most remove comments. + # The technique adopted here is to make the final output lines into + # C strings (enclosed in double quotes), preceded by PNG_DFN. As a + # consequence the output cannot contain a 'raw' double quote - instead put + # @' in, this will be replaced by a single " afterward. See the parser + # script dfn.awk for more capabilities (not required here). Note that if + # you need a " in a 'setting' in pnglibconf.dfa it must also be @'! + dq="@'" # For a single double quote + start=" PNG_DFN \"" # Start stuff to output (can't contain a "!) + end="\" " # End stuff to output + subs="@\" " # Substitute start (substitute a C macro) + sube=" \"@" # Substitute end + comment=start "/*" # Comment start + cend="*/" end # Comment end + def=start "#define PNG_" # Arbitrary define + sup="_SUPPORTED" end # end supported option + und=comment "#undef PNG_" # Unsupported option + une="_SUPPORTED" cend # end unsupported option + error=start "ERROR:" # error message, terminate with 'end' + + # Variables + deb=0 # debug - set on command line + everything="" # do not override defaults + logunsupported=0 # write unsupported options too + + # Precreate arrays + # for each option: + option[""] = "" # list of all options: default enabled/disabled + done[""] = 1 # marks option as having been output + requires[""] = "" # requires by option + iffs[""] = "" # if by option + enabledby[""] = "" # options that enable it by option + sets[""] = "" # settings set by each option + setval[""] = "" # value to set (indexed: 'option sets[option]') + # for each setting: + setting[""] = "" # requires by setting + defaults[""] = "" # used for a defaulted value + doneset[""] = 1 # marks setting as having been output + r[""] = "" # Temporary array + + # For decorating the output file + protect = "" +} + +# The output file must be specified before any input: +out == "" { + print "out=output.file must be given on the command line" + err = 1 + exit 1 +} + +# The very first line indicates whether we are reading pre-processed +# input or not, this must come *first* because 'PREPROCESSED' needs +# to be the very first line in the temporary file. +pre == -1{ + if ($0 == "PREPROCESSED") { + pre = 0 + next + } else { + pre = 1 + print "PREPROCESSED" >out + # And fall through to continue processing + } +} + +# While pre-processing if version is set to "search" look for a version string +# in the following file. +pre && version == "search" && version_file == ""{ + version_file = FILENAME +} + +pre && version == "search" && version_file != FILENAME{ + print "version string not found in", version_file + err = 1 + exit 1 +} + +pre && version == "search" && $0 ~ /^ \* libpng version/{ + version = substr($0, 4) + print "version =", version >out + next +} + +pre && FILENAME == version_file{ + next +} + +# variable=value +# Sets the given variable to the given value (the syntax is fairly +# free form, except for deb (you are expected to understand how to +# set the debug variable...) +# +# This happens before the check on 'pre' below skips most of the +# rest of the actions, so the variable settings happen during +# preprocessing but are recorded in the END action too. This +# allows them to be set on the command line too. +$0 ~ /^[ ]*version[ ]*=/{ + sub(/^[ ]*version[ ]*=[ ]*/, "") + version = $0 + next +} +$0 ~ /^[ ]*everything[ =]*off[ ]*$/{ + everything = "off" + next +} +$0 ~ /^[ ]*everything[ =]*on[ ]*$/{ + everything = "on" + next +} +$0 ~ /^[ ]*logunsupported[ =]*0[ ]*$/{ + logunsupported = 0 + next +} +$0 ~ /^[ ]*logunsupported[ =]*1[ ]*$/{ + logunsupported = 1 + next +} +$1 == "deb" && $2 == "=" && NF == 3{ + deb = $3 + next +} + +# Preprocessing - this just copies the input file with lines +# that need preprocessing (just chunk at present) expanded +# The bare "pre" instead of "pre != 0" crashes under Sunos awk +pre && $1 != "chunk"{ + print >out + next +} + +# The first characters of the line determine how it is processed, +# leading spaces are ignored. In general tokens that are not +# keywords are the names of options. An option 'name' is +# controlled by the definition of the corresponding macros: +# +# PNG_name_SUPPORTED The option is turned on +# PNG_NO_name +# PNG_NO_name_SUPPORTED If the first macro is not defined +# either of these will turn the option off +# +# If none of these macros are defined the option is turned on, unless +# the keyword 'off' is given in a line relating to the option. The +# keyword 'on' can also be given, but it will be ignored (since it is +# the default.) +# +# In the syntax below a 'name' is indicated by "NAME", other macro +# values are indicated by "MACRO", as with "NAME" the leading "PNG_" +# is omitted, but in this case the "NO_" prefix and the "_SUPPORTED" +# suffix are never used. +# +# Each line is introduced by a keyword - the first non-space characters +# on the line. A line starting with a '#' is a comment - it is totally +# ignored. Keywords are as follows, a NAME, is simply a macro name +# without the leading PNG_, PNG_NO_ or the trailing _SUPPORTED. + +$1 ~ /^#/ || $0 ~ /^[ ]*$/{ + next +} + +# com +# The whole line is placed in the output file as a comment with +# the preceding 'com' removed +$1 == "com"{ + if (NF > 1) { + # sub(/^[ ]*com[ ]*/, "") + $1 = "" + print comment $0, cend >out + } else + print start end >out + next +} + +# version +# Inserts a version comment +$1 == "version" && NF == 1{ + if (version == "") { + print "ERROR: no version string set" + err = 1 # prevent END{} running + exit 1 + } + + print comment, version, cend >out + next +} + +# file output input protect +# Informational: the official name of the input file (without +# make generated local directories), the official name of the +# output file and, if required, a name to use in a protection +# macro for the contents. +$1 == "file" && NF >= 2{ + print comment, $2, cend >out + print comment, "Machine generated file: DO NOT EDIT", cend >out + if (NF >= 3) + print comment, "Derived from:", $3, cend >out + protect = $4 + if (protect != "") { + print start "#ifndef", protect end >out + print start "#define", protect end >out + } + next +} + +# option NAME ( (requires|enables|if) NAME* | on | off | disabled | +# sets SETTING VALUE+ )* +# +# Declares an option 'NAME' and describes its default setting (disabled) +# and its relationship to other options. The option is disabled +# unless *all* the options listed after 'requires' are set and at +# least one of the options listed after 'if' is set. If the +# option is set then it turns on all the options listed after 'enables'. +# +# Note that "enables" takes priority over the required/if/disabled/off +# setting of the target option. +# +# The definition file may list an option as 'disabled': off by default, +# otherwise the option is enabled: on by default. A later (and it must +# be later) entry may turn an option on or off explicitly. + +$1 == "option" && NF >= 2{ + opt = $2 + sub(/,$/,"",opt) + onoff = option[opt] # records current (and the default is "", enabled) + key = "" + istart = 3 + do { + if (istart == 1) { # continuation line + val = getline + + if (val != 1) { # error reading it + if (val == 0) + print "option", opt ": ERROR: missing continuation line" + else + print "option", opt ": ERROR: error reading continuation line" + + # This is a hard error + err = 1 # prevent END{} running + exit 1 + } + } + + for (i=istart; i<=NF; ++i) { + val=$(i) + sub(/,$/,"",val) + if (val == "on" || val == "off" || val == "disabled" || val =="enabled") { + key = "" + if (onoff != val) { + # on or off can zap disabled or enabled: + if (onoff == "" || (onoff == "disabled" || onoff == "enabled") && + (val == "on" || val == "off")) { + # It's easy to mis-spell the option when turning it + # on or off, so warn about it here: + if (onoff == "" && (val == "on" || val == "off")) { + print "option", opt ": ERROR: turning unrecognized option", val + # For the moment error out - it is safer + err = 1 # prevent END{} running + exit 1 + } + onoff = val + } else { + # Print a message, otherwise the error + # below is incomprehensible + print "option", opt ": currently", onoff ": attempt to turn", val + break + } + } + } else if (val == "requires" || val == "if" || val == "enables" || val =="sets") { + key = val + } else if (key == "requires") { + requires[opt] = requires[opt] " " val + } else if (key == "if") { + iffs[opt] = iffs[opt] " " val + } else if (key == "enables") { + enabledby[val] = enabledby[val] " " opt + } else if (key == "sets") { + sets[opt] = sets[opt] " " val + key = "setval" + set = val + } else if (key == "setval") { + setval[opt " " set] = setval[opt " " set] " " val + } else + break # bad line format + } + + istart = 1 + } while (i > NF && $0 ~ /,$/) + + if (i > NF) { + # Set the option, defaulting to 'enabled' + if (onoff == "") onoff = "enabled" + option[opt] = onoff + next + } + # Else fall through to the error handler +} + +# chunk NAME [requires OPT] [enables LIST] [on|off|disabled] +# Expands to the 'option' settings appropriate to the reading and +# writing of an ancillary PNG chunk 'NAME': +# +# option READ_NAME requires READ_ANCILLARY_CHUNKS [READ_OPT] +# option READ_NAME enables NAME LIST +# [option READ_NAME off] +# option WRITE_NAME requires WRITE_ANCILLARY_CHUNKS [WRITE_OPT] +# option WRITE_NAME enables NAME LIST +# [option WRITE_NAME off] + +pre != 0 && $1 == "chunk" && NF >= 2{ + # 'chunk' is handled on the first pass by writing appropriate + # 'option' lines into the intermediate file. + opt = $2 + sub(/,$/,"",opt) + onoff = "" + reqread = "" + reqwrite = "" + enables = "" + req = 0 + istart = 3 + do { + if (istart == 1) { # continuation line + val = getline + + if (val != 1) { # error reading it + if (val == 0) + print "chunk", opt ": ERROR: missing continuation line" + else + print "chunk", opt ": ERROR: error reading continuation line" + + # This is a hard error + err = 1 # prevent END{} running + exit 1 + } + } + + # read the keywords/additional OPTS + for (i=istart; i<=NF; ++i) { + val = $(i) + sub(/,$/,"",val) + if (val == "on" || val == "off" || val == "disabled") { + if (onoff != val) { + if (onoff == "") + onoff = val + else + break # on/off conflict + } + req = 0 + } else if (val == "requires") + req = 1 + else if (val == "enables") + req = 2 + else if (req == 1){ + reqread = reqread " READ_" val + reqwrite = reqwrite " WRITE_" val + } else if (req == 2) + enables = enables " " val + else + break # bad line: handled below + } + + istart = 1 + } while (i > NF && $0 ~ /,$/) + + if (i > NF) { + # Output new 'option' lines to the intermediate file (out) + print "option READ_" opt, "requires READ_ANCILLARY_CHUNKS" reqread, "enables", opt enables , onoff >out + print "option WRITE_" opt, "requires WRITE_ANCILLARY_CHUNKS" reqwrite, "enables", opt enables, onoff >out + next + } + # Else hit the error handler below - bad line format! +} + +# setting MACRO ( requires MACRO* )* [ default VALUE ] +# Behaves in a similar way to 'option' without looking for NO_ or +# _SUPPORTED; the macro is enabled if it is defined so long as all +# the 'requires' macros are also defined. The definitions may be +# empty, an error will be issued if the 'requires' macros are +# *not* defined. If given the 'default' value is used if the +# macro is not defined. The default value will be re-tokenised. +# (BTW: this is somewhat restrictive, it mainly exists for the +# support of non-standard configurations and numeric parameters, +# see the uses in scripts/options.dat + +$1 == "setting" && (NF == 2 || NF >= 3 && ($3 == "requires" || $3 == "default")){ + reqs = "" + deflt = "" + isdef = 0 + key = "" + for (i=3; i<=NF; ++i) + if ($(i) == "requires" || $(i) == "default") { + key = $(i) + if (key == "default") isdef = 1 + } else if (key == "requires") + reqs = reqs " " $(i) + else if (key == "default") + deflt = deflt " " $(i) + else + break # Format error, handled below + + setting[$2] = reqs + # NOTE: this overwrites a previous value silently + if (isdef && deflt == "") + deflt = " " # as a flag to force output + defaults[$2] = deflt + next +} + +# The order of the dependency lines (option, chunk, setting) is irrelevant +# - the 'enables', 'requires' and 'if' settings will be used to determine +# the correct order in the output and the final values in pnglibconf.h are +# not order dependent. 'requires' and 'if' entries take precedence over +# 'enables' from other options; if an option requires another option it +# won't be set regardless of any options that enable it unless the other +# option is also enabled. +# +# Similarly 'enables' trumps a NO_ definition in CFLAGS or pngusr.h +# +# For simplicity cycles in the definitions are regarded as errors, +# even if they are not ambiguous. +# A given NAME can be specified in as many 'option' lines as required, the +# definitions are additive. + +# For backwards compatibility equivalent macros may be listed thus: +# +# = [NO_]NAME MACRO +# Makes -DMACRO equivalent to -DPNG_NO_NAME or -DPNG_NAME_SUPPORTED +# as appropriate. +# +# The definition is injected into the C compiler input when encountered +# in the second pass (so all these definitions appear *after* the @ +# lines!) +# +# 'NAME' is as above, but 'MACRO' is the full text of the equivalent +# old, deprecated, macro. + +$1 == "=" && NF == 3{ + print "#ifdef PNG_" $3 >out + if ($2 ~ /^NO_/) + print "# define PNG_" $2 >out + else + print "# define PNG_" $2 "_SUPPORTED" >out + print "#endif" >out + next +} + +# Lines may be injected into the C compiler input by preceding them +# with an "@" character. The line is copied with just the leading +# @ removed. + +$1 ~ /^@/{ + # sub(/^[ ]*@/, "") + $1 = substr($1, 2) + print >out + next +} + +# Check for unrecognized lines, because of the preprocessing chunk +# format errors will be detected on the first pass independent of +# any other format errors. +{ + print "options.awk: bad line (" NR "):", $0 + err = 1 # prevent END{} running + exit 1 +} + +# For checking purposes names that start with "ok_" or "fail_" are +# not output to pnglibconf.h and must be either enabled or disabled +# respectively for the build to succeed. This allows interdependencies +# between options of the form "at least one of" or "at most one of" +# to be checked. For example: +# +# option FLOATING_POINT enables ok_math +# option FIXED_POINT enables ok_math +# This ensures that at least one of FLOATING_POINT and FIXED_POINT +# must be set for the build to succeed. +# +# option fail_math requires FLOATING_POINT FIXED_POINT +# This means the build will fail if *both* FLOATING_POINT and +# FIXED_POINT are set (this is an example; in fact both are allowed.) +# +# If all these options were given the build would require exactly one +# of the names to be enabled. + +END{ + # END{} gets run on an exit (a traditional awk feature) + if (err) exit 1 + + if (pre) { + # Record the final value of the variables + print "deb =", deb >out + if (everything != "") { + print "everything =", everything >out + } + print "logunsupported =", logunsupported >out + exit 0 + } + + # Do the options first (allowing options to set settings). The dependency + # tree is thus: + # + # name > name + # name requires name + # name if name + # name enabledby name + # + # First build a list 'tree' by option of all the things on which + # it depends. + print "" >out + print "/* OPTIONS */" >out + print comment, "options", cend >out + for (opt in enabledby) tree[opt] = 1 # may not be explicit options + for (opt in option) if (opt != "") { + o = option[opt] + # option should always be one of the following values + if (o != "on" && o != "off" && o != "disabled" && o != "enabled") { + print "internal option error (" o ")" + exit 1 + } + tree[opt] = "" # so unlisted options marked + } + for (opt in tree) if (opt != "") { + if (tree[opt] == 1) { + tree[opt] = "" + if (option[opt] != "") { + print "internal error (1)" + exit 1 + } + # Macros only listed in 'enables' remain off unless + # one of the enabling macros is on. + option[opt] = "disabled" + } + + split("", list) # clear 'list' + # Now add every requires, iffs or enabledby entry to 'list' + # so that we can add a unique list of requirements to tree[i] + split(requires[opt] iffs[opt] enabledby[opt], r) + for (i in r) list[r[i]] = 1 + for (i in list) tree[opt] = tree[opt] " " i + } + + # print the tree for extreme debugging + if (deb > 2) for (i in tree) if (i != "") print i, "depends-on" tree[i] + + # Ok, now check all options marked explicitly 'on' or 'off': + # + # If an option[opt] is 'on' then turn on all requires[opt] + # If an option[opt] is 'off' then turn off all enabledby[opt] + # + # Error out if we have to turn 'on' to an 'off' option or vice versa. + npending = 0 + for (opt in option) if (opt != "") { + if (option[opt] == "on" || option[opt] == "off") { + pending[++npending] = opt + } + } + + err = 0 # set on error + while (npending > 0) { + opt = pending[npending--] + if (option[opt] == "on") { + nreqs = split(requires[opt], r) + for (j=1; j<=nreqs; ++j) { + if (option[r[j]] == "off") { + print "option", opt, "turned on, but requirement", r[j], "is turned off" + err = 1 + } else if (option[r[j]] != "on") { + option[r[j]] = "on" + pending[++npending] = r[j] + } + } + } else { + if (option[opt] != "off") { + print "internal error (2)" + exit 1 + } + nreqs = split(enabledby[opt], r) + for (j=1; j<=nreqs; ++j) { + if (option[r[j]] == "on") { + print "option", opt, "turned off, but enabled by", r[j], "which is turned on" + err = 1 + } else if (option[r[j]] != "off") { + option[r[j]] = "off" + pending[++npending] = r[j] + } + } + } + } + if (err) exit 1 + + # Sort options: + print "PNG_DFN_START_SORT 2" >out + + # option[i] is now the complete list of all the tokens we may + # need to output, go through it as above, depth first. + finished = 0 + while (!finished) { + finished = 1 + movement = 0 # done nothing + for (i in option) if (!done[i]) { + nreqs = split(tree[i], r) + if (nreqs > 0) { + for (j=1; j<=nreqs; ++j) if (!done[r[j]]) { + break + } + if (j<=nreqs) { + finished = 0 + continue # next option + } + } + + # All the requirements have been processed, output + # this option. An option is _SUPPORTED if: + # + # all 'requires' are _SUPPORTED AND + # at least one of the 'if' options are _SUPPORTED AND + # EITHER: + # The name is _SUPPORTED (on the command line) + # OR: + # an 'enabledby' is _SUPPORTED + # OR: + # NO_name is not defined AND + # the option is not disabled; an option is disabled if: + # option == off + # option == disabled && everything != on + # option == "" && everything == off + if (deb) print "option", i + print "" >out + print "/* option:", i, option[i] >out + print " * requires: " requires[i] >out + print " * if: " iffs[i] >out + print " * enabled-by:" enabledby[i] >out + print " * sets: " sets[i], "*/" >out + print "#undef PNG_on" >out + print "#define PNG_on 1" >out + + # requires + nreqs = split(requires[i], r) + for (j=1; j<=nreqs; ++j) { + print "#ifndef PNG_" r[j] "_SUPPORTED" >out + print "# undef PNG_on /*!" r[j] "*/" >out + # This error appears in the final output if something + # was switched 'on' but the processing above to force + # the requires did not work + if (option[i] == "on") { + print error, i, "requires", r[j] end >out + } + print "#endif" >out + } + + # if + have_ifs = 0 + nreqs = split(iffs[i], r) + print "#undef PNG_no_if" >out + if (nreqs > 0) { + have_ifs = 1 + print "/* if" iffs[i], "*/" >out + print "#define PNG_no_if 1" >out + for (j=1; j<=nreqs; ++j) { + print "#ifdef PNG_" r[j] "_SUPPORTED" >out + print "# undef PNG_no_if /*" r[j] "*/" >out + print "#endif" >out + } + print "#ifdef PNG_no_if /*missing if*/" >out + print "# undef PNG_on" >out + # There is no checking above for this, because we + # don't know which 'if' to choose, so whine about + # it here: + if (option[i] == "on") { + print error, i, "needs one of:", iffs[i] end >out + } + print "#endif" >out + } + + print "#ifdef PNG_on /*requires, if*/" >out + # enables + print "# undef PNG_not_enabled" >out + print "# define PNG_not_enabled 1" >out + print " /* enabled by" enabledby[i], "*/" >out + nreqs = split(enabledby[i], r) + for (j=1; j<=nreqs; ++j) { + print "#ifdef PNG_" r[j] "_SUPPORTED" >out + print "# undef PNG_not_enabled /*" r[j] "*/" >out + # Oops, probably not intended (should be factored + # out by the checks above). + if (option[i] == "off") { + print error, i, "enabled by:", r[j] end >out + } + print "#endif" >out + } + + print "# ifndef PNG_" i "_SUPPORTED /*!command line*/" >out + print "# ifdef PNG_not_enabled /*!enabled*/" >out + # 'have_ifs' here means that everything = "off" still allows an 'if' on + # an otherwise enabled option to turn it on; otherwise the 'if' + # handling is effectively disabled by 'everything = off' + if (option[i] == "off" || option[i] == "disabled" && everything != "on" || option[i] == "enabled" && everything == "off" && !have_ifs) { + print "# undef PNG_on /*default off*/" >out + } else { + print "# ifdef PNG_NO_" i >out + print "# undef PNG_on /*turned off*/" >out + print "# endif" >out + print "# ifdef PNG_NO_" i "_SUPPORTED" >out + print "# undef PNG_on /*turned off*/" >out + print "# endif" >out + } + print "# endif /*!enabled*/" >out + print "# ifdef PNG_on" >out + # The _SUPPORTED macro must be defined so that dependent + # options output later work. + print "# define PNG_" i "_SUPPORTED" >out + print "# endif" >out + print "# endif /*!command line*/" >out + # If PNG_on is still set the option should be defined in + # pnglibconf.h + print "# ifdef PNG_on" >out + if (i ~ /^fail_/) { + print error, i, "is on: enabled by:" iffs[i] enabledby[i] ", requires" requires[i] end >out + } else if (i !~ /^ok_/) { + print def i sup >out + # Supported option, set required settings + nreqs = split(sets[i], r) + for (j=1; j<=nreqs; ++j) { + print "# ifdef PNG_set_" r[j] >out + # Some other option has already set a value: + print error, i, "sets", r[j] ": duplicate setting" end >out + print error, " previous value: " end "PNG_set_" r[j] >out + print "# else" >out + # Else set the default: note that this won't accept arbitrary + # values, the setval string must be acceptable to all the C + # compilers we use. That means it must be VERY simple; a number, + # a name or a string. + print "# define PNG_set_" r[j], setval[i " " r[j]] >out + print "# endif" >out + } + } + print "# endif /* definition */" >out + print "#endif /*requires, if*/" >out + if (logunsupported || i ~ /^ok_/) { + print "#ifndef PNG_on" >out + if (logunsupported) { + print und i une >out + } + if (i ~ /^ok_/) { + print error, i, "not enabled: requires:" requires[i] ", enabled by:" iffs[i] enabledby[i] end >out + } + print "#endif" >out + } + + done[i] = 1 + ++movement + } + + if (!finished && !movement) { + print "option: loop or missing option in dependency tree, cannot process:" + for (i in option) if (!done[i]) { + print " option", i, "depends on" tree[i], "needs:" + nreqs = split(tree[i], r) + if (nreqs > 0) for (j=1; j<=nreqs; ++j) if (!done[r[j]]) { + print " " r[j] + } + } + exit 1 + } + } + print "PNG_DFN_END_SORT" >out + print comment, "end of options", cend >out + + # Do the 'setting' values second, the algorithm the standard + # tree walk (O(1)) done in an O(2) while/for loop; iterations + # settings x depth, outputting the deepest required macros + # first. + print "" >out + print "/* SETTINGS */" >out + print comment, "settings", cend >out + # Sort (in dfn.awk) on field 2, the setting name + print "PNG_DFN_START_SORT 2" >out + finished = 0 + while (!finished) { + finished = 1 + movement = 0 # done nothing + for (i in setting) if (!doneset[i]) { + nreqs = split(setting[i], r) + if (nreqs > 0) { + # By default assume the requires values are options, but if there + # is no option with that name check for a setting + for (j=1; j<=nreqs; ++j) if (option[r[j]] == "" && !doneset[r[j]]) { + break + } + if (j<=nreqs) { + finished = 0 + continue # try a different setting + } + } + + # All the requirements have been processed, output + # this setting. + if (deb) print "setting", i + deflt = defaults[i] + # Remove any spurious trailing spaces + sub(/ *$/,"",deflt) + # A leading @ means leave it unquoted so the preprocessor + # can substitute the build time value + if (deflt ~ /^ @/) + deflt = " " subs substr(deflt, 3) sube + print "" >out + print "/* setting: ", i >out + print " * requires:" setting[i] >out + print " * default: ", defaults[i] deflt, "*/" >out + for (j=1; j<=nreqs; ++j) { + if (option[r[j]] != "") + print "#ifndef PNG_" r[j] "_SUPPORTED" >out + else + print "#ifndef PNG_" r[j] >out + print error, i, "requires", r[j] end >out + print "# endif" >out + } + # The precedence is: + # + # 1) External definition; trumps: + # 2) Option 'sets' value; trumps: + # 3) Setting 'default' + # + print "#ifdef PNG_" i >out + # PNG_ is defined, so substitute the value: + print def i, subs "PNG_" i sube end >out + print "#else /* use default */" >out + print "# ifdef PNG_set_" i >out + # Value from an option 'sets' argument + print def i, subs "PNG_set_" i sube end >out + # This is so that subsequent tests on the setting work: + print "# define PNG_" i, "1" >out + if (defaults[i] != "") { + print "# else /*default*/" >out + print def i deflt end >out + print "# define PNG_" i, "1" >out + } + print "# endif /* defaults */" >out + print "#endif /* setting", i, "*/" >out + + doneset[i] = 1 + ++movement + } + + if (!finished && !movement) { + print "setting: loop or missing setting in 'requires', cannot process:" + for (i in setting) if (!doneset[i]) { + print " setting", i, "requires" setting[i] + } + exit 1 + } + } + print "PNG_DFN_END_SORT" >out + print comment, "end of settings", cend >out + + # Regular end - everything looks ok + if (protect != "") { + print start "#endif", "/*", protect, "*/" end >out + } +} diff --git a/xs/src/png/libpng/scripts/pnglibconf.dfa b/xs/src/png/libpng/scripts/pnglibconf.dfa new file mode 100644 index 0000000000..b298a72f3b --- /dev/null +++ b/xs/src/png/libpng/scripts/pnglibconf.dfa @@ -0,0 +1,919 @@ +# scripts/pnglibconf.dfa - library build configuration control +# +@/*- pnglibconf.dfn intermediate file +@ * generated from scripts/pnglibconf.dfa +@ */ +# +com pnglibconf.h - library build configuration +com +version +com +com Copyright (c) 1998-2017 Glenn Randers-Pehrson +com +com This code is released under the libpng license. +com For conditions of distribution and use, see the disclaimer +com and license in png.h +com + +file pnglibconf.h scripts/pnglibconf.dfa PNGLCONF_H + +# This file is preprocessed by scripts/options.awk and the +# C compiler to generate 'pnglibconf.h' - a list of all the +# configuration options. The file lists the various options +# that can *only* be specified during the libpng build; +# pnglibconf.h freezes the definitions selected for the specific +# build. +# +# The syntax is detailed in scripts/options.awk; this is a summary +# only: +# +# setting [requires ...] [default] +# #define PNG_ /* value comes from current setting */ +# option [requires ...] [if ...] [enables ...] [disabled] +# #define PNG__SUPPORTED if the requirements are met and +# enable the other options listed +# chunk [requires ...] [enables ...] [disabled] +# Enable chunk processing for the given ancillary chunk; any +# 'requires something' expands to READ_something for read and +# WRITE_something for write, but the enables list members are +# used as given (e.g. enables GAMMA just expands to that on the +# correspond READ_name and WRITE_name lines.) +# +# "," may be used to separate options on an 'option' line and is ignored; it +# doesn't change the meaning of the line. (NOT setting, where "," becomes +# part of the setting!) A comma at the end of an option line causes a +# continuation (the next line is included in the option too.) +# +# Note that the 'on' and 'off' keywords, while valid on both option +# and chunk, should not be used in this file because they force the +# relevant options on or off. + +#---------------------------------------------------------------------- + +# The following setting, option and chunk values can all be changed +# while building libpng: +# +# setting: change 'setting' lines to fine tune library performance; +# changes to the settings don't affect the libpng API functionally +# +# option: change 'option' lines to remove or add capabilities from +# or to the library; options change the library API +# +# chunk: change 'chunk' lines to remove capabilities to process +# optional ('ancillary') chunks. This does not prevent PNG +# decoding but does change the libpng API because some chunks +# will be ignored. +# +# There are three ways of disabling features, in no particular order: +# +# 1) Create 'pngusr.h', enter the required private build information +# detailed below and #define PNG_NO_