Hello community,

here is the log from the commit of package zlib for openSUSE:Factory
checked in at Sun Apr 3 12:14:01 CEST 2011.

--------
--- zlib/zlib.changes   2011-01-09 14:33:39.000000000 +0100
+++ /mounts/work_src_done/STABLE/zlib/zlib.changes      2011-03-30 21:48:31.000000000 +0200
@@ -1,0 +2,11 @@
+Wed Mar 30 19:47:30 UTC 2011 - crrodriguez@opensuse.org
+
+- Update SSE2/MMX patches to version 2.
+
+-------------------------------------------------------------------
+Tue Mar 15 22:38:32 UTC 2011 - crrodriguez@opensuse.org
+
+- Add highly experimental patches to use SSE2/SSSE3/MMX in zlib
+  this makes the library up to 6 times faster.
+
+-------------------------------------------------------------------

calling whatdependson for head-i586

New:
----
  01-prepare.patch
  02-ppc_altivec.patch
  03-arm.patch
  04-x86.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ zlib.spec ++++++
--- /var/tmp/diff_new_pack.Tg1qVv/_old  2011-04-03 12:13:22.000000000 +0200
+++ /var/tmp/diff_new_pack.Tg1qVv/_new  2011-04-03 12:13:22.000000000 +0200
@@ -1,5 +1,5 @@
 #
-# spec file for package zlib (Version 1.2.5)
+# spec file for package zlib
 #
 # Copyright (c) 2011 SUSE LINUX Products GmbH, Nuernberg, Germany.
 #
@@ -28,7 +28,7 @@
 %endif
 # Version: 1.2.5
-Release: 5
+Release: 11
 Summary: Data Compression Library
 Url: http://www.zlib.net/
 Source: zlib-%{version}.tar.bz2
@@ -40,6 +40,10 @@
 Patch1: zlib-lfs.patch
 # PATCH-FIX-JENGELH-PARALLEL-MAKE zlib-parallel.patch meissner@novell.com -- shared library links with libz.a
 Patch2: zlib-parallel.patch
+Patch3: 01-prepare.patch
+Patch4: 02-ppc_altivec.patch
+Patch5: 03-arm.patch
+Patch6: 04-x86.patch
 BuildRoot: %{_tmppath}/%{name}-%{version}-build
 BuildRequires: pkgconfig
@@ -84,6 +88,10 @@
 %patch0
 %patch1
 %patch2 -p1
+%patch3
+%patch4
+%patch5
+%patch6
 %build
 # Marcus: breaks example64 in 32bit builds.

++++++ 01-prepare.patch ++++++
=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 01:01:37 +0000
+++ Makefile.in 2011-03-14 02:19:21 +0000
@@ -236,7 +236,8 @@
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-adler32.o zutil.o: zutil.h zlib.h zconf.h
+adler32.o: adler32.c zutil.h zlib.h zconf.h
+zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
 crc32.o: zutil.h zlib.h zconf.h crc32.h
@@ -246,7 +247,8 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
 
-adler32.lo zutil.lo: zutil.h zlib.h zconf.h
+adler32.lo: adler32.c zutil.h zlib.h zconf.h
+zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
 crc32.lo: zutil.h zlib.h zconf.h crc32.h

=== modified file 'adler32.c'
--- adler32.c   2011-03-14 01:01:37 +0000
+++ adler32.c   2011-03-30 13:38:42 +0000
@@ -9,6 +9,35 @@
 
 #define local static
 
+#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x)
+
+#if GCC_VERSION_GE(301)
+/* sometimes leaks out of old kernel headers */
+# undef noinline
+# define noinline __attribute__((__noinline__))
+#else
+# ifndef noinline
+# define noinline
+# endif
+#endif
+
+#if GCC_VERSION_GE(301)
+# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__))
+#else
+# define GCC_ATTR_UNUSED_PARAM
+#endif
+
+#if GCC_VERSION_GE(296)
+# define likely(x) __builtin_expect(!!(x), 1)
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+# define likely(x) (x)
+# define unlikely(x) (x)
+#endif
+
+#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+
 local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
 
 #define BASE 65521UL /* largest prime smaller than 65536 */
@@ -21,9 +50,20 @@
 #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
 #define DO16(buf) DO8(buf,0); DO8(buf,8);
 
+#if defined(__alpha__)
+/* even if gcc can generate a mul by inverse, the code is really
+ * ugly (find global const pool pointer, load constant, a mul, lots
+ * of shifts/add/sub), up to 14 instructions. The replacement code
+ * only needs >= 5 instructions
+ */
+# define NO_DIVIDE
+#endif
+
 /* use NO_DIVIDE if your processor does not do division in hardware */
 #ifdef NO_DIVIDE
-# define MOD(a) \
+/* use NO_SHIFT if your processor does shifts > 1 with a loop */
+# ifdef NO_SHIFT
+# define reduce_full(a) \
     do { \
         if (a >= (BASE << 16)) a -= (BASE << 16); \
         if (a >= (BASE << 15)) a -= (BASE << 15); \
@@ -43,21 +83,237 @@
         if (a >= (BASE << 1)) a -= (BASE << 1); \
         if (a >= BASE) a -= BASE; \
     } while (0)
-# define MOD4(a) \
+# define reduce_x(a) \
     do { \
+        if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \
+        if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \
         if (a >= (BASE << 4)) a -= (BASE << 4); \
         if (a >= (BASE << 3)) a -= (BASE << 3); \
         if (a >= (BASE << 2)) a -= (BASE << 2); \
         if (a >= (BASE << 1)) a -= (BASE << 1); \
         if (a >= BASE) a -= BASE; \
     } while (0)
+# define reduce(a) reduce_full(a)
+# else
+# define reduce_full(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(a >= BASE)
+# define reduce_x(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+        a = a >= BASE ? a - BASE : a; \
+    } while(0)
+# define reduce(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(0)
+# endif
 #else
-# define MOD(a) a %= BASE
-# define MOD4(a) a %= BASE
-#endif
-
-/* ========================================================================= */
-uLong ZEXPORT adler32(adler, buf, len)
+# define reduce_full(a) a %= BASE
+# define reduce_x(a) a %= BASE
+# define reduce(a) a %= BASE
+#endif
+
+local int host_is_bigendian()
+{
+    local const union {
+        uInt d;
+        unsigned char endian[sizeof(uInt)];
+    } x = {1};
+    return x.endian[0] == 0;
+}
+
+#ifndef MIN_WORK
+# define MIN_WORK 16
+#endif
+
+/* ========================================================================= */
+local noinline uLong adler32_1(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len GCC_ATTR_UNUSED_PARAM;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    adler += buf[0];
+    if (adler >= BASE)
+        adler -= BASE;
+    sum2 += adler;
+    if (sum2 >= BASE)
+        sum2 -= BASE;
+    return adler | (sum2 << 16);
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_common(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    while (len--) {
+        adler += *buf++;
+        sum2 += adler;
+    }
+    if (adler >= BASE)
+        adler -= BASE;
+    reduce_x(sum2); /* only added so many BASE's */
+    return adler | (sum2 << 16);
+}
+
+#ifndef HAVE_ADLER32_VEC
+# if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC)
+
+/* On 64 bit archs we can do pseudo SIMD with a nice win.
+ * This is esp. important for old Alphas, which do not have
+ * byte access.
+ * This needs some registers, but x86_64 is fine (the main loop
+ * requires >= 9). If your 64 bit arch is more limited, throw it away...
+ */
+# ifndef UINT64_C
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+# define UINT64_C(c) (c ## ui64)
+# else
+# define UINT64_C(c) (c ## ULL)
+# endif
+# endif
+
+# undef VNMAX
+# define VNMAX (2*NMAX+((9*NMAX)/10))
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    /* split Adler-32 into component sums */
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    /* align input data */
+    k = ALIGN_DIFF(buf, sizeof(size_t));
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while(--k);
+
+    k = len > VNMAX ? VNMAX : len;
+    len -= k;
+    if (likely(k >= 2 * sizeof(size_t))) do
+    {
+        unsigned int vs1, vs2;
+        unsigned int vs1s;
+
+        /* add s1 to s2 for rounds to come */
+        s2 += s1 * ROUND_TO(k, sizeof(size_t));
+        vs1s = vs1 = vs2 = 0;
+        do {
+            size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0;
+            unsigned int a, b, c, d, e, f, g, h;
+            unsigned int j;
+
+            j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t);
+            k -= j * sizeof(size_t);
+            /* add s1 to s1 round sum for rounds to come */
+            vs1s += j * vs1;
+            do {
+                size_t in8 = *(const size_t *)buf;
+                buf += sizeof(size_t);
+                /* add this s1 to s1 round sum */
+                vs1l_s += vs1l;
+                vs1h_s += vs1h;
+                /* add up input data to s1 */
+                vs1l += in8 & UINT64_C(0x00ff00ff00ff00ff);
+                vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8;
+            } while(--j);
+
+            /* split s1 */
+            if(host_is_bigendian()) {
+                a = (vs1h >> 48) & 0x0000ffff;
+                b = (vs1l >> 48) & 0x0000ffff;
+                c = (vs1h >> 32) & 0x0000ffff;
+                d = (vs1l >> 32) & 0x0000ffff;
+                e = (vs1h >> 16) & 0x0000ffff;
+                f = (vs1l >> 16) & 0x0000ffff;
+                g = (vs1h      ) & 0x0000ffff;
+                h = (vs1l      ) & 0x0000ffff;
+            } else {
+                a = (vs1l      ) & 0x0000ffff;
+                b = (vs1h      ) & 0x0000ffff;
+                c = (vs1l >> 16) & 0x0000ffff;
+                d = (vs1h >> 16) & 0x0000ffff;
+                e = (vs1l >> 32) & 0x0000ffff;
+                f = (vs1h >> 32) & 0x0000ffff;
+                g = (vs1l >> 48) & 0x0000ffff;
+                h = (vs1h >> 48) & 0x0000ffff;
+            }
+
+            /* add s1 & s2 horiz. */
+            vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h;
+            vs1 += a + b + c + d + e + f + g + h;
+
+            /* split and add up s1 round sum */
+            vs1l_s = ((vs1l_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1h_s = ((vs1h_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1l_s += vs1h_s;
+            vs1s += ((vs1l_s      ) & UINT64_C(0x00000000ffffffff)) +
+                    ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff));
+        } while (k >= sizeof(size_t));
+        reduce(vs1s);
+        s2 += vs1s * 8 + vs2;
+        reduce(s2);
+        s1 += vs1;
+        reduce(s1);
+        len += k;
+        k = len > VNMAX ? VNMAX : len;
+        len -= k;
+    } while (k >= sizeof(size_t));
+
+    /* handle trailer */
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+    reduce(s1);
+    reduce(s2);
+
+    /* return recombined sums */
+    return (s2 << 16) | s1;
+}
+
+# else
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
     uLong adler;
     const Bytef *buf;
     uInt len;
@@ -69,33 +325,6 @@
     sum2 = (adler >> 16) & 0xffff;
     adler &= 0xffff;
 
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (len == 1) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (buf == Z_NULL)
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (len < 16) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD4(sum2); /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
     /* do length NMAX blocks -- requires just one modulo operation */
     while (len >= NMAX) {
         len -= NMAX;
@@ -104,8 +333,8 @@
         DO16(buf); /* 16 sums unrolled */
         buf += 16;
     } while (--n);
-    MOD(adler);
-    MOD(sum2);
+    reduce_full(adler);
+    reduce_full(sum2);
 }
 
 /* do remaining bytes (less than NMAX, still just one modulo) */
 if (len) {
@@ -119,13 +348,36 @@
         adler += *buf++;
         sum2 += adler;
     }
-    MOD(adler);
-    MOD(sum2);
+    reduce_full(adler);
+    reduce_full(sum2);
 }
 
 /* return recombined sums */
 return adler | (sum2 << 16);
 }
+# endif
+#endif
+
+/* ========================================================================= */
+uLong ZEXPORT adler32(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_1(adler, buf, len); /* should create a fast tailcall */
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == Z_NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < MIN_WORK)
+        return adler32_common(adler, buf, len);
+
+    return adler32_vec(adler, buf, len);
+}
 
 /* ========================================================================= */
 local uLong adler32_combine_(adler1, adler2, len2)
@@ -141,7 +393,7 @@
     rem = (unsigned)(len2 % BASE);
     sum1 = adler1 & 0xffff;
     sum2 = rem * sum1;
-    MOD(sum2);
+    reduce_full(sum2);
     sum1 += (adler2 & 0xffff) + BASE - 1;
     sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
     if (sum1 >= BASE) sum1 -= BASE;
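
The heart of 01-prepare.patch is the NO_DIVIDE rework. It relies on BASE being 65521 = 2^16 - 15, so 2^16 is congruent to 15 (mod BASE): a sum can be shrunk by folding its high 16 bits, times 15, back into the low 16 bits. The macros write high*15 + low as (high << 4) + (low - high) to trade the multiply for a shift; reduce() does one such partial folding step, reduce_full() folds until the value drops below BASE, and reduce_x() finishes one fold with a conditional subtract. A standalone sketch of the identity (our code, not part of any patch here; fold() and reduce32() are illustrative names, and reduce32() assumes inputs below 2^32):

    #include <assert.h>
    #include <stdio.h>

    #define BASE 65521UL /* largest prime smaller than 65536, i.e. 2^16 - 15 */

    /* One folding step: returns high*15 + low, which is congruent to
     * a (mod BASE) because 2^16 == 15 (mod BASE). Any unsigned
     * wraparound in b - high cancels out in the final addition. */
    static unsigned long fold(unsigned long a)
    {
        unsigned long b = a & 0x0000ffff;
        a >>= 16;
        b -= a;      /* low - high */
        a <<= 4;     /* high * 16  */
        a += b;      /* high * 15 + low */
        return a;
    }

    /* Full reduction for 32-bit inputs: two folds bring the value
     * below 2*BASE, then one conditional subtract finishes. */
    static unsigned long reduce32(unsigned long a)
    {
        a = fold(a);   /* now at most 16 * 65535 */
        a = fold(a);   /* now below 2 * BASE     */
        if (a >= BASE)
            a -= BASE;
        return a;
    }

    int main(void)
    {
        unsigned long v;

        for (v = 0; v < (1UL << 24); v += 12345)
            assert(reduce32(v) == v % BASE);
        assert(reduce32(0xffffffffUL) == 0xffffffffUL % BASE);
        puts("folding reduction matches plain modulo");
        return 0;
    }
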
++++++ 02-ppc_altivec.patch ++++++
=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 02:19:21 +0000
+++ Makefile.in 2011-03-14 03:06:03 +0000
@@ -236,7 +236,7 @@
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-adler32.o: adler32.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
 
-adler32.lo: adler32.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h

=== modified file 'adler32.c'
--- adler32.c   2011-03-30 13:38:42 +0000
+++ adler32.c   2011-03-30 13:38:46 +0000
@@ -36,7 +36,10 @@
 #endif
 
 #define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b))
 #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))
 
 local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
 
@@ -136,6 +139,12 @@
     return x.endian[0] == 0;
 }
 
+#ifndef NO_ADLER32_VEC
+# if defined(__powerpc__) || defined(__powerpc64__)
+# include "adler32_ppc.c"
+# endif
+#endif
+
 #ifndef MIN_WORK
 # define MIN_WORK 16
 #endif

=== added file 'adler32_ppc.c'
--- adler32_ppc.c       1970-01-01 00:00:00 +0000
+++ adler32_ppc.c       2011-03-30 11:12:04 +0000
@@ -0,0 +1,253 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ * ppc implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+/*
+ * We use the Altivec PIM vector stuff, but still, this is only
+ * tested with GCC, and probably uses some GCC specifics (like GCC
+ * understands vector types and you can simply write a += b)
+ */
+#if defined(__ALTIVEC__) && defined(__GNUC__)
+# define HAVE_ADLER32_VEC
+/* it needs some bytes till the vec version gets up to speed... */
+# define MIN_WORK 64
+# include <altivec.h>
+
+/*
+ * Depending on length, this can be slower (short length < 64 bytes),
+ * much faster (our beloved 128kb, 22.2s generic to 3.4s vec, but cache
+ * is important...), to a little faster (very long length, 1.6MB, 47.6s
+ * to 36s), which is probably only capped by memory bandwidth.
+ * (The orig. 128k case was slower in AltiVec, because AltiVec loads
+ * are always uncached and trigger no HW prefetching, because that is
+ * what you often need with mass data manipulation (not poisoning your
+ * cache, movntq), instead you have to do it for yourself (data stream
+ * touch). With 128k it could be cleanly seen: no prefetch, half as slow
+ * as generic, but comment out the memory load -> 3s. With proper prefetch
+ * we are at 3.4s. So AltiVec can execute these "expensive" FMA quite
+ * fast (even without fancy unrolling), only the data does not arrive
+ * fast enough. In cases where the working set does not fit into cache
+ * it simply cannot be delivered fast enough over the FSB/Mem).
+ * Still we have to prefetch, or we are slow as hell.
+ */
+
+# define SOVUC (sizeof(vector unsigned char))
+
+/* can probably be more, since we do not have the x86 psadbw 64 bit sum */
+# define VNMAX (6*NMAX)
+
+/* ========================================================================= */
+local inline vector unsigned char vec_identl(level)
+    unsigned int level;
+{
+    return vec_lvsl(level, (const unsigned char *)0);
+}
+
+/* ========================================================================= */
+local inline vector unsigned char vec_ident_rev(void)
+{
+    return vec_xor(vec_identl(0), vec_splat_u8(15));
+}
+
+/* ========================================================================= */
+/* multiply two 32 bit ints, return the low 32 bit */
+local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b)
+{
+    vector unsigned int v16 = vec_splat_u32(-16);
+    vector unsigned int v0_32 = vec_splat_u32(0);
+    vector unsigned int swap, low, high;
+
+    swap = vec_rl(b, v16);
+    low = vec_mulo((vector unsigned short)a, (vector unsigned short)b);
+    high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32);
+    high = vec_sl(high, v16);
+    return vec_add(low, high);
+}

+/* ========================================================================= */
+local inline vector unsigned int vector_reduce(vector unsigned int x)
+{
+    vector unsigned int y;
+    vector unsigned int vsh;
+
+    vsh = vec_splat_u32(1);
+    vsh = vec_sl(vsh, vec_splat_u32(4));
+
+    y = vec_sl(x, vsh);
+    y = vec_sr(y, vsh);
+    x = vec_sr(x, vsh);
+    y = vec_sub(y, x);
+    x = vec_sl(x, vec_splat_u32(4));
+    x = vec_add(x, y);
+    return x;
+}
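
vec_mullw() above synthesizes the 32-bit multiply that AltiVec lacks out of 16-bit ones: with a = ah*2^16 + al and b = bh*2^16 + bl, the low 32 bits of a*b are al*bl + ((ah*bl + al*bh) << 16). vec_mulo() supplies the al*bl products and vec_msum() on a halfword-swapped b supplies ah*bl + al*bh in one instruction (lane numbering varies with endianness; the scalar model below ignores that). A sketch of the identity in plain C, ours and purely illustrative:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of vec_mullw(): 32x32 -> low 32 bit multiply built
     * from 16-bit pieces. The ah*bh term only affects bits >= 32 and
     * drops out; overflow in "high" beyond 16 bits is shifted away. */
    static uint32_t mullw_model(uint32_t a, uint32_t b)
    {
        uint32_t swap = (b << 16) | (b >> 16);          /* vec_rl(b, 16) */
        uint32_t low  = (a & 0xffff) * (b & 0xffff);    /* vec_mulo      */
        uint32_t high = (a >> 16) * (swap >> 16)        /* vec_msum:     */
                      + (a & 0xffff) * (swap & 0xffff); /* ah*bl + al*bh */
        return low + (high << 16);                      /* vec_sl + add  */
    }

    int main(void)
    {
        uint32_t a = 0x9e3779b9u, b = 0x85ebca6bu;
        int i;

        for (i = 0; i < 100000; i++) {
            assert(mullw_model(a, b) == a * b);
            a = a * 1664525u + 1013904223u;  /* LCG stepping through samples */
            b = b * 22695477u + 1u;
        }
        puts("16-bit decomposition matches 32-bit multiply");
        return 0;
    }
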
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    if (likely(len >= 2*SOVUC)) {
+        vector unsigned int v0_32 = vec_splat_u32(0);
+        vector unsigned int vsh = vec_splat_u32(4);
+        vector unsigned char v1 = vec_splat_u8(1);
+        vector unsigned char vord;
+        vector unsigned char v0 = vec_splat_u8(0);
+        vector unsigned int vs1, vs2;
+        vector unsigned char in16, vord_a, v1_a, vperm;
+        unsigned int f, n;
+        unsigned int k, block_num;
+
+        /*
+         * if I understand the Altivec PEM right, little
+         * endian impl. should have the data reversed on
+         * load, so the big endian vorder works.
+         */
+        vord = vec_ident_rev() + v1;
+        block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+        f = 512;
+        f |= block_num >= 256 ? 0 : block_num << 16;
+        vec_dst(buf, f, 2);
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* swizzle masks in place */
+        vperm = vec_lvsl(0, buf);
+        vord_a = vec_perm(vord, v0, vperm);
+        v1_a = vec_perm(v1, v0, vperm);
+        vperm = vec_lvsr(0, buf);
+        vord_a = vec_perm(v0, vord_a, vperm);
+        v1_a = vec_perm(v0, v1_a, vperm);
+
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC);
+        n = SOVUC - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+
+        /* insert scalar start somewhere */
+        vs1 = vec_lde(0, &s1);
+        vs2 = vec_lde(0, &s2);
+
+        /* get input data */
+        in16 = vec_ldl(0, buf);
+
+        /* mask out excess data, add 4 byte horizontal and add to old dword */
+        vs1 = vec_msum(in16, v1_a, vs1);
+
+        /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+        vs2 = vec_msum(in16, vord_a, vs2);
+
+        buf += SOVUC;
+        k -= n;
+
+        if (likely(k >= SOVUC)) do {
+            vector unsigned int vs1_r = v0_32;
+            f = 512;
+            f |= block_num >= 256 ? 0 : block_num << 16;
+            vec_dst(buf, f, 2);
+
+            do {
+                /* get input data */
+                in16 = vec_ldl(0, buf);
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* add 4 byte horizontal and add to old dword */
+                vs1 = vec_sum4s(in16, vs1);
+                /* apply order, add 4 byte horizontal and add to old dword */
+                vs2 = vec_msum(in16, vord, vs2);
+
+                buf += SOVUC;
+                k -= SOVUC;
+            } while (k >= SOVUC);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add all vs1 for 16 times */
+            vs2 += vec_sl(vs1_r, vsh);
+            /* reduce the vectors to something in the range of BASE */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned)len : VNMAX;
+            block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            vector unsigned int vk;
+            /*
+             * handle trailer
+             */
+            f = SOVUC - k;
+            /* swizzle masks in place */
+            vperm = vec_identl(f);
+            vord_a = vec_perm(vord, v0, vperm);
+            v1_a = vec_perm(v1, v0, vperm);
+
+            /* add k times vs1 for this trailer */
+            vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k);
+            vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk);
+            vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk);
+            vk = vec_splat(vk, 0);
+            vs2 += vec_mullw(vs1, vk);
+
+            /* get input data */
+            in16 = vec_ldl(0, buf);
+
+            /* mask out excess data, add 4 byte horizontal and add to old dword */
+            vs1 = vec_msum(in16, v1_a, vs1);
+            /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+            vs2 = vec_msum(in16, vord_a, vs2);
+
+            buf += k;
+            k -= k;
+        }
+
+        vec_dss(2);
+
+        /* add horizontal */
+        /* stuff should be reduced, so no problem with signed saturation */
+        vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32);
+        vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32);
+        /* shake and roll */
+        vs1 = vec_splat(vs1, 3);
+        vs2 = vec_splat(vs2, 3);
+        vec_ste(vs1, 0, &s1);
+        vec_ste(vs2, 0, &s2);
+        /* after horizontal add, reduce again in scalar code */
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce(s1);
+    reduce(s2);
+
+    return (s2 << 16) | s1;
+}
+
+#endif
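
All the vector paths in these patches rest on the same blockwise algebra: folding in N bytes b_0..b_(N-1) at once advances s2 by N*s1 plus the weighted sum N*b_0 + (N-1)*b_1 + ... + 1*b_(N-1), while s1 advances by the plain byte sum. That is what the descending vord = {16,...,1} weights in vec_msum() and the per-round vs1_r accumulator implement. A scalar sketch of that update (our names, not the patch's):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Classic byte-at-a-time Adler-32 core (no modulo, short input). */
    static void adler_serial(const unsigned char *p, size_t n,
                             unsigned long *s1, unsigned long *s2)
    {
        while (n--) {
            *s1 += *p++;
            *s2 += *s1;
        }
    }

    /* Blockwise update over N bytes at once, the way the SIMD code works:
     *   s2' = s2 + N*s1 + N*b[0] + (N-1)*b[1] + ... + 1*b[N-1]
     *   s1' = s1 + b[0] + ... + b[N-1]
     * The weighted sum is vec_msum(in, vord, ...) with vord = {N..1};
     * the N*s1 term is what vs1_r accumulates across rounds. */
    static void adler_block(const unsigned char *p, size_t N,
                            unsigned long *s1, unsigned long *s2)
    {
        unsigned long byte_sum = 0, weighted = 0;
        size_t i;

        for (i = 0; i < N; i++) {
            byte_sum += p[i];
            weighted += (N - i) * p[i];
        }
        *s2 += N * *s1 + weighted;
        *s1 += byte_sum;
    }

    int main(void)
    {
        unsigned char buf[64];
        unsigned long a1 = 1, a2 = 0, b1 = 1, b2 = 0;
        size_t i;

        for (i = 0; i < sizeof(buf); i++)
            buf[i] = (unsigned char)(i * 77 + 5);

        adler_serial(buf, 32, &a1, &a2);
        adler_serial(buf + 32, 32, &a1, &a2);
        adler_block(buf, 32, &b1, &b2);
        adler_block(buf + 32, 32, &b1, &b2);
        assert(a1 == b1 && a2 == b2);
        puts("blockwise update matches serial sums");
        return 0;
    }
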
++++++ 03-arm.patch ++++++
=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 03:06:03 +0000
+++ Makefile.in 2011-03-14 14:39:24 +0000
@@ -236,7 +236,7 @@
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
 zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
 
-adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
 zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h

=== modified file 'adler32.c'
--- adler32.c   2011-03-30 13:38:46 +0000
+++ adler32.c   2011-03-30 13:38:46 +0000
@@ -140,7 +140,9 @@
 }
 
 #ifndef NO_ADLER32_VEC
-# if defined(__powerpc__) || defined(__powerpc64__)
+# if defined(__arm__)
+# include "adler32_arm.c"
+# elif defined(__powerpc__) || defined(__powerpc64__)
 # include "adler32_ppc.c"
 # endif
 #endif

=== added file 'adler32_arm.c'
--- adler32_arm.c       1970-01-01 00:00:00 +0000
+++ adler32_arm.c       2011-03-30 11:18:49 +0000
@@ -0,0 +1,359 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ * arm implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#if defined(__ARM_NEON__)
+// TODO: need byte order define
+/*
+ * Big endian NEON qwords are kind of broken.
+ * They are big endian within the dwords, but the WRONG
+ * (really??) way round between lo and hi,
+ * creating some kind of PDP11 middle endian.
+ *
+ * This is madness and unsupportable. For this reason
+ * GCC wants to disable qword endian specific patterns.
+ * We would need a preprocessor define telling us which
+ * endian we have, to disable this code.
+ */
+# include <arm_neon.h>
+
+# define SOVUCQ sizeof(uint8x16_t)
+# define SOVUC sizeof(uint8x8_t)
+/* since we do not have the 64bit psadbw sum, we could probably do a little more */
+# define VNMAX (6*NMAX)
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 32
+
+/* ========================================================================= */
+local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount)
+{
+    switch(amount % SOVUCQ)
+    {
+    case 0: return a;
+    case 1: return vextq_u8(a, b, 1);
+    case 2: return vextq_u8(a, b, 2);
+    case 3: return vextq_u8(a, b, 3);
+    case 4: return vextq_u8(a, b, 4);
+    case 5: return vextq_u8(a, b, 5);
+    case 6: return vextq_u8(a, b, 6);
+    case 7: return vextq_u8(a, b, 7);
+    case 8: return vextq_u8(a, b, 8);
+    case 9: return vextq_u8(a, b, 9);
+    case 10: return vextq_u8(a, b, 10);
+    case 11: return vextq_u8(a, b, 11);
+    case 12: return vextq_u8(a, b, 12);
+    case 13: return vextq_u8(a, b, 13);
+    case 14: return vextq_u8(a, b, 14);
+    case 15: return vextq_u8(a, b, 15);
+    }
+    return b;
+}
+
+/* ========================================================================= */
+local inline uint32x4_t vector_reduce(uint32x4_t x)
+{
+    uint32x4_t y;
+
+    y = vshlq_n_u32(x, 16);
+    x = vshrq_n_u32(x, 16);
+    y = vshrq_n_u32(y, 16);
+    y = vsubq_u32(y, x);
+    x = vaddq_u32(y, vshlq_n_u32(x, 4));
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    uint32x4_t v0_32 = (uint32x4_t){0,0,0,0};
+    uint8x16_t v0 = (uint8x16_t)v0_32;
+    uint8x16_t vord, vord_a;
+    uint32x4_t vs1, vs2;
+    uint32x2_t v_tsum;
+    uint8x16_t in16;
+    uint32_t s1, s2;
+    unsigned k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+// TODO: big endian mask is probably wrong
+    if (host_is_bigendian())
+        vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
+    else
+        vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+
+    if (likely(len >= 2*SOVUCQ)) {
+        unsigned f, n;
+
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ);
+        n = SOVUCQ - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUCQ);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+        /*
+         * the accumulation of s1 for every round grows very fast
+         * (quadratic?), even if we accumulate in 4 dwords; more
+         * rounds mean nonlinear growth.
+         * We already split it out of s2; normally it would be in
+         * s2 times 16... and grow even faster.
+         * Thanks to this split and vector reduction, we can stay
+         * longer in the loops. But we have to prepare for the worst
+         * (all 0xff), so we only do 6 times the work.
+         * (we could probably stay a little longer, since we have 4 sums,
+         * not 2 like on x86).
+         */
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+        /* insert scalar start somewhere */
+        vs1 = vsetq_lane_u32(s1, vs1, 0);
+        vs2 = vsetq_lane_u32(s2, vs2, 0);
+
+        /* get input data */
+        in16 = *(const uint8x16_t *)buf;
+        /* mask out excess data */
+        if(host_is_bigendian()) {
+            in16 = neon_simple_alignq(v0, in16, n);
+            vord_a = neon_simple_alignq(v0, vord, n);
+        } else {
+            in16 = neon_simple_alignq(in16, v0, f);
+            vord_a = neon_simple_alignq(vord, v0, f);
+        }
+
+        /* pairwise add bytes and long, pairwise add word long acc */
+        vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+        /* apply order, add words, pairwise add word long acc */
+        vs2 = vpadalq_u16(vs2,
+            vmlal_u8(
+                vmull_u8(vget_low_u8(in16), vget_low_u8(vord_a)),
+                vget_high_u8(in16), vget_high_u8(vord_a)
+            )
+        );
+
+        buf += SOVUCQ;
+        k -= n;
+
+        if (likely(k >= SOVUCQ)) do {
+            uint32x4_t vs1_r = v0_32;
+            do {
+                /* add vs1 for this round */
+                vs1_r = vaddq_u32(vs1_r, vs1);
+
+                /* get input data */
+                in16 = *(const uint8x16_t *)buf;
+
+// TODO: make work in inner loop more tight
+                /*
+                 * decompose partial sums, so we do fewer instructions, and
+                 * build loops around it to do acc and so on only from time
+                 * to time.
+                 * This is hard with NEON, because the instructions are nice:
+                 * we have the stuff in widening and with acc (practically
+                 * for free...)
+                 */
+                /* pairwise add bytes and long, pairwise add word long acc */
+                vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+                /* apply order, add words, pairwise add word long acc */
+                vs2 = vpadalq_u16(vs2,
+                    vmlal_u8(
+                        vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
+                        vget_high_u8(in16), vget_high_u8(vord)
+                    )
+                );
+
+                buf += SOVUCQ;
+                k -= SOVUCQ;
+            } while (k >= SOVUCQ);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add vs1 for this round (16 times) */
+            /* they have shift right and accumulate, where is shift left and acc?? */
+            vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_r, 4));
+            /* reduce both vectors to something within 16 bit */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned)len : VNMAX;
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            /*
+             * handle trailer
+             */
+            f = SOVUCQ - k;
+            /* add k times vs1 for this trailer */
+            vs2 = vmlaq_u32(vs2, vs1, vdupq_n_u32(k));
+
+            /* get input data */
+            in16 = *(const uint8x16_t *)buf;
+            /* mask out bad data */
+            if(host_is_bigendian())
+                in16 = neon_simple_alignq(in16, v0, f);
+            else
+                in16 = neon_simple_alignq(v0, in16, k);
+
+            /* pairwise add bytes and long, pairwise add word long acc */
+            vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+            /* apply order, add words, pairwise add word long acc */
+            vs2 = vpadalq_u16(vs2,
+                vmlal_u8(
+                    vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
+                    vget_high_u8(in16), vget_high_u8(vord)
+                )
+            );
+
+            buf += k;
+            k -= k;
+        }
+
+        /* add horizontal */
+        v_tsum = vpadd_u32(vget_high_u32(vs1), vget_low_u32(vs1));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s1 = vget_lane_u32(v_tsum, 0);
+        v_tsum = vpadd_u32(vget_high_u32(vs2), vget_low_u32(vs2));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s2 = vget_lane_u32(v_tsum, 0);
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce_x(s1);
+    reduce_x(s2);
+
+    return (s2 << 16) | s1;
+}
+
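The NEON path gets its byte sums through a widening reduction tree: vpaddlq_u8 pairwise-adds 16 bytes into 8 halfwords, vpadalq_u16 folds those into 4 word lanes with accumulation, and the final vpadd collapses the lanes. A scalar model of that chain (ours, illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Model of the NEON widening reduction: summing the four
     * accumulator lanes afterwards must equal the plain byte sum. */
    static uint32_t pairwise_sum_model(const uint8_t in[16], uint32_t acc[4])
    {
        uint16_t half[8];
        uint32_t total = 0;
        int i;

        for (i = 0; i < 8; i++)                    /* vpaddlq_u8  */
            half[i] = (uint16_t)(in[2*i] + in[2*i + 1]);
        for (i = 0; i < 4; i++)                    /* vpadalq_u16 */
            acc[i] += (uint32_t)half[2*i] + half[2*i + 1];
        for (i = 0; i < 4; i++)                    /* final vpadd */
            total += acc[i];
        return total;
    }

    int main(void)
    {
        uint8_t in[16];
        uint32_t acc[4] = {0, 0, 0, 0};
        uint32_t plain = 0;
        int i;

        for (i = 0; i < 16; i++) {
            in[i] = (uint8_t)(200 + i); /* large bytes on purpose */
            plain += in[i];
        }
        assert(pairwise_sum_model(in, acc) == plain);
        puts("pairwise widening sum == byte sum");
        return 0;
    }
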
+/* inline asm, so only on GCC (or compatible) && ARM v6 or better */
+#elif defined(__GNUC__) && ( \
+    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+    defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
+    defined(__ARM_ARCH_7A__) \
+    )
+# define SOU32 (sizeof(unsigned int))
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 16
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    k = ALIGN_DIFF(buf, SOU32);
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+
+    if (likely(len >= 4 * SOU32)) {
+        unsigned int vs1 = s1, vs2 = s2;
+        unsigned int order_lo, order_hi;
+
+// TODO: byte order?
+        if(host_is_bigendian()) {
+            order_lo = 0x00030001;
+            order_hi = 0x00040002;
+        } else {
+            order_lo = 0x00020004;
+            order_hi = 0x00010003;
+        }
+// TODO: we could go over NMAX, since we have split the vs2 sum
+        /* something around (NMAX+(NMAX/3)+302) */
+        k = len < NMAX ? len : NMAX;
+        len -= k;
+
+        do {
+            unsigned int vs1_r = 0;
+            do {
+                unsigned int t21, t22, in;
+
+                /* get input data */
+                in = *(const unsigned int *)buf;
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* add horizontal and acc */
+                asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1));
+                /* widen bytes to words, apply order, add and acc */
+                asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in));
+                asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in));
+// TODO: instruction result latency
+                /*
+                 * The same problem as with the classic serial sum:
+                 * Chip makers sell us 1-cycle instructions, but that is not the
+                 * whole story. Nearly all 1-cycle chips are pipelined, so
+                 * you can get one result per cycle, but only if _they_ (plural)
+                 * are independent.
+                 * If you depend on the result of a preceding instruction,
+                 * in the worst case you hit the instruction latency, which is
+                 * worst case >= pipeline length. On the other hand there are
+                 * result-fast-paths. This could all be a wash with the classic
+                 * sum (4 * 2 instructions, + dependency), since smlad is:
+                 * - 2 cycle issue
+                 * - needs the acc in pipeline step E1, instead of E2
+                 * But the Cortex has a fastpath for acc. I don't know.
+                 * We cannot even unroll; we would need 4 order vars, return
+                 * ENOREGISTER.
+                 */
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21), "r" (order_lo), "r" (vs2));
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22), "r" (order_hi), "r" (vs2));
+
+                buf += SOU32;
+                k -= SOU32;
+            } while (k >= SOU32);
+            /* reduce vs1 round sum before multiplying by 4 */
+            reduce(vs1_r);
+            /* add vs1 for this round (4 times) */
+            vs2 += vs1_r * 4;
+            /* reduce both sums to something within 16 bit */
+            reduce(vs2);
+            reduce(vs1);
+            len += k;
+            k = len < NMAX ? len : NMAX;
+            len -= k;
+        } while (likely(k >= 4 * SOU32));
+        len += k;
+        s1 = vs1;
+        s2 = vs2;
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    /* at this point s1 & s2 should not be that big anymore */
+    reduce_x(s1);
+    reduce_x(s2);
+
+    return (s2 << 16) | s1;
+}
+#endif

++++++ 04-x86.patch ++++++
++++ 1165 lines (skipped)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Remember to have fun...

--
To unsubscribe, e-mail: opensuse-commit+unsubscribe@opensuse.org
For additional commands, e-mail: opensuse-commit+help@opensuse.org