Hello community,
here is the log from the commit of package dd_rescue for openSUSE:Factory checked in at 2013-08-04 07:28:46
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/dd_rescue (Old)
and /work/SRC/openSUSE:Factory/.dd_rescue.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "dd_rescue"
Changes:
--------
--- /work/SRC/openSUSE:Factory/dd_rescue/dd_rescue.changes 2013-07-25 20:27:32.000000000 +0200
+++ /work/SRC/openSUSE:Factory/.dd_rescue.new/dd_rescue.changes 2013-08-04 07:28:48.000000000 +0200
@@ -1,0 +2,18 @@
+Fri Aug 2 22:05:52 CEST 2013 - kurt@garloff.de
+
+- Update to dd_rescue-1.38:
+ * Further optimized SSE2 sparse detection. (Also added AVX2
+ version, not enabled by default though.)
+ * --force allows ignoring a non-seekable output with a non-zero
+ output position.
+ * make check does some testing ...
+ * improved cur.rate and ETA calculation.
+
+-------------------------------------------------------------------
+Thu Aug 1 22:02:16 CEST 2013 - kurt@garloff.de
+
+- Update to dd_rescue-1.37:
+ * Important bugfix for SSE2 sparse detection.
+ * Fix exact zero-length on big endian machines (irrelevant).
+
+-------------------------------------------------------------------
Old:
----
dd_rescue-1.36.tar.gz
New:
----
dd_rescue-1.38.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ dd_rescue.spec ++++++
--- /var/tmp/diff_new_pack.LTvZx7/_old 2013-08-04 07:28:49.000000000 +0200
+++ /var/tmp/diff_new_pack.LTvZx7/_new 2013-08-04 07:28:49.000000000 +0200
@@ -17,7 +17,7 @@
Name: dd_rescue
-Version: 1.36
+Version: 1.38
Release: 0
Summary: Data Copying in the Presence of I/O Errors
License: GPL-2.0 or GPL-3.0
@@ -63,6 +63,9 @@
ln -sf %{_bindir}/dd_rescue %{buildroot}/bin
#EndUsrMerge
+%check
+make check
+
%files
%defattr(-,root,root,-)
%doc COPYING README.dd_rescue
++++++ dd_rescue-1.36.tar.gz -> dd_rescue-1.38.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/Makefile new/dd_rescue/Makefile
--- old/dd_rescue/Makefile 2013-07-24 00:38:20.000000000 +0200
+++ new/dd_rescue/Makefile 2013-08-02 13:33:21.000000000 +0200
@@ -1,8 +1,8 @@
# Makefile for dd_rescue
# (c) garloff@suse.de, 99/10/09, GNU GPL
-# $Id: Makefile,v 1.62 2013/07/23 22:38:20 garloff Exp $
+# $Id: Makefile,v 1.70 2013/08/02 10:23:12 garloff Exp $
-VERSION = 1.36
+VERSION = 1.38
DESTDIR =
@@ -53,6 +53,9 @@
find_nonzero.o: find_nonzero.c find_nonzero.h
$(CC) $(CFLAGS_OPT) -c $< $(SSE)
+find_nonzero_avx.o: find_nonzero_avx.c find_nonzero.h
+ $(CC) $(CFLAGS_OPT) -mavx2 -c $<
+
libfalloc: dd_rescue.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -DHAVE_LIBFALLOCATE=1 $(DEFINES) $< $(OUT) $(OBJECTS) -lfallocate
@@ -75,14 +78,19 @@
strip -S $<
clean:
- rm -f $(TARGETS) $(OBJECTS) dd_rescue.o core test log find_nonzero fmt_no
+ rm -f $(TARGETS) $(OBJECTS) dd_rescue.o core test log find_nonzero fmt_no file_zblock find_nonzero_avx.o find_nonzero_avx
find_nonzero: find_nonzero.c find_nonzero.h
$(CC) $(CFLAGS_OPT) -o $@ $< -DTEST $(SSE)
+find_nonzero_avx: find_nonzero.c find_nonzero.h find_nonzero_avx.o
+ $(CC) $(CFLAGS_OPT) -o $@ $< -DHAVE_AVX2 -DTEST $(SSE) find_nonzero_avx.o
+
fmt_no: fmt_no.c fmt_no.h
$(CC) $(CFLAGS) -o $@ $< -DTEST
+file_zblock: file_zblock.c find_nonzero.h find_nonzero.c find_nonzero.o
+ $(CC) $(CFLAGS) -o $@ $< find_nonzero.o
distclean: clean
rm -f *~
@@ -99,3 +107,9 @@
$(INSTALL) $(INSTASROOT) -m 644 dd_rescue.1 $(MANDIR)/man1/
gzip -9 $(MANDIR)/man1/dd_rescue.1
+check: $(TARGETS) find_nonzero
+ ./dd_rescue -apP dd_rescue dd_rescue.copy
+ cmp dd_rescue dd_rescue.copy
+ ./find_nonzero 2
+ rm dd_rescue.copy
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/dd_rescue.1 new/dd_rescue/dd_rescue.1
--- old/dd_rescue/dd_rescue.1 2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/dd_rescue.1 2013-07-26 00:00:15.000000000 +0200
@@ -1,4 +1,4 @@
-.\" $Id: dd_rescue.1,v 1.12 2013/07/16 09:53:50 garloff Exp $
+.\" $Id: dd_rescue.1,v 1.13 2013/07/25 11:57:34 garloff Exp $
.
.TH dd_rescue 1 "2013-02-24" "Kurt Garloff" "Rescue copy tool"
.
@@ -359,7 +359,7 @@
.TP 8
.BR \-d ", " \-\-odir_in
instructs
-.b dd_rescue
+.B dd_rescue
to open
.IR infie
with O_DIRECT, bypassing the kernel buffers. While this option has a negative
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/dd_rescue.c new/dd_rescue/dd_rescue.c
--- old/dd_rescue/dd_rescue.c 2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/dd_rescue.c 2013-08-02 00:16:39.000000000 +0200
@@ -45,7 +45,7 @@
# define __COMPILER__ "(unknown compiler)"
#endif
-#define ID "$Id: dd_rescue.c,v 1.214 2013/07/23 22:28:55 garloff Exp $"
+#define ID "$Id: dd_rescue.c,v 1.218 2013/08/01 22:16:39 garloff Exp $"
#ifndef BUF_SOFTBLOCKSIZE
# define BUF_SOFTBLOCKSIZE 65536
@@ -146,7 +146,8 @@
char identical, preserve, falloc, dosplice;
char i_chr, o_chr;
char i_repeat, i_rep_init;
-int i_rep_zero, prng_seed;
+size_t i_rep_zero;
+int prng_seed;
char noextend, avoidwrite;
char prng_libc, prng_frnd;
char bsim715, bsim715_4, bsim715_2, bsim715_2ndpass;
@@ -562,12 +563,21 @@
}
#endif
+float floatrate4 = 0.0;
+float floatrate32 = 0.0;
void doprint(FILE* const file, const unsigned int bs, const clock_t cl,
const float t1, const float t2, const int sync)
{
float avgrate = (float)xfer/t1;
float currate = (float)(xfer-lxfer)/t2;
const char *bold = BOLD, *norm = NORM;
+ if (!floatrate4) {
+ floatrate4 = currate;
+ floatrate32 = currate;
+ } else {
+ floatrate4 = (floatrate4 * 3 + currate)/ 4;
+ floatrate32 = (floatrate32*31 + currate)/32;
+ }
if (nocol || (file != stderr && file != stdout)) {
bold = ""; norm = "";
}
@@ -581,7 +591,7 @@
fmt_int(10, 1, 1024, sxfer, bold, norm, 1));
if (sync || (file != stdin && file != stdout) )
fprintf(file, " +curr.rate:%skB/s, avg.rate:%skB/s, avg.load:%s%%\n",
- fmt_int(9, 0, 1024, currate, bold, norm, 1),
+ fmt_int(9, 0, 1024, floatrate4, bold, norm, 1),
fmt_int(9, 0, 1024, avgrate, bold, norm, 1),
fmt_int(3, 1, 10, (cl-startclock)/(t1*(CLOCKS_PER_SEC/1000)), bold, norm, 1));
else
@@ -594,7 +604,7 @@
if (in_report)
sec = 0.5 + t1;
else
- sec = 0.5 + (estxfer-xfer)/avgrate;
+ sec = 0.5 + 2*(estxfer-xfer)/(avgrate+floatrate32);
int hour = sec / 3600;
int min = (sec % 3600) / 60;
sec = sec % 60;
@@ -844,7 +854,7 @@
}
/** is the block zero ? */
-static int blockiszero(const unsigned char* blk, const size_t ln)
+static ssize_t blockiszero(const unsigned char* blk, const size_t ln)
{
if (i_repeat && i_rep_zero)
return i_rep_zero;
@@ -2054,8 +2064,12 @@
}
if (o_chr && opos != 0) {
- fplog(stderr, FATAL, "outfile not seekable, but opos !=0 requested!\n");
- cleanup(); exit(19);
+ if (force)
+ fplog(stderr, WARN, "ignore non-seekable output with opos != 0 due to --force\n");
+ else {
+ fplog(stderr, FATAL, "outfile not seekable, but opos !=0 requested!\n");
+ cleanup(); exit(19);
+ }
}
if (i_chr && ipos != 0) {
fplog(stderr, FATAL, "infile not seekable, but ipos !=0 requested!\n");
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/file_zblock.c new/dd_rescue/file_zblock.c
--- old/dd_rescue/file_zblock.c 1970-01-01 01:00:00.000000000 +0100
+++ new/dd_rescue/file_zblock.c 2013-08-01 23:22:05.000000000 +0200
@@ -0,0 +1,56 @@
+#define _GNU_SOURCE 1
+#include
+#include
+#include
+#include
+#include
+
+#include "find_nonzero.h"
+
+#define BUFSZ (64*1024)
+unsigned char buf[BUFSZ];
+
+void usage()
+{
+ fprintf(stderr, "Usage: file_zblock FILE1 [FILE2 [FILE3 [...]]]\n");
+ fprintf(stderr, "file_zblock reports files with ()at least) chunk-sized blocks of zeros inside.\n");
+ exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+ int zf = 0;
+ int chunksz = 4096;
+ int i = 1, off;
+ if (argc < 2)
+ usage();
+ if (!memcmp(argv[1], "-c", 2)) {
+ if (strlen(argv[1]) > 2) {
+ chunksz = atoi(argv[1]+2);
+ ++i;
+ } else {
+ chunksz = atoi(argv[2]);
+ i += 2;
+ }
+ }
+ for (; i < argc; ++i) {
+ int fd = open(argv[i], O_RDONLY);
+ if (fd<0) {
+ fprintf(stderr, "ERROR opening file %s: %s\n", argv[i], strerror(errno));
+ continue;
+ }
+ int rd, found = 0;
+ while ((rd = read(fd, buf, BUFSZ)) > 0 && !found) {
+ for (off = 0; off < rd; off += chunksz) {
+ unsigned int tocheck = rd-off > chunksz? chunksz: rd-off;
+ if (find_nonzero(buf+off, tocheck) == tocheck) {
+ ++found; ++zf;
+ printf("%s,%i\n", argv[i], off);
+ break;
+ }
+ }
+ }
+ close(fd);
+ }
+ return zf;
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/find_nonzero.c new/dd_rescue/find_nonzero.c
--- old/dd_rescue/find_nonzero.c 2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/find_nonzero.c 2013-08-02 13:33:21.000000000 +0200
@@ -7,7 +7,8 @@
#define IN_FINDZERO
#include "find_nonzero.h"
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(TEST) && (defined(__i386__) || defined(__x86_64__))
+/** Just for testing the speed of the good old x86 string instructions */
size_t find_nonzero_rep(const unsigned char* blk, const size_t ln)
{
unsigned long register res;
@@ -30,15 +31,58 @@
#ifdef __SSE2__
#include <emmintrin.h>
-size_t find_nonzero_simd(const unsigned char* blk, const size_t ln)
+#ifdef TEST
+/** SSE2 version for measuring the initial zero bytes of aligned blk */
+size_t find_nonzero_sse2o(const unsigned char* blk, const size_t ln)
{
- __m128i xmm, zero = _mm_setzero_si128();
- unsigned /*long*/ register eax;
+ __m128i register xmm;
+ const __m128i register zero = _mm_setzero_si128();
+#ifdef SIMD_XOR
+ const __m128i register mask = _mm_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
+#endif
+ unsigned register eax;
size_t i = 0;
+ //asm(".align 32");
for (; i < ln; i+= 16) {
xmm = _mm_load_si128((__m128i*)(blk+i));
+#ifdef BUGGY_136
_mm_cmpeq_epi8(xmm, zero);
eax = _mm_movemask_epi8(xmm);
+#else
+ xmm = _mm_cmpeq_epi8(xmm, zero);
+#ifdef SIMD_XOR
+ xmm = _mm_xor_si128(xmm, mask);
+#endif
+ eax = _mm_movemask_epi8(xmm);
+#endif /* BUGGY **/
+#if defined(SIMD_XOR) || defined(BUGGY_136)
+ if (eax)
+ return i + myffs(eax)-1;
+#else
+ if (eax != 0xffff)
+ return i + myffs(eax^0xffff)-1;
+#endif
+ }
+ return ln;
+}
+#endif
+
+/** SSE2 version for measuring the initial zero bytes of 16b aligned blk */
+size_t find_nonzero_sse2(const unsigned char* blk, const size_t ln)
+{
+ register __m128i xmm0, xmm1;
+ register const __m128i zero = _mm_setzero_si128();
+ register unsigned int eax, ebx;
+ size_t i = 0;
+ //asm(".p2align 5");
+ for (; i < ln; i+= 32) {
+ //xmm0 = _mm_load_si128((__m128i*)(blk+i));
+ //xmm1 = _mm_load_si128((__m128i*)(blk+i+16));
+ xmm0 = _mm_cmpeq_epi8(*(__m128i*)(blk+i), zero);
+ xmm1 = _mm_cmpeq_epi8(*(__m128i*)(blk+i+16), zero);
+ eax = _mm_movemask_epi8(xmm0);
+ ebx = _mm_movemask_epi8(xmm1);
+ eax = ~(eax | (ebx << 16));
if (eax)
return i + myffs(eax)-1;
}
@@ -46,11 +90,12 @@
}
#ifdef NEED_SIMD_RUNTIME_DETECTION
+/** Issue an SSE2 insn for runtime detection of SSE2 capability (x86) */
+volatile __m128d _probe_xmm;
void probe_simd()
{
- volatile __m128d xmm;
double val = 3.14159265358979323844;
- xmm = _mm_set_sd(val);
+ _probe_xmm = _mm_set_sd(val);
}
#endif /* NEED_SIMD_RUNTIME_DETECTION */
@@ -58,13 +103,15 @@
#if defined(__arm__)
-/* Inspired by Linaro's strlen() implementation;
- we don't even need NEON here, ldmia does the 3x speedup on A-9 */
-size_t find_nonzero_simd(const unsigned char *blk, const size_t ln)
+/** ASM optimized version for ARM.
+ * Inspired by Linaro's strlen() implementation;
+ * we don't even need NEON here, ldmia does the 3x speedup on Cortexes */
+size_t find_nonzero_arm6(const unsigned char *blk, const size_t ln)
{
register unsigned char* res;
const register unsigned char* end = blk+ln;
asm volatile(
+ //".align 4 \n"
"1: \n"
" ldmia %0!,{r2,r3} \n"
" cmp r2, #0 \n"
@@ -80,7 +127,8 @@
" mov r3, r2 \n"
"3: \n"
" sub %0, #4 \n"
-#ifndef __ARMEB__ /* Little endian bitmasks */
+//#ifndef __ARMEB__ /* Little endian bitmasks */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
" tst r3, #0xff \n"
" bne 10f \n"
" add %0, #1 \n"
@@ -105,10 +153,10 @@
: "r2", "r3");
return res-blk;
}
+#define find_nonzero_simd find_nonzero_arm6
#endif
-
#ifdef TEST
#include
#include
@@ -129,13 +177,41 @@
gettimeofday(&t2, NULL); \
tdiff = t2.tv_sec-t1.tv_sec + 0.000001*(t2.tv_usec-t1.tv_usec); \
printf("%7i x %20s (%8i): %8i (%6.3fs => %5.0fMB/s)\n", \
- rep, #routine, sz, ln, tdiff, (double)(rep)*(double)(sz)/(1024*1024*tdiff))
+ rep, #routine, sz, ln, tdiff, (double)(rep)*(double)(sz+1)/(1024*1024*tdiff)); \
+ if (ln != (tsz %5.0fMB/s)\n", \
+ rep, #routine, sz, ln, tdiff, (double)(rep)*(double)(sz+1)/(1024*1024*tdiff)); \
+ if (ln != (tsz 1)
scale = atoi(argv[1]);
- buf -= (unsigned long)buf%16;
memset(buf, 0xa5, SIZE);
- TESTC(0, find_nonzero_c, 1024*256*scale/16, SIZE);
- TEST_SIMD(0, find_nonzero_simd, 1024*256*scale/16, SIZE);
- TESTC(0, find_nonzero, 1024*256*scale/16, SIZE);
- TEST_REP(0, find_nonzero_rep, 1024*256*scale/16, SIZE);
+ TESTC(0, find_nonzero_c, 1024*1024*scale/16, SIZE);
+ TEST_SIMD(0, find_nonzero_simd, 1024*1024*scale/16, SIZE);
+ TESTC(0, find_nonzero, 1024*1024*scale/16, SIZE);
+ TEST_REP(0, find_nonzero_rep, 1024*1024*scale/16, SIZE);
TESTC(8*1024-15, find_nonzero_c, 1024*256*scale/16, SIZE);
TEST_SIMD(8*1024-15, find_nonzero_simd, 1024*256*scale/16, SIZE);
@@ -175,6 +265,7 @@
buf--;
TESTC(32*1024-9, find_nonzero_c, 1024*64*scale/16, SIZE);
TEST_SIMD(32*1024-9, find_nonzero_simd, 1024*64*scale/16, SIZE);
+ TEST_SIMD2(32*1024-9, find_nonzero_sse2o, 1024*64*scale/16, SIZE);
TESTC(32*1024-9, find_nonzero, 1024*64*scale/16, SIZE);
TEST_REP(32*1024-9, find_nonzero_rep, 1024*64*scale/16, SIZE);
TESTC(128*1024-8, find_nonzero_c, 1024*16*scale/16, SIZE);
@@ -190,15 +281,19 @@
TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE);
TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE);
- TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE-16);
- TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE-16);
- TESTC(64*1024*1024, find_nonzero, 32*scale/16, SIZE-16);
- TEST_REP(64*1024*1024, find_nonzero_rep, 32*scale/16, SIZE-16);
-
- TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE-5);
- TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE-5);
- TESTC(64*1024*1024, find_nonzero, 32*scale/16, SIZE-5);
- TEST_REP(64*1024*1024, find_nonzero_rep, 32*scale/16, SIZE-5);
+ TESTC(64*1024*1024, find_nonzero_c, 1+scale/16, SIZE-16);
+ TEST_SIMD(64*1024*1024, find_nonzero_simd, 1+scale/16, SIZE-16);
+ TESTC(64*1024*1024, find_nonzero, 1+scale/16, SIZE-16);
+ TEST_REP(64*1024*1024, find_nonzero_rep, 1+scale/16, SIZE-16);
+
+ TESTC(64*1024*1024, find_nonzero_c, 1+scale/16, SIZE-5);
+ TEST_SIMD(64*1024*1024, find_nonzero_simd, 1+scale/16, SIZE-5);
+ TESTC(64*1024*1024, find_nonzero, 1+scale/16, SIZE-5);
+ TEST_REP(64*1024*1024, find_nonzero_rep, 1+scale/16, SIZE-5);
+
+ TEST2C(12*1024*1024, find_nonzero_c, 160*scale/16, SIZE);
+ TEST2_SIMD(12*1024*1024, find_nonzero_simd, 160*scale/16, SIZE);
+ TEST2_SIMD2(12*1024*1024, find_nonzero_sse2o, 160*scale/16, SIZE);
free(obuf);
return 0;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/find_nonzero.h new/dd_rescue/find_nonzero.h
--- old/dd_rescue/find_nonzero.h 2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/find_nonzero.h 2013-08-02 13:33:21.000000000 +0200
@@ -23,17 +23,65 @@
# else
# define myffsl(x) _mm_popcnt_u32(x^(~(-x)))
# endif
-#else
+#else /* NOFFS */
# define myffsl(x) myffs(x)
+/** Find first (lowest) bit set in word val, returns a val b/w 1 and __WORDSIZE, 0 if no bit is set */
static inline int myffsl(unsigned long val)
{
- int i;
- for (i = 1; i <= sizeof(val)*8; ++i) {
- if (val & 0x01)
- return i;
- val >>= 1;
+ int res = 1;
+ if (!val)
+ return 0;
+#if __WORDSIZE == 64
+ unsigned int vlo = val;
+ unsigned int vhi = val >> 32;
+ if (!vlo) {
+ res += 32;
+ vlo = vhi;
+ }
+#else
+ unsigned int vlo = val;
+#endif
+ unsigned int mask = 0x0000ffff;
+ unsigned int shift = 16;
+ while (shift > 0) {
+ if (!(vlo & mask)) {
+ res += shift;
+ vlo >>= shift;
+ }
+ shift >>= 1;
+ mask >>= shift;
}
- return 0;
+ return res;
+}
+#endif
+#if __BYTE_ORDER == __BIG_ENDIAN || defined(TEST)
+/** Find last (highest) bit set in word val, returns a val b/w __WORDSIZE and 1, 0 if no bit is set */
+static inline int myflsl(unsigned long val)
+{
+ int res = __WORDSIZE;
+ if (!val)
+ return 0;
+#if __WORDSIZE == 64
+ unsigned int vlo = val;
+ unsigned int vhi = val >> 32;
+ if (!vhi) {
+ res -= 32;
+ vhi = vlo;
+ }
+#else
+ unsigned int vhi = val;
+#endif
+ unsigned int mask = 0xffff0000;
+ unsigned int shift = 16;
+ while (shift > 0) {
+ if (!(vhi & mask)) {
+ res -= shift;
+ vhi <<= shift;
+ }
+ shift >>= 1;
+ mask <<= shift;
+ }
+ return res;
}
#endif
@@ -78,6 +126,14 @@
#if defined(HAVE_SSE2) || defined(__arm__)
#define HAVE_SIMD
+#ifdef HAVE_AVX2
+#define find_nonzero_simd find_nonzero_avx2
+#elif defined(HAVE_SSE2)
+#define find_nonzero_simd find_nonzero_sse2
+#elif defined(__arm__)
+#define find_nonzero_simd find_nonzero_arm6
+#endif
+
/* FIXME: Is there no library function to find the first non-null byte?
* Something like ffs() for a long byte array?
* Here is an optimized version using SSE2 intrinsics, but there should be
@@ -90,29 +146,39 @@
/* No need for runtime detection here */
const static char have_simd = 0;
#endif
-/** return length of zero bytes */
+/** return number of bytes at beginning of blk that are all zero, assumes __WORDSIZE bit alignment */
static size_t find_nonzero_c(const unsigned char* blk, const size_t ln)
{
const unsigned long* ptr = (const unsigned long*)blk;
const unsigned long* const bptr = ptr;
for (; (size_t)(ptr-bptr) < ln/sizeof(*ptr); ++ptr)
if (*ptr)
+#if __BYTE_ORDER == __BIG_ENDIAN
+ return sizeof(unsigned long)*(ptr-bptr) + sizeof(long)-((myflsl(*ptr)+7)>>3);
+#else
return sizeof(unsigned long)*(ptr-bptr) + ((myffsl(*ptr)-1)>>3);
+#endif
return ln;
}
-/* Generic version, does not require an aligned buffer blk */
+/** return number of bytes at beginning of blk that are all zero
+ * Generic version, does not require an aligned buffer blk or even ln ... */
inline static size_t find_nonzero(const unsigned char* blk, const size_t ln)
{
- const int off = ((unsigned long)blk) % 16;
- if (off) {
- int i;
- for (i = 0; i < 16-off; ++i)
- if (blk[i])
- return i;
- return i+find_nonzero_opt(blk+i, ln-i);
- } else
- return find_nonzero_opt(blk, ln);
+ const int off = (-(unsigned char)(unsigned long)blk) & 0x1f;
+ size_t remain = ln - off;
+ size_t i;
+ for (i = 0; i < off; ++i)
+ if (blk[i])
+ return i;
+ int r2 = remain % 0x1f;
+ size_t res = find_nonzero_opt(blk+off, remain-r2);
+ if (!r2 || res != remain-r2)
+ return off+res;
+ for (i = off+remain; i < ln; ++i)
+ if (blk[i])
+ return i;
+ return ln;
}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/find_nonzero_avx.c new/dd_rescue/find_nonzero_avx.c
--- old/dd_rescue/find_nonzero_avx.c 1970-01-01 01:00:00.000000000 +0100
+++ new/dd_rescue/find_nonzero_avx.c 2013-08-02 13:37:31.000000000 +0200
@@ -0,0 +1,41 @@
+/** find_nonzero_avx.c
+ * AVX2 optimized search for non-zero bytes
+ * taken straight from SSE2 and adapted to use AVX registers
+ * Needs recent (2.23+) binutils to compile ...
+ * (c) Kurt Garloff <kurt@garloff.de>, 2013
+ * License: GNU GPL v2 or v3
+ */
+
+#define _GNU_SOURCE 1
+#include "find_nonzero.h"
+size_t find_nonzero_sse2(const unsigned char* blk, const size_t ln);
+
+#ifdef __AVX2__
+#if defined(__GNUC__) || defined(__llvm__)
+# warning AVX2 version untested and runtime detection only with gcc 4.8+
+#endif
+#include <immintrin.h>
+/** AVX2 version for measuring the initial zero bytes of 32b aligned blk */
+size_t find_nonzero_avx2(const unsigned char* blk, const size_t ln)
+{
+#if defined( __GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+ if (!(__builtin_cpu_supports("avx2")))
+ return find_nonzero_sse2(blk, ln);
+#endif
+ __m256i register ymm;
+ const __m256i register zero = _mm256_setzero_si256();
+ unsigned register eax;
+ size_t i = 0;
+ //asm(".p2align 5");
+ for (; i < ln; i+= 32) {
+ //ymm = _mm256_load_si256((__m256i*)(blk+i));
+ ymm = _mm256_cmpeq_epi8(*(__m256i*)(blk+i), zero);
+ eax = ~(_mm256_movemask_epi8(ymm));
+ if (eax)
+ return i + myffs(eax)-1;
+ }
+ return ln;
+}
+#endif
+
+
--
To unsubscribe, e-mail: opensuse-commit+unsubscribe@opensuse.org
For additional commands, e-mail: opensuse-commit+help@opensuse.org