Hello community, here is the log from the commit of package duperemove for openSUSE:Factory checked in at 2014-10-31 18:27:56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/duperemove (Old) and /work/SRC/openSUSE:Factory/.duperemove.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Package is "duperemove" Changes: -------- --- /work/SRC/openSUSE:Factory/duperemove/duperemove.changes 2014-09-30 19:42:03.000000000 +0200 +++ /work/SRC/openSUSE:Factory/.duperemove.new/duperemove.changes 2014-10-31 20:03:36.000000000 +0100 @@ -1,0 +2,11 @@ +Fri Oct 31 02:44:53 UTC 2014 - mfasheh@suse.com + +- Update to duperemove v0.09.beta2 + - fix memory leak + - fix hardlink detection on btrfs + - print file number status during csum phase + - print a status bar during extent seearch + - several bugfixes and performance improvements to extent search +- Removed patch: 001-fix-build.patch + +------------------------------------------------------------------- Old: ---- 001-fix-build.patch duperemove-v0.09.beta1.tar.gz New: ---- duperemove-v0.09.beta2.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ duperemove.spec ++++++ --- /var/tmp/diff_new_pack.49iyG2/_old 2014-10-31 20:03:37.000000000 +0100 +++ /var/tmp/diff_new_pack.49iyG2/_new 2014-10-31 20:03:37.000000000 +0100 @@ -17,13 +17,13 @@ %define modname duperemove -%define tar_version v0.09.beta1 +%define tar_version v0.09.beta2 Name: duperemove BuildRequires: gcc-c++ BuildRequires: glib2-devel BuildRequires: libgcrypt-devel -Version: 0.09~beta1 +Version: 0.09~beta2 Release: 0 Summary: Software to find duplicate extents in files and remove them License: GPL-2.0 @@ -31,7 +31,6 @@ Url: https://github.com/markfasheh/duperemove Source: %{modname}-%{tar_version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-build -Patch1: 001-fix-build.patch %description Duperemove finds duplicate extents in files and prints them to the @@ -48,7 +47,6 @@ %prep %setup -q -n %{modname}-%{tar_version} -%patch1 -p1 %build make ++++++ duperemove-v0.09.beta1.tar.gz -> duperemove-v0.09.beta2.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/Makefile new/duperemove-v0.09.beta2/Makefile --- old/duperemove-v0.09.beta1/Makefile 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/Makefile 2014-10-31 00:27:39.000000000 +0100 @@ -1,11 +1,11 @@ -RELEASE=v0.09.beta1 +RELEASE=v0.09.beta2 CC = gcc CFLAGS = -Wall -ggdb MANPAGES=duperemove.8 btrfs-extent-same.8 -DIST_SOURCES=csum-gcrypt.c csum-mhash.c csum.h duperemove.c hash-tree.c hash-tree.h results-tree.c results-tree.h kernel.h LICENSE list.h Makefile rbtree.c rbtree.h rbtree.txt README TODO dedupe.c dedupe.h btrfs-ioctl.h filerec.c filerec.h $(MANPAGES) btrfs-extent-same.c debug.h util.c util.h serialize.c serialize.h hashstats.c +DIST_SOURCES=csum-gcrypt.c csum-mhash.c csum.h duperemove.c hash-tree.c hash-tree.h results-tree.c results-tree.h kernel.h LICENSE list.h Makefile rbtree.c rbtree.h rbtree.txt README TODO dedupe.c dedupe.h btrfs-ioctl.h filerec.c filerec.h btrfs-util.c btrfs-util.h $(MANPAGES) btrfs-extent-same.c debug.h util.c util.h serialize.c serialize.h hashstats.c DIST=duperemove-$(RELEASE) DIST_TARBALL=$(DIST).tar.gz TEMP_INSTALL_DIR:=$(shell mktemp -du -p .) @@ -26,7 +26,7 @@ $(crypt_CFLAGS) $(glib_CFLAGS) LIBRARY_FLAGS += $(crypt_LIBS) $(glib_LIBS) -objects = duperemove.o rbtree.o hash-tree.o results-tree.o dedupe.o filerec.o util.o serialize.o $(hash_obj) +objects = duperemove.o rbtree.o hash-tree.o results-tree.o dedupe.o filerec.o util.o serialize.o btrfs-util.o $(hash_obj) progs = duperemove DESTDIR = / @@ -38,7 +38,7 @@ all: $(progs) kernel.h list.h btrfs-ioctl.h debug.h duperemove: $(objects) kernel.h duperemove.c - $(CC) $(LIBRARY_FLAGS) $(CFLAGS) $(objects) -o duperemove + $(CC) $(CFLAGS) $(objects) -o duperemove $(LIBRARY_FLAGS) tarball: clean mkdir -p $(TEMP_INSTALL_DIR)/$(DIST) @@ -60,14 +60,14 @@ done csum-test: $(hash_obj) csum-test.c - $(CC) $(LIBRARY_FLAGS) $(CFLAGS) $(hash_obj) -o csum-test csum-test.c + $(CC) $(CFLAGS) $(hash_obj) -o csum-test csum-test.c $(LIBRARY_FLAGS) filerec-test: filerec.c filerec.h rbtree.o - $(CC) $(LIBRARY_FLAGS) $(CFLAGS) -DFILEREC_TEST filerec.c rbtree.o -o filerec-test + $(CC) $(CFLAGS) -DFILEREC_TEST filerec.c rbtree.o -o filerec-test $(LIBRARY_FLAGS) hashstats_obj = $(hash_obj) rbtree.o hash-tree.o filerec.o util.o serialize.o results-tree.o hashstats: $(hashstats_obj) hashstats.c - $(CC) $(LIBRARY_FLAGS) $(CFLAGS) $(hashstats_obj) hashstats.c -o hashstats + $(CC) $(CFLAGS) $(hashstats_obj) hashstats.c -o hashstats $(LIBRARY_FLAGS) clean: rm -fr $(objects) $(progs) $(DIST_TARBALL) btrfs-extent-same filerec-test hashstats csum-*.o *~ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/TODO new/duperemove-v0.09.beta2/TODO --- old/duperemove-v0.09.beta1/TODO 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/TODO 2014-10-31 00:27:39.000000000 +0100 @@ -1,6 +1,3 @@ -- Inodes between subvolumes on a btrfs file system can have the same i_ino (hooray!). - We need to handle this, probably by recording subvolid on the filerec. - - Store results of our search to speed up subsequent runs. - When writing hashes, store latest btrfs transaction id in the hash file (import find-new from btrfsprogs for this) - When reading hashes and we have a transaction id, do a scan of btrfs objects to see which inodes have changed. @@ -14,6 +11,18 @@ - Add an optional mode to do duplicate checking with resolution of extent owners. +- Multi-threaded extent search. + - This needs locking and atomic updates to some fields, so it's not quite + as straight forward as threading the hash stage. A start would be to have + find_all_dups queue up appropriate dup_blocks_lists to worker threads. + The threads have to properly lock the filerec compared trees and results + trees. b_seen also needs to be read/written atomically. + +- csum_whole_file() still does a read/checksum of holes and unwritten + extents (even though we detect and mark them now). If we calculate and + store (in memory) the checksum of a zero'd block, we can skip the read and + copy our known value directly into the block digest member. + - Allow checking for similar files by some criteria (approximate size, file magic type, file extension, etc) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/btrfs-util.c new/duperemove-v0.09.beta2/btrfs-util.c --- old/duperemove-v0.09.beta1/btrfs-util.c 1970-01-01 01:00:00.000000000 +0100 +++ new/duperemove-v0.09.beta2/btrfs-util.c 2014-10-31 00:27:39.000000000 +0100 @@ -0,0 +1,74 @@ +/* + * btrfs-util.c + * + * Copyright (C) 2014 SUSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Some parts of this taken from btrfs-progs, which is also GPLv2 + */ + +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <sys/statfs.h> +#include <sys/ioctl.h> +#include <inttypes.h> +#include <errno.h> +#include <string.h> +#include <linux/magic.h> +#include <linux/btrfs.h> + +#include "btrfs-util.h" +#include "debug.h" + +/* For some reason linux/btrfs.h doesn't define this. */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL + +/* + * For a given: + * - file or directory return the containing tree root id + * - subvolume return its own tree id + * - BTRFS_EMPTY_SUBVOL_DIR_OBJECTID (directory with ino == 2) the result is + * undefined and function returns -1 + */ +int lookup_btrfs_subvolid(int fd, uint64_t *subvolid) +{ + int ret; + struct btrfs_ioctl_ino_lookup_args args; + + memset(&args, 0, sizeof(args)); + args.objectid = BTRFS_FIRST_FREE_OBJECTID; + + ret = ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args); + if (ret) + return errno; + + *subvolid = args.treeid; + + return 0; +} + +int check_file_btrfs(int fd, int *btrfs) +{ + int ret; + struct statfs fs; + + *btrfs = 0; + + ret = fstatfs(fd, &fs); + if (ret) + return errno; + + if (fs.f_type == BTRFS_SUPER_MAGIC) + *btrfs = 1; + + return ret; +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/btrfs-util.h new/duperemove-v0.09.beta2/btrfs-util.h --- old/duperemove-v0.09.beta1/btrfs-util.h 1970-01-01 01:00:00.000000000 +0100 +++ new/duperemove-v0.09.beta2/btrfs-util.h 2014-10-31 00:27:39.000000000 +0100 @@ -0,0 +1,22 @@ +/* + * btrfs-util.h + * + * Copyright (C) 2014 SUSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef __BTRFS_UTIL__ +#define __BTRFS_UTIL__ + +int check_file_btrfs(int fd, int *btrfs); +int lookup_btrfs_subvolid(int fd, uint64_t *rootid); + +#endif /* __BTRFS_UTIL__ */ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/debug.h new/duperemove-v0.09.beta2/debug.h --- old/duperemove-v0.09.beta1/debug.h 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/debug.h 2014-10-31 00:27:39.000000000 +0100 @@ -2,6 +2,7 @@ #define __DEBUG_H__ #include <stdio.h> +#include <stdbool.h> extern int verbose; extern int debug; @@ -56,6 +57,7 @@ declare_alloc_tracking_header(filerec); declare_alloc_tracking_header(files_compared); declare_alloc_tracking_header(filerec_token); +declare_alloc_tracking_header(file_hash_head); /* Can be called anywhere we want to dump the above statistics */ void print_mem_stats(void); @@ -73,6 +75,45 @@ } \ } while(0) + +/* + * compiletime_assert and associated code taken from + * linux-3.14.git/include/linux/compiler.h + */ + +# define __compiletime_warning(message) +# define __compiletime_error(message) + +/* + * Sparse complains of variable sized arrays due to the temporary variable in + * __compiletime_assert. Unfortunately we can't just expand it out to make + * sparse see a constant array size without breaking compiletime_assert on old + * versions of GCC (e.g. 4.2.4), so hide the array from sparse altogether. + */ +#define __compiletime_error_fallback(condition) \ + do { ((void)sizeof(char[1 - 2 * condition])); } while (0) + +#define __compiletime_assert(condition, msg, prefix, suffix) \ + do { \ + bool __cond = !(condition); \ + extern void prefix ## suffix(void) __compiletime_error(msg); \ + if (__cond) \ + prefix ## suffix(); \ + __compiletime_error_fallback(__cond); \ + } while (0) + +/** + * compiletime_assert - break build and emit msg if condition is false + * @condition: a compile-time constant condition to check + * @msg: a message to emit if condition is false + * + * In tradition of POSIX assert, this macro will break the build if the + * supplied condition is *false*, emitting the supplied error message if the + * compiler has support to do so. + */ +#define compiletime_assert(condition, msg) \ + __compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) + /* * BUILD_BUG_ON() and associated code taken from * linux-2.6.git/include/linux/bug.h diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/duperemove.c new/duperemove-v0.09.beta2/duperemove.c --- old/duperemove-v0.09.beta1/duperemove.c 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/duperemove.c 2014-10-31 00:27:39.000000000 +0100 @@ -17,7 +17,6 @@ #include <sys/types.h> #include <sys/stat.h> -#include <sys/statfs.h> #include <limits.h> #include <fcntl.h> #include <unistd.h> @@ -32,7 +31,6 @@ #include <ctype.h> #include <getopt.h> #include <inttypes.h> -#include <linux/magic.h> #include <assert.h> #include <linux/fiemap.h> @@ -47,6 +45,7 @@ #include "dedupe.h" #include "util.h" #include "serialize.h" +#include "btrfs-util.h" #include "debug.h" /* exported via debug.h */ @@ -73,13 +72,17 @@ static char *serialize_fname = NULL; static unsigned int hash_threads = 0; +static int fancy_status = 0; + static void debug_print_block(struct file_block *e) { struct filerec *f = e->b_file; - printf("%s\tloff: %llu lblock: %llu seen: %u\n", f->filename, + printf("%s\tloff: %llu lblock: %llu seen: %u flags: 0x%x\n", + f->filename, (unsigned long long)e->b_loff, - (unsigned long long)e->b_loff / blocksize, e->b_seen); + (unsigned long long)e->b_loff / blocksize, e->b_seen, + e->b_flags); } static void debug_print_tree(struct hash_tree *tree) @@ -372,10 +375,12 @@ assert(buf != NULL); unsigned char *digest = malloc(DIGEST_LEN_MAX); assert(digest != NULL); + static long long unsigned cur_num_filerecs = 0; GMutex *tree_mutex = g_dataset_get_data(tree, "mutex"); - printf("csum: %s\n", file->filename); + printf("csum: %s \t[%llu/%llu]\n", file->filename, + __sync_add_and_fetch(&cur_num_filerecs, 1), num_filerecs); fc = alloc_fiemap_ctxt(); if (fc == NULL) /* This should be non-fatal */ @@ -448,6 +453,8 @@ filerec_close(file); free(digest); free(buf); + if (fc) + free(fc); return; @@ -456,6 +463,8 @@ err_noclose: free(digest); free(buf); + if (fc) + free(fc); fprintf( stderr, @@ -543,7 +552,7 @@ printf("\t-A\t\tOpens files readonly when deduping. Primarily for use by privileged users on readonly snapshots\n"); printf("\t-b bsize\tUse bsize blocks. Default is %dk.\n", DEFAULT_BLOCKSIZE / 1024); - printf("\t-h\t\tPrint numbers in human-readble format.\n"); + printf("\t-h\t\tPrint numbers in human-readable format.\n"); printf("\t-v\t\tBe verbose.\n"); printf("\t--hash-threads=N\n\t\t\tUse N threads for hashing phase. " "Default is automatically detected.\n"); @@ -593,44 +602,18 @@ return 0; } -static int check_file_fs(struct filerec *file, int *bad_fs) -{ - int ret; - struct statfs fs; - - *bad_fs = 0; - - if (!run_dedupe) - return 0; - - ret = filerec_open(file, 0); - if (ret) - return ret; - - ret = fstatfs(file->fd, &fs); - if (ret) { - ret = -errno; - goto out; - } - - if (fs.f_type != BTRFS_SUPER_MAGIC) - *bad_fs = 1; - -out: - filerec_close(file); - return ret; -} - /* * Returns nonzero on fatal errors only */ static int add_file(const char *name, int dirfd) { int ret, len = strlen(name); - int bad_fs; + int fd; + int on_btrfs = 0; struct stat st; char *pathtmp; struct filerec *file; + uint64_t subvolid; if (len > (path_max - pathp)) { fprintf(stderr, "Path max exceeded: %s %s\n", path, name); @@ -671,26 +654,59 @@ goto out; } - file = filerec_new(path, st.st_ino); - if (file == NULL) { - fprintf(stderr, "Out of memory while allocating file record " - "for: %s\n", path); - return ENOMEM; + fd = open(path, O_RDONLY); + if (fd == -1) { + ret = errno; + fprintf(stderr, "Error %d: %s while opening file \"%s\". " + "Skipping.\n", ret, strerror(ret), path); + goto out; } - ret = check_file_fs(file, &bad_fs); + ret = check_file_btrfs(fd, &on_btrfs); if (ret) { - fprintf(stderr, "Skip file \"%s\" due to errors\n", - file->filename); - filerec_free(file); - return ENOMEM; + close(fd); + fprintf(stderr, "Skip file \"%s\" due to errors\n", path); + goto out; } - if (bad_fs) { - fprintf(stderr, "Can only dedupe files on btrfs\n"); - filerec_free(file); + + if (run_dedupe && !on_btrfs) { + close(fd); + fprintf(stderr, "\"%s\": Can only dedupe files on btrfs\n", + path); return ENOSYS; } + if (on_btrfs) { + /* + * Inodes between subvolumes on a btrfs file system + * can have the same i_ino. Get the subvolume id of + * our file so hard link detection works. + */ + ret = lookup_btrfs_subvolid(fd, &subvolid); + if (ret) { + close(fd); + fprintf(stderr, + "Error %d: %s while finding subvolid for file " + "\"%s\". Skipping.\n", ret, strerror(ret), + path); + goto out; + } + } else { + subvolid = st.st_dev; + } + +// printf("\"%s\", ino: %llu, subvolid: %"PRIu64"\n", path, +// (unsigned long long)st.st_ino, subvolid); + + close(fd); + + file = filerec_new(path, st.st_ino, subvolid); + if (file == NULL) { + fprintf(stderr, "Out of memory while allocating file record " + "for: %s\n", path); + return ENOMEM; + } + out: pathp = pathtmp; return 0; @@ -784,12 +800,27 @@ /* Filter out option combinations that don't make sense. */ if (write_hashes && - (read_hashes || run_dedupe)) + (read_hashes || run_dedupe)) { + if (run_dedupe) + fprintf(stderr, + "Error: Can not dedupe with --write-hashes " + "option. Try writing hashes and then deduping " + "with --read-hashes instead.\n"); + if (read_hashes) + fprintf(stderr, + "Error: Specify only one of --write-hashes or " + "--read-hashes.\n"); + return 1; + } if (read_hashes) { - if (numfiles) + if (numfiles) { + fprintf(stderr, + "Error: --read-hashes option does not take a " + "file list argument\n"); return 1; + } goto out_nofiles; } @@ -852,19 +883,14 @@ (unsigned long long)eoff[1] / blocksize); } -struct dupe_walk_ctxt { - struct file_block *orig; - - struct filerec *orig_file; - struct filerec *walk_file; - - struct results_tree *res; -}; - -static int walk_dupe_block(struct file_block *block, void *priv) +static int walk_dupe_block(struct filerec *orig_file, + struct file_block *orig_file_block, + struct filerec *walk_file, + struct file_block *walk_file_block, + struct results_tree *res) { - struct dupe_walk_ctxt *ctxt = priv; - struct file_block *orig = ctxt->orig; + struct file_block *orig = orig_file_block; + struct file_block *block = walk_file_block; struct file_block *start[2] = { orig, block }; struct file_block *end[2]; struct running_checksum *csum; @@ -891,8 +917,8 @@ * This is kind of ugly, however it does correctly * signify the end of our list. */ - if (orig->b_file_next.next == &ctxt->orig_file->block_list || - block->b_file_next.next == &ctxt->walk_file->block_list) + if (orig->b_file_next.next == &orig_file->block_list || + block->b_file_next.next == &walk_file->block_list) break; orig = list_entry(orig->b_file_next.next, struct file_block, @@ -903,32 +929,59 @@ finish_running_checksum(csum, match_id); - record_match(ctxt->res, match_id, ctxt->orig_file, ctxt->walk_file, + record_match(res, match_id, orig_file, walk_file, start, end); out: return 0; } +/* + * Start an extent search (with orig_block) at each block in our dups + * list which is owned by walk_file. + */ +static void lookup_walk_file_hash_head(struct file_block *orig_block, + struct filerec *walk_file, + struct results_tree *res) +{ + struct dupe_blocks_list *parent = orig_block->b_parent; + struct file_block *cur; + struct file_hash_head *head = find_file_hash_head(parent, walk_file); + + /* find_file_dups should have checked this for us already */ + abort_on(head == NULL); + + list_for_each_entry(cur, &head->h_blocks, b_head_list) { + /* Ignore self. Technically this shouldn't happen (see above) + * until we allow walking a file against itself. */ + if (cur == orig_block) + continue; + + abort_on(cur->b_file != walk_file); + + if (walk_dupe_block(orig_block->b_file, orig_block, + walk_file, cur, res)) + break; + } +} + static void find_file_dupes(struct filerec *file, struct filerec *walk_file, struct results_tree *res) { struct file_block *cur; - struct dupe_walk_ctxt ctxt = { 0, }; list_for_each_entry(cur, &file->block_list, b_file_next) { if (block_seen(cur)) continue; + + if (!file_in_dups_list(cur->b_parent, walk_file)) + continue; + /* * For each file block with the same hash: * - Traverse, along with original file until we have no match * - record */ - memset(&ctxt, 0, sizeof(struct dupe_walk_ctxt)); - ctxt.orig_file = file; - ctxt.walk_file = walk_file; - ctxt.orig = cur; - ctxt.res = res; - for_each_dupe(cur, walk_file, walk_dupe_block, &ctxt); + lookup_walk_file_hash_head(cur, walk_file, res); } clear_all_seen_blocks(); } @@ -941,50 +994,6 @@ return mark_filerecs_compared(file1, file2); } -/* - * This dupe list is too large for the extent search algorithm to - * handle efficiently. Instead of walking the block list, we walk the - * list of files referenced and compare them to each other directly. - * - * A future improvement might be to always do this, at the cost of - * extra memory usage. - */ -static int walk_large_dups(struct hash_tree *tree, - struct results_tree *res, - struct dupe_blocks_list *dups) -{ - int ret; - struct rb_node *node = rb_first(&dups->dl_files_root); - struct rb_node *next; - struct filerec_token *t1, *t2; - struct filerec *file1, *file2; - - while (node) { - t1 = rb_entry(node, struct filerec_token, t_node); - file1 = t1->t_file; - - next = rb_next(node); - while (next) { - t2 = rb_entry(next, struct filerec_token, t_node); - file2 = t2->t_file; - - /* filerec token tree does not allow duplicates */ - abort_on(file1 == file2); - if (!filerecs_compared(file1, file2)) { - ret = compare_files(res, file1, file2); - if (ret) - return ret; - break; - } - - next = rb_next(next); - } - node = rb_next(node); - } - - return 0; -} - static int walk_dupe_hashes(struct dupe_blocks_list *dups, struct results_tree *res) { @@ -1043,13 +1052,59 @@ return 0; } +static void update_extent_search_status(struct hash_tree *tree, + unsigned long long processed) +{ + static int last_pos = -1; + int i, pos; + int width = 40; + float progress; + + if (!fancy_status) + return; + + progress = (float) processed / tree->num_hashes; + pos = width * progress; + + /* Only update our status every width% */ + if (pos <= last_pos) + return; + last_pos = pos; + + printf("\r["); + for(i = 0; i < width; i++) { + if (i < pos) + printf("#"); + else if (i == pos) + printf("%%"); + else + printf(" "); + } + printf("]"); + fflush(stdout); +} + +static void clear_extent_search_status(unsigned long long processed, + int err) +{ + if (!fancy_status) + return; + + if (err) + printf("\rSearch exited (%llu processed) with error %d: " + "\"%s\"\n", processed, err, strerror(err)); + else + printf("\rSearch completed with no errors. \n"); + fflush(stdout); +} + static int find_all_dups(struct hash_tree *tree, struct results_tree *res) { - int ret; + int ret = 0; struct rb_root *root = &tree->root; struct rb_node *node = rb_first(root); struct dupe_blocks_list *dups; - LIST_HEAD(large_dupes); + unsigned long long processed = 0; while (1) { if (node == NULL) @@ -1057,29 +1112,22 @@ dups = rb_entry(node, struct dupe_blocks_list, dl_node); - if (dups->dl_num_files) { - list_add_tail(&dups->dl_large_list, &large_dupes); + update_extent_search_status(tree, processed); - printf("Hash (\""); - debug_print_digest(stdout, dups->dl_hash); - printf("\") has %u items (across %u files), processing later.\n", - dups->dl_num_elem, dups->dl_num_files); - } else if (dups->dl_num_elem > 1) { + if (dups->dl_num_elem > 1) { ret = walk_dupe_hashes(dups, res); if (ret) - return ret; + goto out; } - node = rb_next(node); - } + processed++; - list_for_each_entry(dups, &large_dupes, dl_large_list) { - ret = walk_large_dups(tree, res, dups); - if (ret) - return ret; + node = rb_next(node); } +out: - return 0; + clear_extent_search_status(processed, ret); + return ret; } int main(int argc, char **argv) @@ -1101,6 +1149,9 @@ return EINVAL; } + if (isatty(STDOUT_FILENO)) + fancy_status = 1; + if (read_hashes) { ret = read_hash_tree(serialize_fname, &tree, &blocksize, NULL); if (ret == FILE_VERSION_ERROR) { @@ -1143,8 +1194,9 @@ fprintf(stderr, "Error %d while writing to hash file\n", ret); goto out; } else { - printf("Hashed %"PRIu64" blocks. Calculating duplicate " - "extents - this may take some time.\n", tree.num_blocks); + printf("Hashed %"PRIu64" blocks, resulting in %"PRIu64" unique " + "hashes. Calculating duplicate extents - this may take " + "some time.\n", tree.num_blocks, tree.num_hashes); } ret = find_all_dups(&tree, &res); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/filerec.c new/duperemove-v0.09.beta2/filerec.c --- old/duperemove-v0.09.beta1/filerec.c 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/filerec.c 2014-10-31 00:27:39.000000000 +0100 @@ -83,13 +83,19 @@ free_filerec_token(token); } +static void filerec_token_init(struct filerec_token *token, + struct filerec *file) +{ + rb_init_node(&token->t_node); + token->t_file = file; +} + struct filerec_token *filerec_token_new(struct filerec *file) { struct filerec_token *token = malloc_filerec_token(); if (token) { - rb_init_node(&token->t_node); - token->t_file = file; + filerec_token_init(token, file); } return token; } @@ -151,8 +157,23 @@ } } +static int cmp_filerecs(struct filerec *file1, uint64_t file2_inum, + uint64_t file2_subvolid) +{ + if (file1->inum < file2_inum) + return -1; + else if (file1->inum > file2_inum) + return 1; + if (file1->subvolid < file2_subvolid) + return -1; + if (file1->subvolid > file2_subvolid) + return -1; + return 0; +} + static void insert_filerec(struct filerec *file) { + int c; struct rb_node **p = &filerec_by_inum.rb_node; struct rb_node *parent = NULL; struct filerec *tmp; @@ -162,9 +183,10 @@ tmp = rb_entry(parent, struct filerec, inum_node); - if (file->inum < tmp->inum) + c = cmp_filerecs(tmp, file->inum, file->subvolid); + if (c < 0) p = &(*p)->rb_left; - else if (file->inum > tmp->inum) + else if (c > 0) p = &(*p)->rb_right; else abort_lineno(); /* We should never find a duplicate */ @@ -175,17 +197,19 @@ return; } -static struct filerec *find_filerec(uint64_t inum) +static struct filerec *find_filerec(uint64_t inum, uint64_t subvolid) { + int c; struct rb_node *n = filerec_by_inum.rb_node; struct filerec *file; while (n) { file = rb_entry(n, struct filerec, inum_node); - if (inum < file->inum) + c = cmp_filerecs(file, inum, subvolid); + if (c < 0) n = n->rb_left; - else if (inum > file->inum) + else if (c > 0) n = n->rb_right; else return file; @@ -193,7 +217,8 @@ return NULL; } -static struct filerec *filerec_alloc_insert(const char *filename, uint64_t inum) +static struct filerec *filerec_alloc_insert(const char *filename, + uint64_t inum, uint64_t subvolid) { struct filerec *file = calloc_filerec(1); @@ -210,6 +235,7 @@ INIT_LIST_HEAD(&file->tmp_list); rb_init_node(&file->inum_node); file->inum = inum; + file->subvolid = subvolid; file->comparisons = RB_ROOT; insert_filerec(file); @@ -219,11 +245,12 @@ return file; } -struct filerec *filerec_new(const char *filename, uint64_t inum) +struct filerec *filerec_new(const char *filename, uint64_t inum, + uint64_t subvolid) { - struct filerec *file = find_filerec(inum); + struct filerec *file = find_filerec(inum, subvolid); if (!file) - file = filerec_alloc_insert(filename, inum); + file = filerec_alloc_insert(filename, inum, subvolid); return file; } @@ -458,9 +485,12 @@ last = 1; dprintf("(fiemap) [%d] fe_logical: %llu, " - "fe_length: %llu\n", + "fe_length: %llu, fe_physical: %llu, " + "fe_flags: 0x%x\n", i, (unsigned long long)fm_ext[i].fe_logical, - (unsigned long long)fm_ext[i].fe_length); + (unsigned long long)fm_ext[i].fe_length, + (unsigned long long)fm_ext[i].fe_physical, + fm_ext[i].fe_flags); loff = fm_ext[i].fe_logical; ext_len = fm_ext[i].fe_length; @@ -494,8 +524,9 @@ last = 1; } - dprintf("(fiemap) loff: %"PRIu64" ext_len: %"PRIu64"\n", - loff, ext_len); +// dprintf("(fiemap) loff: %"PRIu64" ext_len: %"PRIu64 +// " flags: 0x%x\n", +// loff, ext_len, fm_ext[i].fe_flags); if (fm_ext[i].fe_flags & FIEMAP_EXTENT_SHARED) *shared_bytes += ext_len; @@ -527,7 +558,7 @@ return 1; } - file = filerec_new(argv[1], 500); /* Use made up ino */ + file = filerec_new(argv[1], 500, 1); /* Use made up ino */ if (!file) { fprintf(stderr, "filerec_new(): malloc error\n"); return 1; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/filerec.h new/duperemove-v0.09.beta2/filerec.h --- old/duperemove-v0.09.beta1/filerec.h 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/filerec.h 2014-10-31 00:27:39.000000000 +0100 @@ -11,6 +11,7 @@ struct filerec { int fd; /* file descriptor */ char *filename; /* path to file */ + uint64_t subvolid; uint64_t inum; struct rb_node inum_node; @@ -29,7 +30,8 @@ void init_filerec(void); -struct filerec *filerec_new(const char *filename, uint64_t inum); +struct filerec *filerec_new(const char *filename, uint64_t inum, + uint64_t subvolid); void filerec_free(struct filerec *file); int filerec_open(struct filerec *file, int write); void filerec_close(struct filerec *file); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/hash-tree.c new/duperemove-v0.09.beta2/hash-tree.c --- old/duperemove-v0.09.beta1/hash-tree.c 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/hash-tree.c 2014-10-31 00:27:39.000000000 +0100 @@ -34,48 +34,83 @@ declare_alloc_tracking(file_block); declare_alloc_tracking(dupe_blocks_list); -static int add_one_filerec_token(struct dupe_blocks_list *dups, - struct filerec *file) +struct file_hash_head *find_file_hash_head(struct dupe_blocks_list *dups, + struct filerec *file) { - struct filerec_token *t = NULL; + struct rb_node *n = dups->dl_files_root.rb_node; + struct file_hash_head *head; - if (find_filerec_token_rb(&dups->dl_files_root, file)) - return 0; - - t = filerec_token_new(file); - if (!t) - return ENOMEM; + while (n) { + head = rb_entry(n, struct file_hash_head, h_node); - insert_filerec_token_rb(&dups->dl_files_root, t); - dups->dl_num_files++; - return 0; + if (head->h_file < file) + n = n->rb_left; + else if (head->h_file > file) + n = n->rb_right; + else return head; + } + return NULL; } -static int add_filerec_tokens(struct dupe_blocks_list *dups) +static void insert_file_hash_head(struct dupe_blocks_list *dups, + struct file_hash_head *head) { - struct file_block *block; + struct rb_node **p = &dups->dl_files_root.rb_node; + struct rb_node *parent = NULL; + struct file_hash_head *tmp; - list_for_each_entry(block, &dups->dl_list, b_list) { - if (add_one_filerec_token(dups, block->b_file)) - return ENOMEM; + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct file_hash_head, h_node); + + if (tmp->h_file < head->h_file) + p = &(*p)->rb_left; + else if (tmp->h_file > head->h_file) + p = &(*p)->rb_right; + else abort_lineno(); /* We should never find a duplicate */ } - return 0; + + rb_link_node(&head->h_node, parent, p); + rb_insert_color(&head->h_node, &dups->dl_files_root); } -static void free_filerec_tokens(struct dupe_blocks_list *dups) +static int add_file_hash_head(struct dupe_blocks_list *dups, + struct file_block *block) { - struct rb_node *node = rb_first(&dups->dl_files_root); - struct filerec_token *t; + struct filerec *file = block->b_file; + struct file_hash_head *head = find_file_hash_head(dups, file); - while (node) { - t = rb_entry(node, struct filerec_token, t_node); + if (head) + goto add; - node = rb_next(node); + head = malloc(sizeof(*head)); + if (!head) + return ENOMEM; - dups->dl_num_files--; - rb_erase(&t->t_node, &dups->dl_files_root); - filerec_token_free(t); - } + head->h_file = file; + rb_init_node(&head->h_node); + INIT_LIST_HEAD(&head->h_blocks); + insert_file_hash_head(dups, head); + dups->dl_num_files++; +add: + /* We depend on this being added in increasing block order */ + list_add_tail(&block->b_head_list, &head->h_blocks); + return 0; +} + +static void free_one_hash_head(struct dupe_blocks_list *dups, + struct file_hash_head *head) +{ + rb_erase(&head->h_node, &dups->dl_files_root); + free(head); +} + +int file_in_dups_list(struct dupe_blocks_list *dups, struct filerec *file) +{ + if (find_file_hash_head(dups, file)) + return 1; + return 0; } static void insert_block_list(struct hash_tree *tree, @@ -147,30 +182,27 @@ rb_init_node(&d->dl_node); rb_init_node(&d->dl_by_size); INIT_LIST_HEAD(&d->dl_list); - INIT_LIST_HEAD(&d->dl_large_list); d->dl_files_root = RB_ROOT; insert_block_list(tree, d); } - if (d->dl_num_elem >= DUPLIST_CONVERT_LIMIT && d->dl_num_files == 0) { - if (add_filerec_tokens(d)) - return ENOMEM; - } - e->b_file = file; e->b_seen = 0; e->b_loff = loff; e->b_flags = flags; - list_add_tail(&e->b_file_next, &file->block_list); - file->num_blocks++; e->b_parent = d; - if (d->dl_num_files) { - if (add_one_filerec_token(d, file)) - return ENOMEM; + INIT_LIST_HEAD(&e->b_head_list); + + if (add_file_hash_head(d, e)) { + free_file_block(e); + return ENOMEM; } + list_add_tail(&e->b_file_next, &file->block_list); + file->num_blocks++; + d->dl_num_elem++; list_add_tail(&e->b_list, &d->dl_list); @@ -182,6 +214,7 @@ struct file_block *block, struct filerec *file) { struct dupe_blocks_list *blocklist = block->b_parent; + struct file_hash_head *head; abort_on(blocklist->dl_num_elem == 0); @@ -193,12 +226,16 @@ list_del(&block->b_file_next); list_del(&block->b_list); + list_del(&block->b_head_list); + head = find_file_hash_head(blocklist, file); + if (head && list_empty(&head->h_blocks)) + free_one_hash_head(blocklist, head); + blocklist->dl_num_elem--; if (blocklist->dl_num_elem == 0) { rb_erase(&blocklist->dl_node, &tree->root); tree->num_hashes--; - free_filerec_tokens(blocklist); free_dupe_blocks_list(blocklist); } @@ -214,25 +251,6 @@ remove_hashed_block(tree, block, file); } -void for_each_dupe(struct file_block *block, struct filerec *file, - for_each_dupe_t func, void *priv) -{ - struct dupe_blocks_list *parent = block->b_parent; - struct file_block *cur; - - list_for_each_entry(cur, &parent->dl_list, b_list) { - /* Ignore self and any blocks from another file */ - if (cur == block) - continue; - - if (cur->b_file != file) - continue; - - if (func(cur, priv)) - break; - } -} - static unsigned int seen_counter = 1; int block_seen(struct file_block *block) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/hash-tree.h new/duperemove-v0.09.beta2/hash-tree.h --- old/duperemove-v0.09.beta1/hash-tree.h 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/hash-tree.h 2014-10-31 00:27:39.000000000 +0100 @@ -14,24 +14,14 @@ unsigned int dl_num_elem; struct list_head dl_list; - /* - * num_files and files_root are used when the total number of - * blocks in the list exceeds DUPLIST_CONVERT_LIMIT (defined - * below) - */ unsigned int dl_num_files; - struct rb_root dl_files_root; - struct list_head dl_large_list; /* Temporary list for - * use by extent finding code */ + struct rb_root dl_files_root; /* stores file_hash_head nodes */ unsigned char dl_hash[DIGEST_LEN_MAX]; }; -/* Max number of blocks before we'll add filerec tokens */ -#define DUPLIST_CONVERT_LIMIT 30000 - /* Fiemap flags that would cause us to skip comparison of the block */ -#define FIEMAP_SKIP_FLAGS (FIEMAP_EXTENT_UNKNOWN|FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_UNWRITTEN) +#define FIEMAP_SKIP_FLAGS (FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_UNWRITTEN) /* Fiemap flags that indicate the extent may have already been deduped */ #define FIEMAP_DEDUPED_FLAGS (FIEMAP_EXTENT_SHARED) @@ -50,15 +40,33 @@ * with this md5. */ struct list_head b_file_next; /* filerec->block_list */ + + struct list_head b_head_list; /* file_hash_head->h_blocks */ }; int insert_hashed_block(struct hash_tree *tree, unsigned char *digest, struct filerec *file, uint64_t loff, unsigned int flags); void remove_hashed_blocks(struct hash_tree *tree, struct filerec *file); -typedef int (for_each_dupe_t)(struct file_block *, void *); -void for_each_dupe(struct file_block *block, struct filerec *file, - for_each_dupe_t func, void *priv); +/* + * Stores a list of blocks with the same hash / filerec + * combination. Each dupe_blocks_list keeps a tree of these (sorted by + * file). + * + * This speeds up the extent search by allowing us to skip blocks that + * don't belong to the file we are 'walking'. Blocks are inserted into + * h_blocks in the same order they are given to insert_hashed_block() + * (that is, in order of increasing offset). + */ +struct file_hash_head { + struct filerec *h_file; + struct rb_node h_node; + struct list_head h_blocks; +}; + +int file_in_dups_list(struct dupe_blocks_list *dups, struct filerec *file); +struct file_hash_head *find_file_hash_head(struct dupe_blocks_list *dups, + struct filerec *file); int block_seen(struct file_block *block); int block_ever_seen(struct file_block *block); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/hashstats.c new/duperemove-v0.09.beta2/hashstats.c --- old/duperemove-v0.09.beta1/hashstats.c 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/hashstats.c 2014-10-31 00:27:39.000000000 +0100 @@ -93,6 +93,21 @@ } } +static void printf_file_block_flags(struct file_block *block) +{ + if (!block->b_flags) + return; + + printf("( "); + if (block->b_flags & FILE_BLOCK_SKIP_COMPARE) + printf("skip_compare "); + if (block->b_flags & FILE_BLOCK_DEDUPED) + printf("deduped "); + if (block->b_flags & FILE_BLOCK_HOLE) + printf("hole "); + printf(")"); +} + static void print_by_size(void) { struct rb_node *node = rb_first(&by_size); @@ -104,6 +119,8 @@ else printf("Print top %d hashes\n", num_to_print); + printf("Hash, # Blocks, # Files\n"); + while (1) { if (node == NULL) break; @@ -111,14 +128,18 @@ dups = rb_entry(node, struct dupe_blocks_list, dl_by_size); debug_print_digest(stdout, dups->dl_hash); - printf(", %u\n", dups->dl_num_elem); + printf(", %u, %u\n", dups->dl_num_elem, dups->dl_num_files); if (print_blocks) { list_for_each_entry(block, &dups->dl_list, b_list) { struct filerec *f = block->b_file; - printf(" %s\tloff: %llu lblock: %llu\n", f->filename, + printf(" %s\tloff: %llu lblock: %llu " + "flags: 0x%x ", f->filename, (unsigned long long)block->b_loff, - (unsigned long long)block->b_loff / blocksize); + (unsigned long long)block->b_loff / blocksize, + block->b_flags); + printf_file_block_flags(block); + printf("\n"); } } @@ -133,12 +154,12 @@ { struct filerec *file; - printf("Showing %llu files.\nInode\tBlocks Stored\tFilename\n", + printf("Showing %llu files.\nInode\tBlocks Stored\tSubvold ID\tFilename\n", num_filerecs); list_for_each_entry(file, &filerec_list, rec_list) { - printf("%"PRIu64"\t%"PRIu64"\t%s\n", file->inum, - file->num_blocks, file->filename); + printf("%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%s\n", file->inum, + file->num_blocks, file->subvolid, file->filename); } } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/serialize.c new/duperemove-v0.09.beta2/serialize.c --- old/duperemove-v0.09.beta1/serialize.c 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/serialize.c 2014-10-31 00:27:39.000000000 +0100 @@ -34,8 +34,8 @@ #include "debug.h" #include "csum.h" -#include "hash-tree.h" #include "filerec.h" +#include "hash-tree.h" #include "serialize.h" @@ -131,6 +131,7 @@ finfo.ino = swap64(file->inum); finfo.file_size = 0ULL; /* We don't store this yet */ finfo.num_blocks = swap64(file->num_blocks); + finfo.subvolid = swap64(file->subvolid); name_len = strlen(file->filename); finfo.name_len = swap16(name_len); @@ -256,6 +257,7 @@ f->ino = swap64(disk.ino); f->file_size = swap64(disk.file_size); f->num_blocks = swap64(disk.num_blocks); + f->subvolid = swap64(disk.subvolid); f->rec_len = swap16(disk.rec_len); f->name_len = swap16(disk.name_len); @@ -295,7 +297,7 @@ dprintf("Load %"PRIu64" hashes for \"%s\"\n", finfo.num_blocks, fname); - file = filerec_new(fname, finfo.ino); + file = filerec_new(fname, finfo.ino, finfo.subvolid); if (file == NULL) return ENOMEM; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.09.beta1/serialize.h new/duperemove-v0.09.beta2/serialize.h --- old/duperemove-v0.09.beta1/serialize.h 2014-09-27 01:56:37.000000000 +0200 +++ new/duperemove-v0.09.beta2/serialize.h 2014-10-31 00:27:39.000000000 +0100 @@ -47,7 +47,8 @@ uint16_t rec_len; uint16_t name_len; uint32_t pad0; -/*20*/ uint64_t pad1[3]; + uint64_t subvolid; +/*20*/ uint64_t pad1[2]; char name[0]; }; -- To unsubscribe, e-mail: opensuse-commit+unsubscribe@opensuse.org For additional commands, e-mail: opensuse-commit+help@opensuse.org