/[dtapublic]/projs/dtats/trunk/projs/20161007_dedup/qdedup.c
ViewVC logotype

Annotation of /projs/dtats/trunk/projs/20161007_dedup/qdedup.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 75 - (hide annotations) (download)
Sat Nov 5 18:40:38 2016 UTC (8 years, 1 month ago) by dashley
Original Path: projs/trunk/projs/20161007_dedup/qdedup.c
File MIME type: text/plain
File size: 28351 byte(s)
Edits.
1 dashley 71 //----------------------------------------------------------------------------------------------------
2     //qdedup.c
3     //----------------------------------------------------------------------------------------------------
4     //Quick and dirty program to eliminate duplicates from a file tree. A file containing the SHA512
5 dashley 74 //hashes of all the files to be considered must already exist, and must be regenerated each time the
6     //underlying files are deleted/added/modified, which means the file must be regenerated after each run
7     //of qdedup. (WARNING: IF YOU DO NOT REGENERATE THE FILE AFTER EACH RUN OF qdedup, YOU WILL
8     //PROBABLY DESTROY DATA. THE MECHANISM WOULD BE THAT THE SHA512 MANIFEST IMPLIES THAT DUPLICATES
9     //EXIST WHEN THEY NO LONGER DO, SO qdedup WILL ERRONEOUSLY DELETE THE LAST COPIES OF FILES.) The
10     //program will eliminate duplicates within a single specified directory or outside a single specified
11     //directory.
12     //
13     //This program will compile and run only on *nix systems and under Cygwin on Windows systems.
14 dashley 71 //----------------------------------------------------------------------------------------------------
15 dashley 74 //Copyright David T. Ashley (dashley@gmail.com), 2016.
16 dashley 71 //----------------------------------------------------------------------------------------------------
17 dashley 74 //Provided under the MIT LICENSE, reproduced immediately below.
18     //----------------------------------------------------------------------------------------------------
19     //Permission is hereby granted, free of charge, to any person obtaining a copy of
20     //this software and associated documentation files (the "Software"), to deal in the
21     //Software without restriction, including without limitation the rights to use,
22     //copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
23     //Software, and to permit persons to whom the Software is furnished to do so,
24     //subject to the following conditions:
25 dashley 71 //
26 dashley 74 //The above copyright notice and this permission notice shall be included in all
27     //copies or substantial portions of the Software.
28 dashley 71 //
29 dashley 74 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30     //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31     //FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32     //AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33     //LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34     //OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
35     //SOFTWARE.
36 dashley 71 //----------------------------------------------------------------------------------------------------
37     //All paths in the SHA512 file must be absolute or must be relative to the current working directory
38     //at the time this program is run.
39     //
40 dashley 74 //The recommended method to generate the SHA512 file is using the "-exec" option of the "find"
41     //command, i.e.
42     //
43     // find target_directory -type f -exec sha512sum {} \; >sha512sums.txt
44     //
45 dashley 71 //If any files are deleted by the program, a new SHA512 file must be generated before the program is
46     //run again to delete files. The reason for this restriction is that the program will never knowingly
47     //delete the last copy of a file. If the SHA512 file contains the digests of files that no longer
48     //exist, the program may unknowingly delete the last copies of files (because it believes based on
49     //the SHA512 file that other copies exist when in fact they do not).
50     //
51     //The SHA512 file does not need to be sorted (this program sorts it internally by hash before using it).
52     //
53     //This program is designed to compile and run under Cygwin or *nix only.
54     //
55     //Usage:
56     // qdedup
57     // Prints help information and exits.
58     // qdedup ndups <sha512file>
59     // Prints statistics about the number of duplicates in <sha512file>.
60     // qdedup filterdups <sha512file>
61     // Analyzes duplicates and prints the filenames of groups of duplicates. The output is designed
62     // for hand analysis so that insight can be gained into what duplicates exist and where they
63     // are located.
64 dashley 74 // qdedup dedup_preserve_inside <sha512file> <path>
65 dashley 71 // For each group of duplicates that exists, preserves the duplicates that exist within path
66     // and removes all others. If no copies of the duplicate exist within path, no copies of the
67     // duplicate will be removed.
68 dashley 74 // qdedup dryrun_preserve_inside <sha512file> <path>
69     // Exactly like "dedup_preserve_inside", except that no files will be deleted. Text will be
70     // output to explain what would be deleted by "dedup_preserve_inside".
71     // qdedup dedup_nopath <sha512file>
72 dashley 71 // For each group of duplicates that exists, preserves only the first (the one with the lowest
73     // sort-order filename).
74 dashley 74 // qdedup dryrun_nopath <sha512file>
75     // Exactly like "dedup_nopath", except that no files will be deleted. Text will be
76     // output to explain what would be deleted by "dedup_nopath".
77     // qdedup dedup_preserve_outside <sha512file> <path>
78     // For each group of duplicates that exists, deletes duplicates only from within the specified
79     // path. If any duplicates do not have at least one copy within <path> no instances of the
80     // duplicate are deleted.
81     // qdedup dryrun_preserve_outside <sha512file> <path>
82     // Exactly like "dedup_preserve_outside", except that no files will be deleted. Text will be
83     // output to explain what would be deleted by "dedup_preserve_outside".
84 dashley 71 //----------------------------------------------------------------------------------------------------
85     #include <math.h>
86     #include <stdio.h>
87     #include <stdlib.h>
88     #include <string.h>
89     #include <time.h>
90     #include <unistd.h>
91     //----------------------------------------------------------------------------------------------------
//Number of printable characters in a line; this is the width of the horizontal
//rules written to the console.
#define LINELEN         (78)
//Maximum number of characters that may be in a line of the SHA512 input file.
//The \0 terminator is included in this count, so a line may hold at most this
//value minus 1 characters.
#define MAXLINELEN      (2000)
//Number of seconds to pause between file unlinks (deletions).  The pause is
//meant to give the user time to abort the program, if desired, before
//catastrophic quantities of files are deleted.
#define UNLINKPAUSETIME (0.1)
99     //----------------------------------------------------------------------------------------------------
//One record of the SHA512 input file: the character representation of an SHA512
//hash, plus the filename that was listed with it.
typedef struct
{
    //Hash text: 512/4 = 128 characters, plus 1 character for the zero terminator.
    char hash[129];
    //Filename as specified in the file; the storage is allocated via the malloc()
    //family.
    char *fname;
} tFileHashRecord;
109     //----------------------------------------------------------------------------------------------------
110     //----------------------------------------------------------------------------------------------------
111     //----- CHARACTER CLASSIFICATION FUNCTIONS ---------------------------------------------------------
112     //----------------------------------------------------------------------------------------------------
113     //----------------------------------------------------------------------------------------------------
//Returns 1 if c may appear in the textual form of a hash, i.e. it is a decimal
//digit or one of the lowercase letters 'a' through 'f'; returns 0 otherwise.
//Uppercase hexadecimal letters are not accepted.
int is_valid_hash_char(char c)
{
    int is_digit       = (c >= '0') && (c <= '9');
    int is_lower_a_to_f = (c >= 'a') && (c <= 'f');

    return (is_digit || is_lower_a_to_f);
}
142     //----------------------------------------------------------------------------------------------------
//Returns 1 if c is one of the characters that make up a newline sequence, or 0
//otherwise.  The two accepted character codes are 13 and 10 (carriage return and
//line feed in ASCII).
int is_newline_sequence_char(char c)
{
    if (c == 13)
    {
        return 1;
    }

    if (c == 10)
    {
        return 1;
    }

    return 0;
}
157    
158     //----------------------------------------------------------------------------------------------------
159     //----------------------------------------------------------------------------------------------------
160     //----- FORMATTED OUTPUT FUNCTIONS -----------------------------------------------------------------
161     //----------------------------------------------------------------------------------------------------
162     //----------------------------------------------------------------------------------------------------
//Writes the character c to the stream s exactly n times.  Nothing is written when
//n is 0.  Write errors are not reported.
void stream_rep_char(FILE *s, char c, unsigned n)
{
    unsigned written;

    for (written = 0; written < n; written++)
    {
        fputc(c, s);
    }
}
172     //----------------------------------------------------------------------------------------------------
173     //Prints a horizontal line to a stream, including the newline.
174     //
175     void stream_hline(FILE *s)
176     {
177     stream_rep_char(s, '-', LINELEN);
178     fprintf(s, "\n");
179     }
180     //----------------------------------------------------------------------------------------------------
//Writes a horizontal rule (LINELEN '-' characters and a newline) to stdout.
void stdout_hline(void)
{
    stream_hline(stdout);
}
188     //----------------------------------------------------------------------------------------------------
189     //----------------------------------------------------------------------------------------------------
190     //----- FATAL ERROR FUNCTIONS ----------------------------------------------------------------------
191     //----------------------------------------------------------------------------------------------------
192     //----------------------------------------------------------------------------------------------------
//Reports an unrecoverable error on stdout, framed by horizontal rules, and then
//terminates the process with exit status 1.  This function does not return.
//
//desc : human-readable description of the error.
//file : name of the source file reporting the error (callers pass __FILE__).
//line : source line reporting the error (callers pass __LINE__).
void fatal(const char *desc, const char *file, unsigned line)
{
    stdout_hline();
    fprintf(stdout,
            "Fatal error: %s\n"
            "Source file: %s\n"
            "Line : %u\n",
            desc,
            file,
            line);
    stdout_hline();

    exit(1);
}
204     //----------------------------------------------------------------------------------------------------
205     //----------------------------------------------------------------------------------------------------
206     //----- MEMORY ALLOCATION WRAPPERS -----------------------------------------------------------------
207     //----------------------------------------------------------------------------------------------------
208     //----------------------------------------------------------------------------------------------------
//malloc() wrapper that never returns NULL.
//
//nbytes : number of bytes to allocate; must be non-zero.
//
//Returns a pointer to nbytes of zero-filled storage.  A request for 0 bytes, or
//an allocation failure, is a fatal error (the process exits via fatal()).  The
//caller owns the returned storage.
void *w_malloc(size_t nbytes)
{
    void *rv;

    if (!nbytes)
    {
        fatal("Memory allocation request for 0 bytes.", __FILE__, __LINE__);
    }

    rv = malloc(nbytes);

    if (!rv)
    {
        fatal("Out of memory in malloc() request.", __FILE__, __LINE__);
    }

    //Zero out, just for consistency.
    memset(rv, 0, nbytes);

    //Hand the block back to the caller.  Without this return the caller would use
    //an indeterminate pointer value.
    return rv;
}
229     //----------------------------------------------------------------------------------------------------
//realloc() wrapper that never returns NULL.
//
//p : pointer previously obtained from the malloc() family; must not be NULL.
//n : new size in bytes; must be non-zero.
//
//Returns a pointer to the resized storage, which may differ from p.  A NULL p, a
//request for 0 bytes, or an allocation failure is a fatal error (the process
//exits via fatal()).  Any newly added bytes are not zeroed.
void *w_realloc(void *p, size_t n)
{
    void *rv;

    if (!n)
    {
        fatal("Memory reallocation request for 0 bytes.", __FILE__, __LINE__);
    }

    if (!p)
    {
        fatal("Memory reallocation request with NULL pointer.", __FILE__, __LINE__);
    }

    rv = realloc(p, n);

    if (!rv)
    {
        fatal("Out of memory in realloc() request.", __FILE__, __LINE__);
    }

    //Hand the (possibly moved) block back to the caller.  Without this return the
    //caller would use an indeterminate pointer value.
    return rv;
}
252     //----------------------------------------------------------------------------------------------------
253     //----------------------------------------------------------------------------------------------------
254     //----- SLEEP FUNCTIONS ----------------------------------------------------------------------------
255     //----------------------------------------------------------------------------------------------------
256     //----------------------------------------------------------------------------------------------------
//Suspends execution for the given time, in seconds (fractions allowed).
//
//seconds : time to sleep.  A negative value, or a value above 3600, is a fatal
//          error.
//
//The result of nanosleep() is not examined.
void w_sleep(double seconds)
{
    struct timespec request;
    double whole_seconds;

    if (seconds < 0)
    {
        fatal("Sleep for negative time request.", __FILE__, __LINE__);
    }
    else if (seconds > 3600)
    {
        fatal("Sleep for too long request.", __FILE__, __LINE__);
    }

    //Split the request into integral seconds and the fractional remainder,
    //the latter expressed in nanoseconds.
    whole_seconds   = floor(seconds);
    request.tv_sec  = whole_seconds;
    request.tv_nsec = (seconds - whole_seconds) * 1E9;

    nanosleep(&request, NULL);
}
276     //----------------------------------------------------------------------------------------------------
277     //----------------------------------------------------------------------------------------------------
278     //----- SHA512 FIELD READ FUNCTIONS ----------------------------------------------------------------
279     //----------------------------------------------------------------------------------------------------
280     //----------------------------------------------------------------------------------------------------
281     //These functions read in an individual field of a standard SHA512 file generated using application
282     //of the standard sha512sum program.
283     //
284     //*rcode = 1, success.
285     // 0, legal end of file, record assigned.
286     void get_sha512file_line(FILE *s, int *rcode, tFileHashRecord *hash_rec)
287     {
288     unsigned bidx;
289     unsigned nchars;
290     int ic;
291     int exitflag;
292     int eoffound;
293     int eolfound;
294     char c;
295     char buf[MAXLINELEN];
296    
297     //Zero out the buffer. This handles string termination automatically.
298     memset(buf, 0, sizeof(buf));
299    
300     //Read characters into the buffer until either hit EOF, newline, or can't
301     //fill the buffer any longer.
302     eoffound = 0;
303     eolfound = 0;
304     exitflag = 0;
305     bidx = 0;
306     do
307     {
308     ic = fgetc(s);
309     c = ic;
310    
311     if (ic == EOF)
312     {
313     eoffound = 1;
314     eolfound = 0;
315     nchars = bidx;
316     exitflag = 1;
317     }
318     else if (is_newline_sequence_char(c))
319     {
320     eoffound = 0;
321     eolfound = 1;
322     nchars = bidx;
323     exitflag = 1;
324     }
325     else if (bidx >= (MAXLINELEN - 1))
326     {
327     fatal("SHA512 hash file line too long to parse.", __FILE__, __LINE__);
328     }
329     else
330     {
331     buf[bidx] = c;
332     bidx++;
333     exitflag = 0;
334     }
335     } while(! exitflag);
336    
337     //If we encountered a newline, inch past it. We may encounter an EOF.
338     if (eolfound)
339     {
340     exitflag = 0;
341     do
342     {
343     ic = fgetc(s);
344     c = ic;
345    
346     if (ic == EOF)
347     {
348     eoffound = 1;
349     eolfound = 0;
350     exitflag = 1;
351     }
352     else if (is_newline_sequence_char(c))
353     {
354     exitflag = 0;
355     }
356     else
357     {
358     //We hit the next line. Put the character back.
359     eoffound = 0;
360     eolfound = 1;
361     ungetc(ic, s);
362     exitflag = 1;
363     }
364     } while(! exitflag);
365     }
366    
367     //For better or worse, we have a \0-terminated line in the buffer.
368     //
369     //Zero the caller's area. This takes care of the hash terminator as well.
370     memset(hash_rec, 0, sizeof(*hash_rec));
371    
372     //Ensure that we have at least 128 characters, and they are all hex characters.
373     //Otherwise, we can't proceed.
374     if (nchars < 128)
375     {
376     fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
377     }
378     else
379     {
380     for (bidx = 0; bidx < 128; bidx++)
381     {
382     if (! is_valid_hash_char(buf[bidx]))
383     {
384     fatal("Character in SHA512 hash portion of line inconsistent with hash.", __FILE__, __LINE__);
385     }
386     }
387     }
388    
389     //The 129th and 130'th character must be present and must be a space and asterisk, respectively.
390     if (nchars < 130)
391     {
392     fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
393     }
394     else if (buf[128] != ' ')
395     {
396     fatal("129th hash line character must be \" \".", __FILE__, __LINE__);
397     }
398 dashley 75 // else if (buf[129] != '*')
399     // {
400     // fatal("130th hash line character must be \"*\".", __FILE__, __LINE__);
401     // }
402     else if (buf[129] != ' ')
403 dashley 71 {
404 dashley 75 //130th character is ' '. Need to figure out why sometimes space and sometimes '*'.
405     fatal("130th hash line character must be \" \".", __FILE__, __LINE__);
406 dashley 71 }
407    
408     //There must be a 131'st character. Beyond that, we can't qualify, because filenames may
409     //have odd characters and may be of any length.
410     if (nchars < 131)
411     {
412     fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
413     }
414    
415     //Copy the hash to the caller's area. The terminator has already been inserted.
416     memcpy(&(hash_rec->hash[0]), buf, 128);
417    
418     //Allocate space for the filename.
419     hash_rec->fname = w_malloc(strlen(buf+130) + 1);
420    
421     //Make the copy.
422     strcpy(hash_rec->fname, buf+130);
423    
424     if (eoffound)
425     *rcode = 0;
426     else
427     *rcode = 1;
428     }
429     //----------------------------------------------------------------------------------------------------
430     void parseinputfile(tFileHashRecord **parsed_recs, unsigned *count, char *fname)
431     {
432     FILE *s;
433     int rcode;
434    
435     //Try to open the file for reading. Inability is a failure.
436     s = fopen(fname, "r");
437     if (!s)
438     {
439     fatal("Hash file open failure.", __FILE__, __LINE__);
440     }
441    
442     //Start off with a count of 0 and a NULL pointer.
443     *count = 0;
444     *parsed_recs = NULL;
445    
446     do
447     {
448     //For the first time, allocate space for one record. Beyond that,
449     //expand it.
450     if (! *parsed_recs)
451     {
452     *parsed_recs = w_malloc(sizeof(tFileHashRecord));
453     }
454     else
455     {
456     *parsed_recs = w_realloc(*parsed_recs, (size_t)((*count + 1)) * sizeof(tFileHashRecord));
457     }
458    
459     //Parse and fill in the space.
460     get_sha512file_line(s, &rcode, (*parsed_recs) + (*count));
461    
462     //We now have one more.
463     (*count)++;
464     } while(rcode == 1);
465    
466     //Try to close the file. Inability is a failure.
467     if (fclose(s))
468     {
469     fatal("Hash file close failure.", __FILE__, __LINE__);
470     }
471     }
472     //----------------------------------------------------------------------------------------------------
473     int sortcmpascendinghash(const void *p0_in, const void *p1_in)
474     {
475     const tFileHashRecord *p0, *p1;
476    
477     p0 = p0_in;
478     p1 = p1_in;
479    
480     return(strcmp(p0->hash, p1->hash));
481     }
482    
483     //----------------------------------------------------------------------------------------------------
484     void sortinternaldsbyhash(tFileHashRecord *parsed_recs, unsigned count)
485     {
486     qsort(parsed_recs, count, sizeof(tFileHashRecord), sortcmpascendinghash);
487     }
488     //----------------------------------------------------------------------------------------------------
489     int sortcmpascendingfname(const void *p0_in, const void *p1_in)
490     {
491     const tFileHashRecord *p0, *p1;
492    
493     p0 = p0_in;
494     p1 = p1_in;
495    
496     return(strcmp(p0->fname, p1->fname));
497     }
498     //----------------------------------------------------------------------------------------------------
499     //This sort has to be run after the hash sort. Within groups of identical hashes, it sorts by
500     //ascending filename.
501     void sortinternalgroupfname(tFileHashRecord *parsed_recs, unsigned count)
502     {
503     unsigned ui;
504     unsigned i_group_min, i_group_max;
505    
506     if (! count)
507     return;
508    
509     i_group_min = 0;
510     i_group_max = 0;
511    
512     do
513     {
514     //Advance i_group_max to the end of the group of duplicates.
515     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
516     {
517     i_group_max++;
518     }
519    
520     if (i_group_min != i_group_max)
521     {
522     //Sort the internal group.
523     qsort(parsed_recs + i_group_min,
524     i_group_max - i_group_min + 1,
525     sizeof(tFileHashRecord),
526     sortcmpascendingfname);
527     }
528    
529     //On to the next group.
530     i_group_max++;
531     i_group_min = i_group_max;
532    
533     } while (i_group_max < (count - 1));
534     }
535     //----------------------------------------------------------------------------------------------------
536     void printsinglerecord(tFileHashRecord *rec, unsigned elno)
537     {
538     printf("[%9u]\n", elno);
539     printf("Hash : %s\n", rec->hash);
540     printf("Filename : %s\n", rec->fname);
541     stdout_hline();
542     }
543     //----------------------------------------------------------------------------------------------------
544     void printinternalds(tFileHashRecord *parsed_recs, unsigned count)
545     {
546     unsigned i;
547    
548     for (i=0; i<count; i++)
549     {
550     printsinglerecord(parsed_recs + i, i);
551     }
552     }
553     //----------------------------------------------------------------------------------------------------
554     void gather_dup_stats(tFileHashRecord *parsed_recs, unsigned count, unsigned *out_num_dups, unsigned *out_cumulative_dups)
555     {
556     unsigned i_group_min, i_group_max;
557    
558     *out_num_dups = 0;
559     *out_cumulative_dups = 0;
560    
561     if (! count)
562     return;
563    
564     i_group_min = 0;
565     i_group_max = 0;
566    
567     do
568     {
569     //Advance i_group_max to the end of the group of duplicates.
570     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
571     {
572     i_group_max++;
573     }
574    
575     //Log the findings.
576     if (i_group_min != i_group_max)
577     {
578     (*out_num_dups)++;
579     (*out_cumulative_dups) += (i_group_max - i_group_min + 1);
580     }
581    
582     //On to the next group.
583     i_group_max++;
584     i_group_min = i_group_max;
585    
586     } while (i_group_max < (count - 1));
587     }
588     //----------------------------------------------------------------------------------------------------
589     void option_dups(char *fname)
590     {
591     tFileHashRecord *parsed_recs;
592     unsigned count, num_dups, cumulative_dups;
593    
594     parseinputfile(&parsed_recs, &count, fname);
595     //printf("%u records parsed.\n", count);
596     sortinternaldsbyhash(parsed_recs, count);
597     sortinternalgroupfname(parsed_recs, count);
598     printinternalds(parsed_recs, count);
599     stdout_hline();
600     gather_dup_stats(parsed_recs, count, &num_dups, &cumulative_dups);
601     printf("Number of duplicated files : %u\n", num_dups);
602     if (num_dups)
603     {
604     printf("Average number of duplicates: %.2f\n", (double)cumulative_dups/(double)num_dups);
605     }
606     }
607     //----------------------------------------------------------------------------------------------------
608     void option_filterdups(char *fname)
609     {
610     tFileHashRecord *parsed_recs;
611     unsigned dupgroup;
612     unsigned count;
613     unsigned ui;
614     unsigned i_group_min, i_group_max;
615    
616     parseinputfile(&parsed_recs, &count, fname);
617     //printf("%u records parsed.\n", count);
618     sortinternaldsbyhash(parsed_recs, count);
619     sortinternalgroupfname(parsed_recs, count);
620    
621     if (! count)
622     return;
623    
624     dupgroup = 0;
625     i_group_min = 0;
626     i_group_max = 0;
627    
628     do
629     {
630     //Advance i_group_max to the end of the group of duplicates.
631     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
632     {
633     i_group_max++;
634     }
635    
636     //Print the findings.
637     if (i_group_min != i_group_max)
638     {
639     printf("Duplicate group %u:\n", dupgroup);
640     for (ui = i_group_min; ui <= i_group_max; ui++)
641     {
642     printf("%s\n", parsed_recs[ui].fname);
643     }
644    
645     dupgroup++;
646    
647     stdout_hline();
648     }
649    
650     //On to the next group.
651     i_group_max++;
652     i_group_min = i_group_max;
653    
654     } while (i_group_max < (count - 1));
655     }
656     //----------------------------------------------------------------------------------------------------
//Returns 1 if the filename lies within the specified path, or 0 otherwise.  The
//test is purely textual: fname must be longer than path and must begin with the
//characters of path.
//
//fname : filename to test; must not be empty.
//path  : directory prefix; must not be empty and must end with '/'.
//
//Violating either requirement is a fatal error.
int is_path_member(const char *fname, const char *path)
{
    size_t fname_len;
    size_t path_len;

    fname_len = strlen(fname);
    if (fname_len == 0)
    {
        fatal("Zero-length filename.", __FILE__, __LINE__);
    }

    path_len = strlen(path);
    if (path_len == 0)
    {
        fatal("Zero-length path.", __FILE__, __LINE__);
    }

    if (path[path_len - 1] != '/')
    {
        fatal("Paths must canonically end with forward slash character.", __FILE__, __LINE__);
    }

    //A filename that is not longer than the path cannot be inside it.
    if (fname_len <= path_len)
    {
        return 0;
    }

    return (memcmp(fname, path, path_len) == 0) ? 1 : 0;
}
686     //----------------------------------------------------------------------------------------------------
687     void option_dedup(char *fname, char *path, int may_delete, double pause_time)
688     {
689     tFileHashRecord *parsed_recs;
690     unsigned dupgroup;
691     unsigned count;
692     unsigned ui;
693     unsigned within_path;
694     unsigned i_group_min, i_group_max;
695    
696     parseinputfile(&parsed_recs, &count, fname);
697     //printf("%u records parsed.\n", count);
698     sortinternaldsbyhash(parsed_recs, count);
699     sortinternalgroupfname(parsed_recs, count);
700    
701     if (! count)
702     return;
703    
704     dupgroup = 0;
705     i_group_min = 0;
706     i_group_max = 0;
707    
708     do
709     {
710     //Advance i_group_max to the end of the group of duplicates.
711     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
712     {
713     i_group_max++;
714     }
715    
716     //If this is a group of duplicates.
717     if (i_group_min != i_group_max)
718     {
719     //Print the findings.
720     printf("Duplicate group %u:\n", dupgroup);
721     for (ui = i_group_min; ui <= i_group_max; ui++)
722     {
723     printf("%s\n", parsed_recs[ui].fname);
724     }
725    
726     dupgroup++;
727    
728     stdout_hline();
729    
730     //Count how many of the group of duplicates are within the supplied path.
731     within_path = 0;
732     for (ui = i_group_min; ui <= i_group_max; ui++)
733     {
734     if (is_path_member(parsed_recs[ui].fname, path))
735     {
736     within_path++;
737     }
738     }
739    
740     //We have to take different actions based on whether we do or don't have any within path.
741     //If we don't have any, we may delete nothing.
742     if (! within_path)
743     {
744     printf("None of these duplicates in path--taking no action.\n");
745     //stdout_hline();
746     }
747     else
748     {
749     for (ui = i_group_min; ui <= i_group_max; ui++)
750     {
751     if (is_path_member(parsed_recs[ui].fname, path))
752     {
753     printf("Not deleting: %s\n", parsed_recs[ui].fname);
754     }
755     else
756     {
757     printf("Deleting : %s\n", parsed_recs[ui].fname);
758     if (may_delete)
759     {
760     if (! unlink(parsed_recs[ui].fname))
761     {
762     printf(" File deleted (unlinked) successfully.\n");
763     }
764     else
765     {
766     printf(" Failure attempting to delete (unlink) file.\n");
767     }
768     }
769     else
770     {
771     printf(" Dry run only.\n");
772     }
773     }
774    
775     //w_sleep(pause_time);
776     }
777     }
778    
779     stdout_hline();
780     }
781    
782     //On to the next group.
783     i_group_max++;
784     i_group_min = i_group_max;
785    
786     } while (i_group_max < (count - 1));
787     }
788     //----------------------------------------------------------------------------------------------------
789     int main(int argc, char* argv[])
790     {
791     stdout_hline();
792     printf("Execution begins.\n");
793     stdout_hline();
794    
795     if (argc == 1)
796     {
797     }
798     else if ((argc == 3) && (strcmp(argv[1], "ndups") == 0))
799     {
800     option_dups(argv[2]);
801     }
802     else if ((argc == 3) && (strcmp(argv[1], "filterdups") == 0))
803     {
804     option_filterdups(argv[2]);
805     }
806 dashley 75 else if ((argc == 3) && (strcmp(argv[1], "dedup_nopath") == 0))
807 dashley 71 {
808     //option_filterdups(argv[2]);
809     }
810 dashley 75 else if ((argc == 3) && (strcmp(argv[1], "dryrun_nopath") == 0))
811 dashley 71 {
812     //option_filterdups(argv[2]);
813     }
814 dashley 75 else if ((argc == 4) && (strcmp(argv[1], "dedup_preserve_inside") == 0))
815 dashley 71 {
816     option_dedup(argv[2], argv[3], 1, UNLINKPAUSETIME);
817     }
818 dashley 75 else if ((argc == 4) && (strcmp(argv[1], "dryrun_preserve_inside") == 0))
819 dashley 71 {
820     option_dedup(argv[2], argv[3], 0, UNLINKPAUSETIME/10.0);
821     }
822     else
823     {
824     printf("Unrecognized parameter form. Try \"dedup\".\n");
825     }
826    
827     //w_sleep(-3 /* UNLINKPAUSETIME*/ );
828    
829     //stdout_hline();
830     printf("Execution ends.\n");
831     stdout_hline();
832    
833     return 0;
834     }
835     //----------------------------------------------------------------------------------------------------
836    

Properties

Name Value
svn:eol-style native

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25