/[dtapublic]/projs/trunk/projs/20161007_dedup/qdedup.c
ViewVC logotype

Annotation of /projs/trunk/projs/20161007_dedup/qdedup.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 89 - (hide annotations) (download)
Fri Nov 11 04:19:20 2016 UTC (7 years, 4 months ago) by dashley
File MIME type: text/plain
File size: 28467 byte(s)
Changed to accommodate '*' vs. ' ' as 130th character on each line.
1 dashley 71 //----------------------------------------------------------------------------------------------------
2 dashley 88 //$Header$
3     //----------------------------------------------------------------------------------------------------
4 dashley 71 //qdedup.c
5     //----------------------------------------------------------------------------------------------------
6     //Quick and dirty program to eliminate duplicates from a file tree. A file containing the SHA512
7 dashley 74 //hashes of all the files to be considered must already exist, and must be regenerated each time the
8     //underlying files are deleted/added/modified, which means the file must be regenerated after each run
9     //of qdedup. (WARNING: IF YOU DO NOT REGENERATE THE FILE AFTER EACH RUN OF qdedup, YOU WILL
10     //PROBABLY DESTROY DATA. THE MECHANISM WOULD BE THAT THE SHA512 MANIFEST IMPLIES THAT DUPLICATES
11     //EXIST WHEN THEY NO LONGER DO, SO qdedup WILL ERRONEOUSLY DELETE THE LAST COPIES OF FILES.) The
12     //program will eliminate duplicates within a single specified directory or outside a single specified
13     //directory.
14     //
15     //This program will compile and run only on *nix systems and under Cygwin on Windows systems.
16 dashley 71 //----------------------------------------------------------------------------------------------------
17 dashley 74 //Copyright David T. Ashley (dashley@gmail.com), 2016.
18 dashley 71 //----------------------------------------------------------------------------------------------------
19 dashley 74 //Provided under the MIT LICENSE, reproduced immediately below.
20     //----------------------------------------------------------------------------------------------------
21     //Permission is hereby granted, free of charge, to any person obtaining a copy of
22     //this software and associated documentation files (the "Software"), to deal in the
23     //Software without restriction, including without limitation the rights to use,
24     //copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
25     //Software, and to permit persons to whom the Software is furnished to do so,
26     //subject to the following conditions:
27 dashley 71 //
28 dashley 74 //The above copyright notice and this permission notice shall be included in all
29     //copies or substantial portions of the Software.
30 dashley 71 //
31 dashley 74 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32     //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33     //FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34     //AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35     //LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36     //OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37     //SOFTWARE.
38 dashley 71 //----------------------------------------------------------------------------------------------------
39     //All paths in the SHA512 file must be absolute or must be relative to the current working directory
40     //at the time this program is run.
41     //
42 dashley 74 //The recommended method to generate the SHA512 file is using the "-exec" option of the "find"
43     //command, i.e.
44     //
45     // find target_directory -type f -exec sha512sum {} \; >sha512sums.txt
46     //
47 dashley 71 //If any files are deleted by the program, a new SHA512 file must be generated before the program is
48     //run again to delete files. The reason for this restriction is that the program will never knowingly
49     //delete the last copy of a file. If the SHA512 file contains the digests of files that no longer
50     //exist, the program may unknowingly delete the last copies of files (because it believes based on
51     //the SHA512 file that other copies exist when in fact they do not).
52     //
53     //The SHA512 file does not need to be sorted (this program sorts it internally by hash before using it).
54     //
55     //This program is designed to compile and run under Cygwin or *nix only.
56     //
57     //Usage:
58     // qdedup
59     // Prints help information and exits.
60     // qdedup ndups <sha512file>
61     // Prints statistics about the number of duplicates in <sha512file>.
62     // qdedup filterdups <sha512file>
63     // Analyzes duplicates and prints the filenames of groups of duplicates. The output is designed
64     // for hand analysis so that insight can be gained into what duplicates exist and where they
65     // are located.
66 dashley 74 // qdedup dedup_preserve_inside <sha512file> <path>
67 dashley 71 // For each group of duplicates that exists, preserves the duplicates that exist within path
68     // and removes all others. If no copies of the duplicate exist within path, no copies of the
69     // duplicate will be removed.
70 dashley 74 // qdedup dryrun_preserve_inside <sha512file> <path>
71     // Exactly like "dedup_preserve_inside", except that no files will be deleted. Text will be
72     // output to explain what would be deleted by "dedup_preserve_inside".
73     // qdedup dedup_nopath <sha512file>
74 dashley 71 // For each group of duplicates that exists, preserves only the first (the one with the lowest
75     // sort-order filename).
76 dashley 74 // qdedup dryrun_nopath <sha512file>
77     // Exactly like "dedup_nopath", except that no files will be deleted. Text will be
78     // output to explain what would be deleted by "dedup_nopath".
79     // qdedup dedup_preserve_outside <sha512file> <path>
80     // For each group of duplicates that exists, deletes duplicates only from within the specified
81     // path. If any duplicates do not have at least one copy within <path> no instances of the
82     // duplicate are deleted.
83     // qdedup dryrun_preserve_outside <sha512file> <path>
84     // Exactly like "dedup_preserve_outside", except that no files will be deleted. Text will be
85     // output to explain what would be deleted by "dedup_preserve_outside".
86 dashley 71 //----------------------------------------------------------------------------------------------------
87     #include <math.h>
88     #include <stdio.h>
89     #include <stdlib.h>
90     #include <string.h>
91     #include <time.h>
92     #include <unistd.h>
93     //----------------------------------------------------------------------------------------------------
94     #define LINELEN (78) //Number of printable characters in a line.
95     #define MAXLINELEN (2000) //The maximum number of characters that may be in a line of the
96     //SHA512 input file. This count includes the \0 terminator, so only
97     //this value minus 1 characters may be in a line.
98     #define UNLINKPAUSETIME (0.1) //Number of seconds to pause between file unlinks (deletions). This
99     //is designed to give the user time to abort the program if desired
100     //before catastrophic quantities of files are deleted.
101     //----------------------------------------------------------------------------------------------------
102     //Data structure that holds the character representation of an SHA512 hash, plus the specified
103     //filename.
//One record of the SHA512 manifest: the digest text and the filename it belongs to.
typedef struct
{
   char hash[128 + 1];
      //Digest as hex text: 512 bits / 4 bits per hex digit = 128 characters, plus one
      //character for the '\0' terminator.
   char *fname;
      //Filename exactly as given in the manifest line. The storage comes from the malloc()
      //family.
} tFileHashRecord;
111     //----------------------------------------------------------------------------------------------------
112     //----------------------------------------------------------------------------------------------------
113     //----- CHARACTER CLASSIFICATION FUNCTIONS ---------------------------------------------------------
114     //----------------------------------------------------------------------------------------------------
115     //----------------------------------------------------------------------------------------------------
116     //TRUE if character is part of valid hash.
//Classifies a character of the digest field.
//
//c : character to test.
//
//Returns 1 if c is one of the sixteen lowercase hexadecimal digits ('0'-'9', 'a'-'f'), or 0 for
//anything else. Uppercase 'A'-'F' are rejected.
int is_valid_hash_char(char c)
{
   static const char hex_digits[] = "0123456789abcdef";

   //Search exactly the 16 digit characters. The array's '\0' terminator is deliberately left
   //out of the search so that a '\0' argument is not reported as valid.
   if (memchr(hex_digits, c, sizeof(hex_digits) - 1) != NULL)
   {
      return 1;
   }

   return 0;
}
144     //----------------------------------------------------------------------------------------------------
145     //TRUE if character is part of newline sequence
//Classifies a character as belonging to a line terminator.
//
//c : character to test.
//
//Returns 1 if c has the value 10 or 13 (line feed or carriage return in ASCII), else 0.
int is_newline_sequence_char(char c)
{
   if ((c == 10) || (c == 13))
   {
      return 1;
   }

   return 0;
}
159    
160     //----------------------------------------------------------------------------------------------------
161     //----------------------------------------------------------------------------------------------------
162     //----- FORMATTED OUTPUT FUNCTIONS -----------------------------------------------------------------
163     //----------------------------------------------------------------------------------------------------
164     //----------------------------------------------------------------------------------------------------
165     //Repeats a character to a stream a specified number of times.
166     //
//Writes the same character to a stream several times in a row.
//
//s : destination stream.
//c : character to write.
//n : number of copies to write; 0 writes nothing.
void stream_rep_char(FILE *s, char c, unsigned n)
{
   unsigned remaining;

   for (remaining = n; remaining != 0; remaining--)
   {
      fputc(c, s);
   }
}
174     //----------------------------------------------------------------------------------------------------
175     //Prints a horizontal line to a stream, including the newline.
176     //
177     void stream_hline(FILE *s)
178     {
179     stream_rep_char(s, '-', LINELEN);
180     fprintf(s, "\n");
181     }
182     //----------------------------------------------------------------------------------------------------
183     //Prints a horizontal line to stdout, including the newline.
184     //
//Writes the standard separator line (LINELEN '-' characters plus a newline) to stdout.
void stdout_hline(void)
{
   stream_hline(stdout);
}
190     //----------------------------------------------------------------------------------------------------
191     //----------------------------------------------------------------------------------------------------
192     //----- FATAL ERROR FUNCTIONS ----------------------------------------------------------------------
193     //----------------------------------------------------------------------------------------------------
194     //----------------------------------------------------------------------------------------------------
195     //Errors out fatally.
196     //
//Reports an unrecoverable error on stdout, framed by separator lines, and terminates the
//program with exit status 1. This function does not return.
//
//desc : human-readable description of the error.
//file : name of the source file reporting the error (callers pass __FILE__).
//line : source line reporting the error (callers pass __LINE__).
void fatal(const char *desc, const char *file, unsigned line)
{
   stdout_hline();

   fprintf(stdout, "Fatal error: %s\n", desc);
   fprintf(stdout, "Source file: %s\n", file);
   fprintf(stdout, "Line : %u\n", line);

   stdout_hline();

   exit(1);
}
206     //----------------------------------------------------------------------------------------------------
207     //----------------------------------------------------------------------------------------------------
208     //----- MEMORY ALLOCATION WRAPPERS -----------------------------------------------------------------
209     //----------------------------------------------------------------------------------------------------
210     //----------------------------------------------------------------------------------------------------
211     //malloc() wrapper.
//Allocates zero-filled memory, treating every failure as fatal.
//
//nbytes : number of bytes to allocate. A request for 0 bytes is considered a programming
//         error and terminates the program through fatal().
//
//Returns a pointer to nbytes bytes of zeroed memory. The function never returns NULL: if
//malloc() fails the program is terminated through fatal().
void *w_malloc(size_t nbytes)
{
   void *rv;

   if (!nbytes)
   {
      fatal("Memory allocation request for 0 bytes.", __FILE__, __LINE__);
   }

   rv = malloc(nbytes);

   if (!rv)
   {
      fatal("Out of memory in malloc() request.", __FILE__, __LINE__);
   }

   //Zero out, just for consistency.
   memset(rv, 0, nbytes);

   //Hand the block to the caller. Without this return the caller would receive an
   //indeterminate pointer.
   return rv;
}
231     //----------------------------------------------------------------------------------------------------
232     //realloc() wrapper.
//Resizes a heap block, treating every failure as fatal.
//
//p : block to resize. Must not be NULL; a NULL pointer is considered a programming error and
//    terminates the program through fatal().
//n : new size in bytes. Must not be 0; a request for 0 bytes terminates the program through
//    fatal().
//
//Returns the (possibly moved) block of n bytes. The caller must use the returned pointer in
//place of p. Any bytes beyond the old size are NOT zeroed. The function never returns NULL: if
//realloc() fails the program is terminated through fatal().
void *w_realloc(void *p, size_t n)
{
   void *rv;

   if (!n)
   {
      fatal("Memory reallocation request for 0 bytes.", __FILE__, __LINE__);
   }

   if (!p)
   {
      fatal("Memory reallocation request with NULL pointer.", __FILE__, __LINE__);
   }

   rv = realloc(p, n);

   if (!rv)
   {
      fatal("Out of memory in realloc() request.", __FILE__, __LINE__);
   }

   //Hand the resized block to the caller. Without this return the caller would receive an
   //indeterminate pointer and lose track of the block.
   return rv;
}
254     //----------------------------------------------------------------------------------------------------
255     //----------------------------------------------------------------------------------------------------
256     //----- SLEEP FUNCTIONS ----------------------------------------------------------------------------
257     //----------------------------------------------------------------------------------------------------
258     //----------------------------------------------------------------------------------------------------
259     //Sleep for a time, in seconds.
//Suspends execution for a (possibly fractional) number of seconds using nanosleep().
//
//seconds : time to sleep. Values below 0 or above 3600 terminate the program through fatal().
//
//The return value of nanosleep() is not examined.
void w_sleep(double seconds)
{
   struct timespec t;
   double whole_seconds;

   if (seconds < 0)
   {
      fatal("Sleep for negative time request.", __FILE__, __LINE__);
   }
   else if (seconds > 3600)
   {
      fatal("Sleep for too long request.", __FILE__, __LINE__);
   }

   //Split the request into whole seconds and the remaining fraction expressed in nanoseconds.
   whole_seconds = floor(seconds);
   t.tv_sec = whole_seconds;
   t.tv_nsec = (seconds - whole_seconds) * 1E9;

   nanosleep(&t, NULL);
}
278     //----------------------------------------------------------------------------------------------------
279     //----------------------------------------------------------------------------------------------------
280     //----- SHA512 FIELD READ FUNCTIONS ----------------------------------------------------------------
281     //----------------------------------------------------------------------------------------------------
282     //----------------------------------------------------------------------------------------------------
283     //These functions read in an individual field of a standard SHA512 file generated using application
284     //of the standard sha512sum program.
285     //
286     //*rcode = 1, success.
287     // 0, legal end of file, record assigned.
288     void get_sha512file_line(FILE *s, int *rcode, tFileHashRecord *hash_rec)
289     {
290     unsigned bidx;
291     unsigned nchars;
292     int ic;
293     int exitflag;
294     int eoffound;
295     int eolfound;
296     char c;
297     char buf[MAXLINELEN];
298    
299     //Zero out the buffer. This handles string termination automatically.
300     memset(buf, 0, sizeof(buf));
301    
302     //Read characters into the buffer until either hit EOF, newline, or can't
303     //fill the buffer any longer.
304     eoffound = 0;
305     eolfound = 0;
306     exitflag = 0;
307     bidx = 0;
308     do
309     {
310     ic = fgetc(s);
311     c = ic;
312    
313     if (ic == EOF)
314     {
315     eoffound = 1;
316     eolfound = 0;
317     nchars = bidx;
318     exitflag = 1;
319     }
320     else if (is_newline_sequence_char(c))
321     {
322     eoffound = 0;
323     eolfound = 1;
324     nchars = bidx;
325     exitflag = 1;
326     }
327     else if (bidx >= (MAXLINELEN - 1))
328     {
329     fatal("SHA512 hash file line too long to parse.", __FILE__, __LINE__);
330     }
331     else
332     {
333     buf[bidx] = c;
334     bidx++;
335     exitflag = 0;
336     }
337     } while(! exitflag);
338    
339     //If we encountered a newline, inch past it. We may encounter an EOF.
340     if (eolfound)
341     {
342     exitflag = 0;
343     do
344     {
345     ic = fgetc(s);
346     c = ic;
347    
348     if (ic == EOF)
349     {
350     eoffound = 1;
351     eolfound = 0;
352     exitflag = 1;
353     }
354     else if (is_newline_sequence_char(c))
355     {
356     exitflag = 0;
357     }
358     else
359     {
360     //We hit the next line. Put the character back.
361     eoffound = 0;
362     eolfound = 1;
363     ungetc(ic, s);
364     exitflag = 1;
365     }
366     } while(! exitflag);
367     }
368    
369     //For better or worse, we have a \0-terminated line in the buffer.
370     //
371     //Zero the caller's area. This takes care of the hash terminator as well.
372     memset(hash_rec, 0, sizeof(*hash_rec));
373    
374     //Ensure that we have at least 128 characters, and they are all hex characters.
375     //Otherwise, we can't proceed.
376     if (nchars < 128)
377     {
378     fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
379     }
380     else
381     {
382     for (bidx = 0; bidx < 128; bidx++)
383     {
384     if (! is_valid_hash_char(buf[bidx]))
385     {
386     fatal("Character in SHA512 hash portion of line inconsistent with hash.", __FILE__, __LINE__);
387     }
388     }
389     }
390    
391     //The 129th and 130'th character must be present and must be a space and asterisk, respectively.
392     if (nchars < 130)
393     {
394     fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
395     }
396     else if (buf[128] != ' ')
397     {
398     fatal("129th hash line character must be \" \".", __FILE__, __LINE__);
399     }
400 dashley 89 else if (buf[129] != '*')
401     {
402     fatal("130th hash line character must be \"*\".", __FILE__, __LINE__);
403     }
404     // else if (buf[129] != ' ')
405 dashley 75 // {
406 dashley 89 // //130th character is ' '. Need to figure out why sometimes space and sometimes '*'.
407     // fatal("130th hash line character must be \" \".", __FILE__, __LINE__);
408     // }
409 dashley 71
410     //There must be a 131'st character. Beyond that, we can't qualify, because filenames may
411     //have odd characters and may be of any length.
412     if (nchars < 131)
413     {
414     fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
415     }
416    
417     //Copy the hash to the caller's area. The terminator has already been inserted.
418     memcpy(&(hash_rec->hash[0]), buf, 128);
419    
420     //Allocate space for the filename.
421     hash_rec->fname = w_malloc(strlen(buf+130) + 1);
422    
423     //Make the copy.
424     strcpy(hash_rec->fname, buf+130);
425    
426     if (eoffound)
427     *rcode = 0;
428     else
429     *rcode = 1;
430     }
431     //----------------------------------------------------------------------------------------------------
432     void parseinputfile(tFileHashRecord **parsed_recs, unsigned *count, char *fname)
433     {
434     FILE *s;
435     int rcode;
436    
437     //Try to open the file for reading. Inability is a failure.
438     s = fopen(fname, "r");
439     if (!s)
440     {
441     fatal("Hash file open failure.", __FILE__, __LINE__);
442     }
443    
444     //Start off with a count of 0 and a NULL pointer.
445     *count = 0;
446     *parsed_recs = NULL;
447    
448     do
449     {
450     //For the first time, allocate space for one record. Beyond that,
451     //expand it.
452     if (! *parsed_recs)
453     {
454     *parsed_recs = w_malloc(sizeof(tFileHashRecord));
455     }
456     else
457     {
458     *parsed_recs = w_realloc(*parsed_recs, (size_t)((*count + 1)) * sizeof(tFileHashRecord));
459     }
460    
461     //Parse and fill in the space.
462     get_sha512file_line(s, &rcode, (*parsed_recs) + (*count));
463    
464     //We now have one more.
465     (*count)++;
466     } while(rcode == 1);
467    
468     //Try to close the file. Inability is a failure.
469     if (fclose(s))
470     {
471     fatal("Hash file close failure.", __FILE__, __LINE__);
472     }
473     }
474     //----------------------------------------------------------------------------------------------------
475     int sortcmpascendinghash(const void *p0_in, const void *p1_in)
476     {
477     const tFileHashRecord *p0, *p1;
478    
479     p0 = p0_in;
480     p1 = p1_in;
481    
482     return(strcmp(p0->hash, p1->hash));
483     }
484    
485     //----------------------------------------------------------------------------------------------------
486     void sortinternaldsbyhash(tFileHashRecord *parsed_recs, unsigned count)
487     {
488     qsort(parsed_recs, count, sizeof(tFileHashRecord), sortcmpascendinghash);
489     }
490     //----------------------------------------------------------------------------------------------------
491     int sortcmpascendingfname(const void *p0_in, const void *p1_in)
492     {
493     const tFileHashRecord *p0, *p1;
494    
495     p0 = p0_in;
496     p1 = p1_in;
497    
498     return(strcmp(p0->fname, p1->fname));
499     }
500     //----------------------------------------------------------------------------------------------------
501     //This sort has to be run after the hash sort. Within groups of identical hashes, it sorts by
502     //ascending filename.
503     void sortinternalgroupfname(tFileHashRecord *parsed_recs, unsigned count)
504     {
505     unsigned ui;
506     unsigned i_group_min, i_group_max;
507    
508     if (! count)
509     return;
510    
511     i_group_min = 0;
512     i_group_max = 0;
513    
514     do
515     {
516     //Advance i_group_max to the end of the group of duplicates.
517     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
518     {
519     i_group_max++;
520     }
521    
522     if (i_group_min != i_group_max)
523     {
524     //Sort the internal group.
525     qsort(parsed_recs + i_group_min,
526     i_group_max - i_group_min + 1,
527     sizeof(tFileHashRecord),
528     sortcmpascendingfname);
529     }
530    
531     //On to the next group.
532     i_group_max++;
533     i_group_min = i_group_max;
534    
535     } while (i_group_max < (count - 1));
536     }
537     //----------------------------------------------------------------------------------------------------
538     void printsinglerecord(tFileHashRecord *rec, unsigned elno)
539     {
540     printf("[%9u]\n", elno);
541     printf("Hash : %s\n", rec->hash);
542     printf("Filename : %s\n", rec->fname);
543     stdout_hline();
544     }
545     //----------------------------------------------------------------------------------------------------
546     void printinternalds(tFileHashRecord *parsed_recs, unsigned count)
547     {
548     unsigned i;
549    
550     for (i=0; i<count; i++)
551     {
552     printsinglerecord(parsed_recs + i, i);
553     }
554     }
555     //----------------------------------------------------------------------------------------------------
556     void gather_dup_stats(tFileHashRecord *parsed_recs, unsigned count, unsigned *out_num_dups, unsigned *out_cumulative_dups)
557     {
558     unsigned i_group_min, i_group_max;
559    
560     *out_num_dups = 0;
561     *out_cumulative_dups = 0;
562    
563     if (! count)
564     return;
565    
566     i_group_min = 0;
567     i_group_max = 0;
568    
569     do
570     {
571     //Advance i_group_max to the end of the group of duplicates.
572     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
573     {
574     i_group_max++;
575     }
576    
577     //Log the findings.
578     if (i_group_min != i_group_max)
579     {
580     (*out_num_dups)++;
581     (*out_cumulative_dups) += (i_group_max - i_group_min + 1);
582     }
583    
584     //On to the next group.
585     i_group_max++;
586     i_group_min = i_group_max;
587    
588     } while (i_group_max < (count - 1));
589     }
590     //----------------------------------------------------------------------------------------------------
591     void option_dups(char *fname)
592     {
593     tFileHashRecord *parsed_recs;
594     unsigned count, num_dups, cumulative_dups;
595    
596     parseinputfile(&parsed_recs, &count, fname);
597     //printf("%u records parsed.\n", count);
598     sortinternaldsbyhash(parsed_recs, count);
599     sortinternalgroupfname(parsed_recs, count);
600     printinternalds(parsed_recs, count);
601     stdout_hline();
602     gather_dup_stats(parsed_recs, count, &num_dups, &cumulative_dups);
603     printf("Number of duplicated files : %u\n", num_dups);
604     if (num_dups)
605     {
606     printf("Average number of duplicates: %.2f\n", (double)cumulative_dups/(double)num_dups);
607     }
608     }
609     //----------------------------------------------------------------------------------------------------
610     void option_filterdups(char *fname)
611     {
612     tFileHashRecord *parsed_recs;
613     unsigned dupgroup;
614     unsigned count;
615     unsigned ui;
616     unsigned i_group_min, i_group_max;
617    
618     parseinputfile(&parsed_recs, &count, fname);
619     //printf("%u records parsed.\n", count);
620     sortinternaldsbyhash(parsed_recs, count);
621     sortinternalgroupfname(parsed_recs, count);
622    
623     if (! count)
624     return;
625    
626     dupgroup = 0;
627     i_group_min = 0;
628     i_group_max = 0;
629    
630     do
631     {
632     //Advance i_group_max to the end of the group of duplicates.
633     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
634     {
635     i_group_max++;
636     }
637    
638     //Print the findings.
639     if (i_group_min != i_group_max)
640     {
641     printf("Duplicate group %u:\n", dupgroup);
642     for (ui = i_group_min; ui <= i_group_max; ui++)
643     {
644     printf("%s\n", parsed_recs[ui].fname);
645     }
646    
647     dupgroup++;
648    
649     stdout_hline();
650     }
651    
652     //On to the next group.
653     i_group_max++;
654     i_group_min = i_group_max;
655    
656     } while (i_group_max < (count - 1));
657     }
658     //----------------------------------------------------------------------------------------------------
659     //Returns true if the filename is within the specified path, or false otherwise.
//Decides whether a filename lies within a path by plain prefix comparison: the filename must
//be strictly longer than the path and must begin with exactly the characters of the path.
//No normalization of either string is performed.
//
//fname : filename to test; must not be empty.
//path  : path to test against; must not be empty and must end with '/'.
//
//Returns 1 if fname is within path, 0 otherwise. An empty fname, an empty path, or a path that
//does not end with '/' terminates the program through fatal().
int is_path_member(const char *fname, const char *path)
{
   size_t fname_len;
   size_t path_len;

   fname_len = strlen(fname);
   if (fname_len == 0)
   {
      fatal("Zero-length filename.", __FILE__, __LINE__);
   }

   path_len = strlen(path);
   if (path_len == 0)
   {
      fatal("Zero-length path.", __FILE__, __LINE__);
   }

   if (path[path_len - 1] != '/')
   {
      fatal("Paths must canonically end with forward slash character.", __FILE__, __LINE__);
   }

   if (fname_len <= path_len)
   {
      //Can't be in the path because filename is not longer than path name.
      return 0;
   }

   //The filename is longer than the path, so comparing path_len bytes stays inside both strings.
   return (memcmp(fname, path, path_len) == 0) ? 1 : 0;
}
688     //----------------------------------------------------------------------------------------------------
689     void option_dedup(char *fname, char *path, int may_delete, double pause_time)
690     {
691     tFileHashRecord *parsed_recs;
692     unsigned dupgroup;
693     unsigned count;
694     unsigned ui;
695     unsigned within_path;
696     unsigned i_group_min, i_group_max;
697    
698     parseinputfile(&parsed_recs, &count, fname);
699     //printf("%u records parsed.\n", count);
700     sortinternaldsbyhash(parsed_recs, count);
701     sortinternalgroupfname(parsed_recs, count);
702    
703     if (! count)
704     return;
705    
706     dupgroup = 0;
707     i_group_min = 0;
708     i_group_max = 0;
709    
710     do
711     {
712     //Advance i_group_max to the end of the group of duplicates.
713     while ((i_group_max < (count - 1)) && (! strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
714     {
715     i_group_max++;
716     }
717    
718     //If this is a group of duplicates.
719     if (i_group_min != i_group_max)
720     {
721     //Print the findings.
722     printf("Duplicate group %u:\n", dupgroup);
723     for (ui = i_group_min; ui <= i_group_max; ui++)
724     {
725     printf("%s\n", parsed_recs[ui].fname);
726     }
727    
728     dupgroup++;
729    
730     stdout_hline();
731    
732     //Count how many of the group of duplicates are within the supplied path.
733     within_path = 0;
734     for (ui = i_group_min; ui <= i_group_max; ui++)
735     {
736     if (is_path_member(parsed_recs[ui].fname, path))
737     {
738     within_path++;
739     }
740     }
741    
742     //We have to take different actions based on whether we do or don't have any within path.
743     //If we don't have any, we may delete nothing.
744     if (! within_path)
745     {
746     printf("None of these duplicates in path--taking no action.\n");
747     //stdout_hline();
748     }
749     else
750     {
751     for (ui = i_group_min; ui <= i_group_max; ui++)
752     {
753     if (is_path_member(parsed_recs[ui].fname, path))
754     {
755     printf("Not deleting: %s\n", parsed_recs[ui].fname);
756     }
757     else
758     {
759     printf("Deleting : %s\n", parsed_recs[ui].fname);
760     if (may_delete)
761     {
762     if (! unlink(parsed_recs[ui].fname))
763     {
764     printf(" File deleted (unlinked) successfully.\n");
765     }
766     else
767     {
768     printf(" Failure attempting to delete (unlink) file.\n");
769     }
770     }
771     else
772     {
773     printf(" Dry run only.\n");
774     }
775     }
776    
777     //w_sleep(pause_time);
778     }
779     }
780    
781     stdout_hline();
782     }
783    
784     //On to the next group.
785     i_group_max++;
786     i_group_min = i_group_max;
787    
788     } while (i_group_max < (count - 1));
789     }
790     //----------------------------------------------------------------------------------------------------
791     int main(int argc, char* argv[])
792     {
793     stdout_hline();
794     printf("Execution begins.\n");
795     stdout_hline();
796    
797     if (argc == 1)
798     {
799     }
800     else if ((argc == 3) && (strcmp(argv[1], "ndups") == 0))
801     {
802     option_dups(argv[2]);
803     }
804     else if ((argc == 3) && (strcmp(argv[1], "filterdups") == 0))
805     {
806     option_filterdups(argv[2]);
807     }
808 dashley 75 else if ((argc == 3) && (strcmp(argv[1], "dedup_nopath") == 0))
809 dashley 71 {
810     //option_filterdups(argv[2]);
811     }
812 dashley 75 else if ((argc == 3) && (strcmp(argv[1], "dryrun_nopath") == 0))
813 dashley 71 {
814     //option_filterdups(argv[2]);
815     }
816 dashley 75 else if ((argc == 4) && (strcmp(argv[1], "dedup_preserve_inside") == 0))
817 dashley 71 {
818     option_dedup(argv[2], argv[3], 1, UNLINKPAUSETIME);
819     }
820 dashley 75 else if ((argc == 4) && (strcmp(argv[1], "dryrun_preserve_inside") == 0))
821 dashley 71 {
822     option_dedup(argv[2], argv[3], 0, UNLINKPAUSETIME/10.0);
823     }
824     else
825     {
826     printf("Unrecognized parameter form. Try \"dedup\".\n");
827     }
828    
829     //w_sleep(-3 /* UNLINKPAUSETIME*/ );
830    
831     //stdout_hline();
832     printf("Execution ends.\n");
833     stdout_hline();
834    
835     return 0;
836     }
837     //----------------------------------------------------------------------------------------------------
838    

Properties

Name Value
svn:eol-style native
svn:keywords Header

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25