/[dtapublic]/projs/dtats/trunk/projs/2016/20161007_ddeedduupp/win/ddeedduupp/ddeedduupp.cpp
ViewVC logotype

Contents of /projs/dtats/trunk/projs/2016/20161007_ddeedduupp/win/ddeedduupp/ddeedduupp.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 311 - (show annotations) (download)
Thu Jan 2 00:22:39 2020 UTC (4 years, 2 months ago) by dashley
File size: 28844 byte(s)
Reorganize.
Flesh out ddeedduupp.
1 //$Header$
2 //{580a1622-c0bc-439d-af57-ea4413b6e096}
3 //-------------------------------------------------------------------------------------------------
4 //Copyright (c) 2020, David T. Ashley
5 //
6 //This file is part of "ddeedduupp", a program for identifying and removing duplicate files
7 //from a directory tree.
8 //
9 //This source code and any program in which it is compiled/used is licensed under the MIT License,
10 //reproduced below.
11 //
12 //Permission is hereby granted, free of charge, to any person obtaining a copy of
13 //this software and associated documentation files(the "Software"), to deal in the
14 //Software without restriction, including without limitation the rights to use,
15 //copy, modify, merge, publish, distribute, sublicense, and / or sell copies of the
16 //Software, and to permit persons to whom the Software is furnished to do so,
17 //subject to the following conditions :
18 //
19 //The above copyright notice and this permission notice shall be included in all
20 //copies or substantial portions of the Software.
21 //
22 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 //FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
25 //AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 //LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 //OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 //SOFTWARE.
29 //-------------------------------------------------------------------------------------------------
30 extern "C" int c_main(int argc, char** argv);
31
32 int main(int argc, char** argv)
33 {
34 int rv;
35
36 //Call the C program from C++.
37 rv = c_main(argc, argv);
38
39 return(rv);
40 }
41
42 #if 0
43 // ddeedduupp.cpp : This file contains the 'main' function. Program execution begins and ends there.
44 //
45
46 #include <iostream>
47
int main()
{
    //Write a fixed greeting, followed by a newline, to standard output.
    std::cout << "Hello World!" << '\n';
    return 0;
}
52
53 // Run program: Ctrl + F5 or Debug > Start Without Debugging menu
54 // Debug program: F5 or Debug > Start Debugging menu
55
56 // Tips for Getting Started:
57 // 1. Use the Solution Explorer window to add/manage files
58 // 2. Use the Team Explorer window to connect to source control
59 // 3. Use the Output window to see build output and other messages
60 // 4. Use the Error List window to view errors
61 // 5. Go to Project > Add New Item to create new code files, or Project > Add Existing Item to add existing code files to the project
62 // 6. In the future, to open this project again, go to File > Open > Project and select the .sln file
63 #endif
64
65
66 #if 0
67 //----------------------------------------------------------------------------------------------------
68 //$Header$
69 //----------------------------------------------------------------------------------------------------
70 //qdedup.c
71 //----------------------------------------------------------------------------------------------------
72 //Quick and dirty program to eliminate duplicates from a file tree. A file containing the SHA512
73 //hashes of all the files to be considered must already exist, and must be regenerated each time the
74 //underlying files are deleted/added/modified, which means the file must be regenerated after each run
75 //of qdedup. (WARNING: IF YOU DO NOT REGENERATE THE FILE AFTER EACH RUN OF qdedup, YOU WILL
76 //PROBABLY DESTROY DATA. THE MECHANISM WOULD BE THAT THE SHA512 MANIFEST IMPLIES THAT DUPLICATES
77 //EXIST WHEN THEY NO LONGER DO, SO qdedup WILL ERRONEOUSLY DELETE THE LAST COPIES OF FILES.) The
78 //program will eliminate duplicates within a single specified directory or outside a single specified
79 //directory.
80 //
81 //This program will compile and run only on *nix systems and under Cygwin on Windows systems.
82 //----------------------------------------------------------------------------------------------------
83 //Copyright David T. Ashley (dashley@gmail.com), 2016.
84 //----------------------------------------------------------------------------------------------------
85 //Provided under the MIT LICENSE, reproduced immediately below.
86 //----------------------------------------------------------------------------------------------------
87 //Permission is hereby granted, free of charge, to any person obtaining a copy of
88 //this software and associated documentation files (the "Software"), to deal in the
89 //Software without restriction, including without limitation the rights to use,
90 //copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
91 //Software, and to permit persons to whom the Software is furnished to do so,
92 //subject to the following conditions:
93 //
94 //The above copyright notice and this permission notice shall be included in all
95 //copies or substantial portions of the Software.
96 //
97 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98 //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
99 //FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100 //AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101 //LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
102 //OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
103 //SOFTWARE.
104 //----------------------------------------------------------------------------------------------------
105 //All paths in the SHA512 file must be absolute or must be relative to the current working directory
106 //at the time this program is run.
107 //
108 //The recommended method to generate the SHA512 file is using the "-exec" option of the "find"
109 //command, i.e.
110 //
111 // find target_directory -type f -exec sha512sum {} \; >sha512sums.txt
112 //
113 //If any files are deleted by the program, a new SHA512 file must be generated before the program is
114 //run again to delete files. The reason for this restriction is that the program will never knowingly
115 //delete the last copy of a file. If the SHA512 file contains the digests of files that no longer
116 //exist, the program may unknowingly delete the last copies of files (because it believes based on
117 //the SHA512 file that other copies exist when in fact they do not).
118 //
119 //The SHA512 file does not need to be sorted (this program sorts it internally by hash before using it).
120 //
121 //This program is designed to compile and run under Cygwin or *nix only.
122 //
123 //Usage:
124 // qdedup
125 // Prints help information and exits.
126 // qdedup ndups <sha512file>
127 // Prints statistics about the number of duplicates in <sha512file>.
128 // qdedup filterdups <sha512file>
129 // Analyzes duplicates and prints the filenames of groups of duplicates. The output is designed
130 // for hand analysis so that insight can be gained into what duplicates exist and where they
131 // are located.
132 // qdedup dedup_preserve_inside <sha512file> <path>
133 // For each group of duplicates that exists, preserves the duplicates that exist within path
134 // and removes all others. If no copies of the duplicate exist within path, no copies of the
135 // duplicate will be removed.
136 // qdedup dryrun_preserve_inside <sha512file> <path>
137 // Exactly like "dedup_preserve_inside", except that no files will be deleted. Text will be
138 // output to explain what would be deleted by "dedup_preserve_inside".
139 // qdedup dedup_nopath <sha512file>
140 // For each group of duplicates that exists, preserves only the first (the one with the lowest
141 // sort-order filename).
142 // qdedup dryrun_nopath <sha512file>
143 // Exactly like "dedup_nopath", except that no files will be deleted. Text will be
144 // output to explain what would be deleted by "dedup_nopath".
145 // qdedup dedup_preserve_outside <sha512file> <path>
146 // For each group of duplicates that exists, deletes duplicates only from within the specified
147 // path. If any duplicates do not have at least one copy within <path> no instances of the
148 // duplicate are deleted.
149 // qdedup dryrun_preserve_outside <sha512file> <path>
150 // Exactly like "dedup_preserve_outside", except that no files will be deleted. Text will be
151 // output to explain what would be deleted by "dedup_preserve_outside".
152 //----------------------------------------------------------------------------------------------------
153 #include <math.h>
154 #include <stdio.h>
155 #include <stdlib.h>
156 #include <string.h>
157 #include <time.h>
158 #include <unistd.h>
159 //----------------------------------------------------------------------------------------------------
160 #define LINELEN (78) //Number of printable characters in a line.
161 #define MAXLINELEN (2000) //The maximum number of characters that may be in a line of the
162 //SHA512 input file. This count includes the \0 terminator, so only
163 //this value minus 1 characters may be in a line.
164 #define UNLINKPAUSETIME (0.1) //Number of seconds to pause between file unlinks (deletions). This
165 //is designed to give the user time to abort the program if desired
166 //before catastrophic quantities of files are deleted.
167 //----------------------------------------------------------------------------------------------------
//One entry of the hash file: the character representation of an SHA512 hash together
//with the filename that was listed alongside it.
typedef struct
{
    //512 bits at 4 bits per hexadecimal digit = 128 characters, plus 1 character for
    //the zero terminator.
    char hash[128 + 1];
    //Filename as specified in the file, allocated via malloc() family.
    char* fname;
} tFileHashRecord;
177 //----------------------------------------------------------------------------------------------------
178 //----------------------------------------------------------------------------------------------------
179 //----- CHARACTER CLASSIFICATION FUNCTIONS ---------------------------------------------------------
180 //----------------------------------------------------------------------------------------------------
181 //----------------------------------------------------------------------------------------------------
//Returns 1 if the character may appear in the textual form of a hash, or 0 otherwise.
//Only the decimal digits and the lower-case letters 'a' through 'f' are accepted;
//upper-case hexadecimal digits are not.
int is_valid_hash_char(char c)
{
    static const char accepted[] = "0123456789abcdef";
    const char* p;

    //Walk the table by hand (rather than with strchr()) so that the table's own zero
    //terminator can never be reported as a match.
    for (p = accepted; *p != '\0'; p++)
    {
        if (*p == c)
        {
            return(1);
        }
    }

    return(0);
}
210 //----------------------------------------------------------------------------------------------------
//Returns 1 if the character is part of a newline sequence, i.e. a carriage return
//(code 13) or a line feed (code 10), or 0 otherwise.
int is_newline_sequence_char(char c)
{
    if ((c == 13) || (c == 10))
    {
        return(1);
    }

    return(0);
}
225
226 //----------------------------------------------------------------------------------------------------
227 //----------------------------------------------------------------------------------------------------
228 //----- FORMATTED OUTPUT FUNCTIONS -----------------------------------------------------------------
229 //----------------------------------------------------------------------------------------------------
230 //----------------------------------------------------------------------------------------------------
//Writes the character c to the stream s exactly n times. Nothing is written when n is 0.
//
void stream_rep_char(FILE* s, char c, unsigned n)
{
    unsigned written;

    for (written = 0; written < n; written++)
    {
        fputc(c, s);
    }
}
240 //----------------------------------------------------------------------------------------------------
241 //Prints a horizontal line to a stream, including the newline.
242 //
243 void stream_hline(FILE* s)
244 {
245 stream_rep_char(s, '-', LINELEN);
246 fprintf(s, "\n");
247 }
248 //----------------------------------------------------------------------------------------------------
249 //Prints a horizontal line to a stdout, including the newline.
250 //
251 void stdout_hline(void)
252 {
253 stream_rep_char(stdout, '-', LINELEN);
254 fprintf(stdout, "\n");
255 }
256 //----------------------------------------------------------------------------------------------------
257 //----------------------------------------------------------------------------------------------------
258 //----- FATAL ERROR FUNCTIONS ----------------------------------------------------------------------
259 //----------------------------------------------------------------------------------------------------
260 //----------------------------------------------------------------------------------------------------
//Reports an unrecoverable error on stdout, framed by horizontal lines, and terminates the
//program with exit status 1. This function does not return to its caller.
//
//desc : description of the failure.
//file : name of the source file reporting the failure (callers pass __FILE__).
//line : source line reporting the failure (callers pass __LINE__).
//
void fatal(const char* desc, const char* file, unsigned line)
{
    FILE* const out = stdout;

    stdout_hline();
    fprintf(out, "Fatal error: %s\n", desc);
    fprintf(out, "Source file: %s\n", file);
    fprintf(out, "Line : %u\n", line);
    stdout_hline();

    exit(1);
}
272 //----------------------------------------------------------------------------------------------------
273 //----------------------------------------------------------------------------------------------------
274 //----- MEMORY ALLOCATION WRAPPERS -----------------------------------------------------------------
275 //----------------------------------------------------------------------------------------------------
276 //----------------------------------------------------------------------------------------------------
//malloc() wrapper that either succeeds or terminates the program.
//
//nbytes : number of bytes requested. A request for 0 bytes is a fatal error.
//
//Returns a pointer to nbytes bytes of zero-filled memory. Memory exhaustion is a fatal
//error (reported through fatal(), which exits), so the result never needs to be checked
//for NULL by the caller.
void* w_malloc(size_t nbytes)
{
    void* rv;

    if (!nbytes)
    {
        fatal("Memory allocation request for 0 bytes.", __FILE__, __LINE__);
    }

    rv = malloc(nbytes);

    if (!rv)
    {
        fatal("Out of memory in malloc() request.", __FILE__, __LINE__);
    }

    //Zero out, just for consistency.
    memset(rv, 0, nbytes);

    //Hand the block to the caller. Without this return the function would flow off its
    //end and the caller would use an indeterminate pointer.
    return(rv);
}
297 //----------------------------------------------------------------------------------------------------
//realloc() wrapper that either succeeds or terminates the program.
//
//p : block to resize. Passing NULL is a fatal error (use w_malloc() for a first allocation).
//n : new size in bytes. A request for 0 bytes is a fatal error.
//
//Returns a pointer to the resized block, which may differ from p. Memory exhaustion is a
//fatal error (reported through fatal(), which exits), so the result never needs to be
//checked for NULL by the caller. Unlike w_malloc(), any newly added bytes are NOT zeroed.
void* w_realloc(void* p, size_t n)
{
    void* rv;

    if (!n)
    {
        fatal("Memory reallocation request for 0 bytes.", __FILE__, __LINE__);
    }

    if (!p)
    {
        fatal("Memory reallocation request with NULL pointer.", __FILE__, __LINE__);
    }

    rv = realloc(p, n);

    if (!rv)
    {
        fatal("Out of memory in realloc() request.", __FILE__, __LINE__);
    }

    //Hand the (possibly moved) block to the caller. Without this return the function
    //would flow off its end and the caller would use an indeterminate pointer.
    return(rv);
}
320 //----------------------------------------------------------------------------------------------------
321 //----------------------------------------------------------------------------------------------------
322 //----- SLEEP FUNCTIONS ----------------------------------------------------------------------------
323 //----------------------------------------------------------------------------------------------------
324 //----------------------------------------------------------------------------------------------------
//Suspends the program for the given time in seconds (fractional values allowed) by way
//of nanosleep(). A negative time, or a time above 3600 seconds, is a fatal error.
//
//The result of nanosleep() is not examined, so a sleep that ends early is not resumed.
void w_sleep(double seconds)
{
    struct timespec request;
    double whole_seconds;

    if (seconds < 0)
    {
        fatal("Sleep for negative time request.", __FILE__, __LINE__);
    }
    else if (seconds > 3600)
    {
        fatal("Sleep for too long request.", __FILE__, __LINE__);
    }

    //Split the time into whole seconds and the remaining fraction in nanoseconds.
    whole_seconds = floor(seconds);
    request.tv_sec = whole_seconds;
    request.tv_nsec = (seconds - whole_seconds) * 1E9;

    nanosleep(&request, NULL);
}
344 //----------------------------------------------------------------------------------------------------
345 //----------------------------------------------------------------------------------------------------
346 //----- SHA512 FIELD READ FUNCTIONS ----------------------------------------------------------------
347 //----------------------------------------------------------------------------------------------------
348 //----------------------------------------------------------------------------------------------------
349 //These functions read in an individual field of a standard SHA512 file generated using application
350 //of the standard sha512sum program.
351 //
352 //*rcode = 1, success.
353 // 0, legal end of file, record assigned.
354 void get_sha512file_line(FILE* s, int* rcode, tFileHashRecord* hash_rec)
355 {
356 unsigned bidx;
357 unsigned nchars;
358 int ic;
359 int exitflag;
360 int eoffound;
361 int eolfound;
362 char c;
363 char buf[MAXLINELEN];
364
365 //Zero out the buffer. This handles string termination automatically.
366 memset(buf, 0, sizeof(buf));
367
368 //Read characters into the buffer until either hit EOF, newline, or can't
369 //fill the buffer any longer.
370 eoffound = 0;
371 eolfound = 0;
372 exitflag = 0;
373 bidx = 0;
374 do
375 {
376 ic = fgetc(s);
377 c = ic;
378
379 if (ic == EOF)
380 {
381 eoffound = 1;
382 eolfound = 0;
383 nchars = bidx;
384 exitflag = 1;
385 }
386 else if (is_newline_sequence_char(c))
387 {
388 eoffound = 0;
389 eolfound = 1;
390 nchars = bidx;
391 exitflag = 1;
392 }
393 else if (bidx >= (MAXLINELEN - 1))
394 {
395 fatal("SHA512 hash file line too long to parse.", __FILE__, __LINE__);
396 }
397 else
398 {
399 buf[bidx] = c;
400 bidx++;
401 exitflag = 0;
402 }
403 } while (!exitflag);
404
405 //If we encountered a newline, inch past it. We may encounter an EOF.
406 if (eolfound)
407 {
408 exitflag = 0;
409 do
410 {
411 ic = fgetc(s);
412 c = ic;
413
414 if (ic == EOF)
415 {
416 eoffound = 1;
417 eolfound = 0;
418 exitflag = 1;
419 }
420 else if (is_newline_sequence_char(c))
421 {
422 exitflag = 0;
423 }
424 else
425 {
426 //We hit the next line. Put the character back.
427 eoffound = 0;
428 eolfound = 1;
429 ungetc(ic, s);
430 exitflag = 1;
431 }
432 } while (!exitflag);
433 }
434
435 //For better or worse, we have a \0-terminated line in the buffer.
436 //
437 //Zero the caller's area. This takes care of the hash terminator as well.
438 memset(hash_rec, 0, sizeof(*hash_rec));
439
440 //Ensure that we have at least 128 characters, and they are all hex characters.
441 //Otherwise, we can't proceed.
442 if (nchars < 128)
443 {
444 fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
445 }
446 else
447 {
448 for (bidx = 0; bidx < 128; bidx++)
449 {
450 if (!is_valid_hash_char(buf[bidx]))
451 {
452 fatal("Character in SHA512 hash portion of line inconsistent with hash.", __FILE__, __LINE__);
453 }
454 }
455 }
456
457 //The 129th and 130'th character must be present and must be a space and asterisk, respectively.
458 if (nchars < 130)
459 {
460 fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
461 }
462 else if (buf[128] != ' ')
463 {
464 fatal("129th hash line character must be \" \".", __FILE__, __LINE__);
465 }
466 else if (buf[129] != '*')
467 {
468 fatal("130th hash line character must be \"*\".", __FILE__, __LINE__);
469 }
470 // else if (buf[129] != ' ')
471 // {
472 // //130th character is ' '. Need to figure out why sometimes space and sometimes '*'.
473 // fatal("130th hash line character must be \" \".", __FILE__, __LINE__);
474 // }
475
476 //There must be a 131'st character. Beyond that, we can't qualify, because filenames may
477 //have odd characters and may be of any length.
478 if (nchars < 131)
479 {
480 fatal("SHA512 hash file line too short.", __FILE__, __LINE__);
481 }
482
483 //Copy the hash to the caller's area. The terminator has already been inserted.
484 memcpy(&(hash_rec->hash[0]), buf, 128);
485
486 //Allocate space for the filename.
487 hash_rec->fname = w_malloc(strlen(buf + 130) + 1);
488
489 //Make the copy.
490 strcpy(hash_rec->fname, buf + 130);
491
492 if (eoffound)
493 * rcode = 0;
494 else
495 *rcode = 1;
496 }
497 //----------------------------------------------------------------------------------------------------
498 void parseinputfile(tFileHashRecord** parsed_recs, unsigned* count, char* fname)
499 {
500 FILE* s;
501 int rcode;
502
503 //Try to open the file for reading. Inability is a failure.
504 s = fopen(fname, "r");
505 if (!s)
506 {
507 fatal("Hash file open failure.", __FILE__, __LINE__);
508 }
509
510 //Start off with a count of 0 and a NULL pointer.
511 *count = 0;
512 *parsed_recs = NULL;
513
514 do
515 {
516 //For the first time, allocate space for one record. Beyond that,
517 //expand it.
518 if (!*parsed_recs)
519 {
520 *parsed_recs = w_malloc(sizeof(tFileHashRecord));
521 }
522 else
523 {
524 *parsed_recs = w_realloc(*parsed_recs, (size_t)((*count + 1)) * sizeof(tFileHashRecord));
525 }
526
527 //Parse and fill in the space.
528 get_sha512file_line(s, &rcode, (*parsed_recs) + (*count));
529
530 //We now have one more.
531 (*count)++;
532 } while (rcode == 1);
533
534 //Try to close the file. Inability is a failure.
535 if (fclose(s))
536 {
537 fatal("Hash file close failure.", __FILE__, __LINE__);
538 }
539 }
540 //----------------------------------------------------------------------------------------------------
541 int sortcmpascendinghash(const void* p0_in, const void* p1_in)
542 {
543 const tFileHashRecord* p0, * p1;
544
545 p0 = p0_in;
546 p1 = p1_in;
547
548 return(strcmp(p0->hash, p1->hash));
549 }
550
551 //----------------------------------------------------------------------------------------------------
552 void sortinternaldsbyhash(tFileHashRecord* parsed_recs, unsigned count)
553 {
554 qsort(parsed_recs, count, sizeof(tFileHashRecord), sortcmpascendinghash);
555 }
556 //----------------------------------------------------------------------------------------------------
557 int sortcmpascendingfname(const void* p0_in, const void* p1_in)
558 {
559 const tFileHashRecord* p0, * p1;
560
561 p0 = p0_in;
562 p1 = p1_in;
563
564 return(strcmp(p0->fname, p1->fname));
565 }
566 //----------------------------------------------------------------------------------------------------
567 //This sort has to be run after the hash sort. Within groups of identical hashes, it sorts by
568 //ascending filename.
569 void sortinternalgroupfname(tFileHashRecord* parsed_recs, unsigned count)
570 {
571 unsigned ui;
572 unsigned i_group_min, i_group_max;
573
574 if (!count)
575 return;
576
577 i_group_min = 0;
578 i_group_max = 0;
579
580 do
581 {
582 //Advance i_group_max to the end of the group of duplicates.
583 while ((i_group_max < (count - 1)) && (!strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
584 {
585 i_group_max++;
586 }
587
588 if (i_group_min != i_group_max)
589 {
590 //Sort the internal group.
591 qsort(parsed_recs + i_group_min,
592 i_group_max - i_group_min + 1,
593 sizeof(tFileHashRecord),
594 sortcmpascendingfname);
595 }
596
597 //On to the next group.
598 i_group_max++;
599 i_group_min = i_group_max;
600
601 } while (i_group_max < (count - 1));
602 }
603 //----------------------------------------------------------------------------------------------------
604 void printsinglerecord(tFileHashRecord* rec, unsigned elno)
605 {
606 printf("[%9u]\n", elno);
607 printf("Hash : %s\n", rec->hash);
608 printf("Filename : %s\n", rec->fname);
609 stdout_hline();
610 }
611 //----------------------------------------------------------------------------------------------------
612 void printinternalds(tFileHashRecord* parsed_recs, unsigned count)
613 {
614 unsigned i;
615
616 for (i = 0; i < count; i++)
617 {
618 printsinglerecord(parsed_recs + i, i);
619 }
620 }
621 //----------------------------------------------------------------------------------------------------
622 void gather_dup_stats(tFileHashRecord* parsed_recs, unsigned count, unsigned* out_num_dups, unsigned* out_cumulative_dups)
623 {
624 unsigned i_group_min, i_group_max;
625
626 *out_num_dups = 0;
627 *out_cumulative_dups = 0;
628
629 if (!count)
630 return;
631
632 i_group_min = 0;
633 i_group_max = 0;
634
635 do
636 {
637 //Advance i_group_max to the end of the group of duplicates.
638 while ((i_group_max < (count - 1)) && (!strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
639 {
640 i_group_max++;
641 }
642
643 //Log the findings.
644 if (i_group_min != i_group_max)
645 {
646 (*out_num_dups)++;
647 (*out_cumulative_dups) += (i_group_max - i_group_min + 1);
648 }
649
650 //On to the next group.
651 i_group_max++;
652 i_group_min = i_group_max;
653
654 } while (i_group_max < (count - 1));
655 }
656 //----------------------------------------------------------------------------------------------------
657 void option_dups(char* fname)
658 {
659 tFileHashRecord* parsed_recs;
660 unsigned count, num_dups, cumulative_dups;
661
662 parseinputfile(&parsed_recs, &count, fname);
663 //printf("%u records parsed.\n", count);
664 sortinternaldsbyhash(parsed_recs, count);
665 sortinternalgroupfname(parsed_recs, count);
666 printinternalds(parsed_recs, count);
667 stdout_hline();
668 gather_dup_stats(parsed_recs, count, &num_dups, &cumulative_dups);
669 printf("Number of duplicated files : %u\n", num_dups);
670 if (num_dups)
671 {
672 printf("Average number of duplicates: %.2f\n", (double)cumulative_dups / (double)num_dups);
673 }
674 }
675 //----------------------------------------------------------------------------------------------------
676 void option_filterdups(char* fname)
677 {
678 tFileHashRecord* parsed_recs;
679 unsigned dupgroup;
680 unsigned count;
681 unsigned ui;
682 unsigned i_group_min, i_group_max;
683
684 parseinputfile(&parsed_recs, &count, fname);
685 //printf("%u records parsed.\n", count);
686 sortinternaldsbyhash(parsed_recs, count);
687 sortinternalgroupfname(parsed_recs, count);
688
689 if (!count)
690 return;
691
692 dupgroup = 0;
693 i_group_min = 0;
694 i_group_max = 0;
695
696 do
697 {
698 //Advance i_group_max to the end of the group of duplicates.
699 while ((i_group_max < (count - 1)) && (!strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
700 {
701 i_group_max++;
702 }
703
704 //Print the findings.
705 if (i_group_min != i_group_max)
706 {
707 printf("Duplicate group %u:\n", dupgroup);
708 for (ui = i_group_min; ui <= i_group_max; ui++)
709 {
710 printf("%s\n", parsed_recs[ui].fname);
711 }
712
713 dupgroup++;
714
715 stdout_hline();
716 }
717
718 //On to the next group.
719 i_group_max++;
720 i_group_min = i_group_max;
721
722 } while (i_group_max < (count - 1));
723 }
724 //----------------------------------------------------------------------------------------------------
//Returns 1 if the filename is within the specified path, or 0 otherwise. Membership is
//decided purely textually: fname must be longer than path and must begin with path.
//
//A zero-length filename, a zero-length path, or a path that does not end with a forward
//slash is a fatal error. The checks below are written one after another, which relies
//on fatal() terminating the program rather than returning.
int is_path_member(const char* fname, const char* path)
{
    size_t fname_len;
    size_t path_len;

    fname_len = strlen(fname);
    if (fname_len == 0)
    {
        fatal("Zero-length filename.", __FILE__, __LINE__);
    }

    path_len = strlen(path);
    if (path_len == 0)
    {
        fatal("Zero-length path.", __FILE__, __LINE__);
    }

    if (path[path_len - 1] != '/')
    {
        fatal("Paths must canonically end with forward slash character.", __FILE__, __LINE__);
    }

    //Can't be in the path if the filename is not longer than the path name.
    if (fname_len <= path_len)
    {
        return 0;
    }

    return (memcmp(fname, path, path_len) == 0) ? 1 : 0;
}
754 //----------------------------------------------------------------------------------------------------
755 void option_dedup(char* fname, char* path, int may_delete, double pause_time)
756 {
757 tFileHashRecord* parsed_recs;
758 unsigned dupgroup;
759 unsigned count;
760 unsigned ui;
761 unsigned within_path;
762 unsigned i_group_min, i_group_max;
763
764 parseinputfile(&parsed_recs, &count, fname);
765 //printf("%u records parsed.\n", count);
766 sortinternaldsbyhash(parsed_recs, count);
767 sortinternalgroupfname(parsed_recs, count);
768
769 if (!count)
770 return;
771
772 dupgroup = 0;
773 i_group_min = 0;
774 i_group_max = 0;
775
776 do
777 {
778 //Advance i_group_max to the end of the group of duplicates.
779 while ((i_group_max < (count - 1)) && (!strcmp(parsed_recs[i_group_min].hash, parsed_recs[i_group_max + 1].hash)))
780 {
781 i_group_max++;
782 }
783
784 //If this is a group of duplicates.
785 if (i_group_min != i_group_max)
786 {
787 //Print the findings.
788 printf("Duplicate group %u:\n", dupgroup);
789 for (ui = i_group_min; ui <= i_group_max; ui++)
790 {
791 printf("%s\n", parsed_recs[ui].fname);
792 }
793
794 dupgroup++;
795
796 stdout_hline();
797
798 //Count how many of the group of duplicates are within the supplied path.
799 within_path = 0;
800 for (ui = i_group_min; ui <= i_group_max; ui++)
801 {
802 if (is_path_member(parsed_recs[ui].fname, path))
803 {
804 within_path++;
805 }
806 }
807
808 //We have to take different actions based on whether we do or don't have any within path.
809 //If we don't have any, we may delete nothing.
810 if (!within_path)
811 {
812 printf("None of these duplicates in path--taking no action.\n");
813 //stdout_hline();
814 }
815 else
816 {
817 for (ui = i_group_min; ui <= i_group_max; ui++)
818 {
819 if (is_path_member(parsed_recs[ui].fname, path))
820 {
821 printf("Not deleting: %s\n", parsed_recs[ui].fname);
822 }
823 else
824 {
825 printf("Deleting : %s\n", parsed_recs[ui].fname);
826 if (may_delete)
827 {
828 if (!unlink(parsed_recs[ui].fname))
829 {
830 printf(" File deleted (unlinked) successfully.\n");
831 }
832 else
833 {
834 printf(" Failure attempting to delete (unlink) file.\n");
835 }
836 }
837 else
838 {
839 printf(" Dry run only.\n");
840 }
841 }
842
843 //w_sleep(pause_time);
844 }
845 }
846
847 stdout_hline();
848 }
849
850 //On to the next group.
851 i_group_max++;
852 i_group_min = i_group_max;
853
854 } while (i_group_max < (count - 1));
855 }
856 //----------------------------------------------------------------------------------------------------
857 int main(int argc, char* argv[])
858 {
859 stdout_hline();
860 printf("Execution begins.\n");
861 stdout_hline();
862
863 if (argc == 1)
864 {
865 }
866 else if ((argc == 3) && (strcmp(argv[1], "ndups") == 0))
867 {
868 option_dups(argv[2]);
869 }
870 else if ((argc == 3) && (strcmp(argv[1], "filterdups") == 0))
871 {
872 option_filterdups(argv[2]);
873 }
874 else if ((argc == 3) && (strcmp(argv[1], "dedup_nopath") == 0))
875 {
876 //option_filterdups(argv[2]);
877 }
878 else if ((argc == 3) && (strcmp(argv[1], "dryrun_nopath") == 0))
879 {
880 //option_filterdups(argv[2]);
881 }
882 else if ((argc == 4) && (strcmp(argv[1], "dedup_preserve_inside") == 0))
883 {
884 option_dedup(argv[2], argv[3], 1, UNLINKPAUSETIME);
885 }
886 else if ((argc == 4) && (strcmp(argv[1], "dryrun_preserve_inside") == 0))
887 {
888 option_dedup(argv[2], argv[3], 0, UNLINKPAUSETIME / 10.0);
889 }
890 else
891 {
892 printf("Unrecognized parameter form. Try \"dedup\".\n");
893 }
894
895 //w_sleep(-3 /* UNLINKPAUSETIME*/ );
896
897 //stdout_hline();
898 printf("Execution ends.\n");
899 stdout_hline();
900
901 return 0;
902 }
903 //----------------------------------------------------------------------------------------------------
904 #endif

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision URL Header

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25