1 |
dashley |
231 |
//$Header$ |
2 |
|
|
//{211fc600-db2e-4703-bf40-968a2f063b13} |
3 |
|
|
//------------------------------------------------------------------------------------------------- |
4 |
|
|
//Copyright (c) 2018, David T. Ashley |
5 |
|
|
// |
6 |
|
|
//This file is part of "ets_dedup", a program for eliminating duplicate files in a subdirectory |
7 |
|
|
//tree. |
8 |
|
|
// |
9 |
|
|
//This source code and any program in which it is compiled/used is licensed under the MIT License, |
10 |
|
|
//reproduced below. |
11 |
|
|
// |
12 |
|
|
//Permission is hereby granted, free of charge, to any person obtaining a copy of |
13 |
|
|
//this software and associated documentation files(the "Software"), to deal in the |
14 |
|
|
//Software without restriction, including without limitation the rights to use, |
15 |
|
|
//copy, modify, merge, publish, distribute, sublicense, and / or sell copies of the |
16 |
|
|
//Software, and to permit persons to whom the Software is furnished to do so, |
17 |
|
|
//subject to the following conditions : |
18 |
|
|
// |
19 |
|
|
//The above copyright notice and this permission notice shall be included in all |
20 |
|
|
//copies or substantial portions of the Software. |
21 |
|
|
// |
22 |
|
|
//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
23 |
|
|
//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
24 |
|
|
//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
25 |
|
|
//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
26 |
|
|
//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
27 |
|
|
//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
28 |
|
|
//SOFTWARE. |
29 |
|
|
//------------------------------------------------------------------------------------------------- |
30 |
|
|
#include <stdio.h> |
31 |
|
|
|
32 |
dashley |
235 |
//-------------------------------------------------------------------------------------------------
//Short program description, one display line per array element.
//
//NOTE(review): the original was missing the comma after the "...subdirectory and" literal,
//so C adjacent-string-literal concatenation silently merged two intended display lines
//(the array had 3 elements instead of the intended 4).  The comma is restored below.
const char *ets_dedup_description[] =
{
   "ets_dedup (mnemonic: DE-DUPlicate) is a program for identifying and",
   "eliminating duplicate files (by any name) at any depth in a subdirectory and",
   "its children. The most common application for the program would be the",
   "reduction of personal clutter, i.e. duplicate photos and downloads.",
};
39 |
|
|
|
40 |
|
|
//-------------------------------------------------------------------------------------------------
//Usage and option documentation, one display line per array element.
//
//NOTE(review): in the original, seven pairs of adjacent string literals were missing the
//separating comma, so C string-literal concatenation silently merged what were clearly
//intended to be separate display lines.  The missing commas are restored below.  Two typos
//are also fixed: "are tool voluminous" -> "are too voluminous", and mid-sentence "This
//program can take" -> "this program can take" (continuation of the preceding line).
const char *ets_dedup_instructions[] =
{
   "Usage",
   "-----",
   "ets_dedup [-option_1 [ ... [-option_n]]] [--] [pref_dir_1 [ ... [pref_dir_n]]]",
   " If no options are provided, emits full documentation to stdout. If options",
   " are provided, analyzes and optionally deletes duplicate files in the",
   " current working directory and all its children. With large sets of files",
   " this program can take a long time to run (hours), because it calculates",
   " the SHA512 digest of every file.",
   "",
   " ets_dedup is a dangerous program in that it can destroy information (which",
   " file is in which directory is information, and it is possible to destroy",
   " information without deleting the last of a set of duplicate files).",
   " However, ets_dedup is safe in the sense that it will never delete the last",
   " of a set of identical files (this cannot be done using this program,",
   " automatically or manually).",
   "",
   "Options",
   "-------",
   "-report",
   " Analyzes the current working directory and all its children for duplicates,",
   " and writes a full report to the console. The report includes which files",
   " are duplicates, and approximately how much storage space would be saved by",
   " eliminating all duplicates and by eliminating duplicates of individual",
   " files and of subdirectories. The report is voluminous and is typically",
   " redirected to a file.",
   "-dedup_full_auto",
   " Deletes all duplicate files, leaving only one copy (by any name or",
   " extension) of any file. If duplicates are in the same directory, the",
   " first one in alphabetical order is retained. If duplicates are in different",
   " directories, a non-deterministic algorithm is used that tends to leave",
   " larger directories intact while consuming smaller directories.",
   "-dedup_auto_dir_pri",
   " Deletes all duplicate files, leaving only one copy (by any name or",
   " extension). However, in the selection of which duplicates to delete, the",
   " copy in pref_dir_1 is given preference to remain over the copy in",
   " pref_dir_2, ..., over the copy in pref_dir_n, and finally over files in",
   " subdirectories not covered by any of the specified directories. If",
   " multiple copies of a file exist in the highest-priority preferred directory",
   " specified, they are all retained. If there are duplicates that exist only",
   " outside the set of specified preferred directories, none are deleted.",
   "-dedup_auto_dir_equal",
   " The specified directories are given priority over all directories not",
   " specified (this creates two equivalence classes--the directories specified",
   " and the directories not specified). Duplicates that exist both in at least",
   " one of the specified directories and outside the set of specified",
   " directories have the outside copies deleted. No files within the set of",
   " specified directories are deleted. If a file has copies only outside, no",
   " copies are deleted.",
   "-dedup_manual_interactive",
   " Performs a full analysis, then allows interactive manual operations. The",
   " operations involve descending into and ascending out of directories, and",
   " setting a given directory or file as authoritative (meaning all external",
   " copies will be deleted) or non-authoritative (meaning that duplicates",
   " within the non-authoritative object are not retained).",
   "-dry_run",
   " Provides all information about what would have been deleted, but deletes no",
   " files. This option can be useful for ensuring that the behavior of the",
   " program will be acceptable.",
   "",
   "Limitations",
   "-----------",
   " ( 1) Unicode in path names supplied on the command line and in file and",
   " directory names is not supported.",
   " ( 2) Unicode in file and directory names may or may not be supported.",
   " This depends on technical details of Linux/Unix and Windows that",
   " are too voluminous to include here.",
   " ( 3) The program rebuilds its internal data structures each time it is",
   " run (which involves calculating the SHA512 digest of every file in",
   " the current working directory and its children). This is a very",
   " time-consuming operation. The program does not save any information",
   " between invocations.",
   " ( 4) The program builds all data structures in memory, and so is limited",
   " by the amount of usable memory in the computer system. A reasonable",
   " estimate of memory consumption might be 250 bytes per file to be",
   " analyzed (100 bytes for the path name, 128 bytes for the SHA512",
   " digest, and 22 bytes for other overhead). Assuming 1GB of usable",
   " memory, this gives an upper limit of around 4 million files.",
   " This suggests that the program would be usable for most de-duplication",
   " tasks.",
   " ( 5) The program does not provide information about near duplicates.",
   " The program processes files only in terms of same or different.",
   "",
   "Technical Notes",
   "---------------",
   " ( 1) Although the probability of two files with different contents having",
   " the same SHA512 digest is astronomically small (a hash collision has",
   //NOTE(review): the sentence below is truncated in the original source -- complete it.
   " never been found), the program handles this case by ",
};
131 |
|
|
|
132 |
|
|
|
133 |
dashley |
231 |
//-------------------------------------------------------------------------------------------------
//Portable program entry point (presumably called by a platform-specific main()/wmain()
//wrapper elsewhere in the project -- TODO confirm against the caller).
//
//argc, argv : standard command-line argument count and vector; not yet processed.
//
//Returns 0 unconditionally (success).
int c_main(int argc, char **argv)
{
   //Arguments are not used yet; the casts suppress unused-parameter warnings.
   (void)argc;
   (void)argv;

   printf("Execution begins.\n");

   //TODO(review): de-duplication logic not yet implemented here.

   printf("Execution ends.\n");

   return 0;
}