-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpy_duplicates.py
executable file
·264 lines (239 loc) · 8.56 KB
/
py_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "nagracks"
__date__ = "15-07-2016"
__license__ = "MIT"
__copyright__ = "Copyright © 2016 nagracks"
import argparse
import collections
import hashlib
import os
import shutil
import subprocess
import sys
def get_filesize(filepath):
    """
    Return the size of a file in bytes

    :filepath: path to file
    :returns: int, file size in bytes as reported by os.stat
    """
    stat_result = os.stat(filepath)
    return stat_result.st_size
def get_hash_md5(filepath, chunk_size=65536):
    """
    Get md5 hash of a file

    The file is read in chunks so that large files never have to fit
    in memory at once.

    :filepath: path of file
    :chunk_size: number of bytes read per iteration
    :returns: md5 hash as hexadecimal string, or None when the file
              could not be read completely
    """
    # md5 hash object #
    m = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            # read() returns b'' at end of file, which stops iter() #
            for data in iter(lambda: f.read(chunk_size), b''):
                m.update(data)
    except OSError as e:
        # FileNotFoundError is a subclass of OSError #
        # Never return the digest of a partial or empty read: every #
        # unreadable file would share it and look like a duplicate #
        print(e)
        return None
    # Return hexadecimal digits #
    return m.hexdigest()


def get_filesize_dict(path, filesize_dict):
    """
    Make size:file dictionary, {key=size:value=files}

    Walks path recursively and appends every file found to the list
    stored under its size. A file whose size cannot be determined
    (e.g. a dangling symlink, or a file removed while walking) is
    reported and skipped instead of aborting the whole scan.

    :path: directory to scan
    :filesize_dict: {key=size:value=files}, updated in place
    :returns: the updated filesize_dict
    """
    for base_dir, _dirs, files in os.walk(path):
        for filename in files:
            full_path = os.path.join(base_dir, filename)
            try:
                size = get_filesize(full_path)
            except OSError as e:
                print(e)
                continue
            # setdefault works for plain dicts and defaultdicts alike
            filesize_dict.setdefault(size, []).append(full_path)
    return filesize_dict


def _distinct_files(files):
    """
    Drop paths that resolve to a file already seen, keeping order

    :files: list of file paths
    :returns: list of paths, one per distinct resolved file
    """
    seen = set()
    distinct = []
    for filepath in files:
        real = os.path.realpath(filepath)
        if real in seen:
            continue
        seen.add(real)
        distinct.append(filepath)
    return distinct


def hash_dict_from_filesize_dict(filesize_dict):
    """
    Make hash:file dictionary from filesize:file dictionary

    Note: only includes files which have duplicate filesizes.
    A file listed more than once (overlapping scan paths, or a symlink
    to it) is considered only once, so it is never reported as a
    duplicate of itself. Files that could not be hashed are left out.

    :filesize_dict: dictionary, contains filesize:files
    :returns: hash_file_dict {key=hash, value=list of filepaths}
    """
    hash_file_dict = collections.defaultdict(list)
    for files in filesize_dict.values():
        candidates = _distinct_files(files)
        if len(candidates) < 2:
            continue
        for filepath in candidates:
            file_hash = get_hash_md5(filepath)
            if file_hash is None:
                # unreadable: cannot be compared with anything
                continue
            hash_file_dict[file_hash].append(filepath)
    return hash_file_dict
def print_duplicates(hash_file_dict):
    """
    Print every group of duplicate files, one group per line

    :hash_file_dict: dictionary, contains hash:files
    :returns: None
    """
    for paths in hash_file_dict.values():
        # A single file under a hash has no duplicate #
        if len(paths) <= 1:
            continue
        joined = ', '.join(paths)
        print("Duplicate Files => {}".format(joined))
def summarize_duplicates(hash_file_dict):
    """
    Summarize the result of a duplicate search

    Only groups with more than one file are counted. A group is
    counted as empty when its first file has a size of zero.

    :hash_file_dict: dictionary, contains hash:files
    :returns: dictionary with the keys
              'dupcount' (number of groups of duplicate files),
              'empty' (number of files in empty groups) and
              'duptotal' (number of files over all groups)
    """
    summary = dict.fromkeys(('dupcount', 'empty', 'duptotal'), 0)
    for paths in hash_file_dict.values():
        group_size = len(paths)
        if group_size < 2:
            continue
        summary['dupcount'] += 1
        summary['duptotal'] += group_size
        first_size = os.stat(paths[0]).st_size
        if first_size == 0:
            summary['empty'] += group_size
    return summary
def delete_all_duplicates(hash_file_dict):
    """
    Delete every file that belongs to a group of duplicates

    Note: all copies of a duplicated file are removed, none is kept.

    :hash_file_dict: dictionary, contains hash:files
    :returns: None
    """
    duplicate_groups = (
        paths for paths in hash_file_dict.values() if len(paths) > 1
    )
    for paths in duplicate_groups:
        for path in paths:
            os.remove(path)
    print ("All duplicate files are deleted.")
def _free_destination(dirname, basename):
    """
    Return a path inside dirname for basename that does not exist yet

    When the plain name is taken, "_1", "_2", ... is inserted before
    the extension until an unused name is found.

    :dirname: destination directory
    :basename: wanted file name
    :returns: path of a not yet existing file inside dirname
    """
    candidate = os.path.join(dirname, basename)
    stem, ext = os.path.splitext(basename)
    counter = 1
    while os.path.lexists(candidate):
        candidate = os.path.join(
            dirname, "{}_{}{}".format(stem, counter, ext))
        counter += 1
    return candidate


def move_duplicates(hash_file_dict, dirname):
    """
    Move duplicates to location specified by parameter dirname

    Files that share a base name are stored under distinct names, so
    a moved file never overwrites another one. Files that already
    live in dirname are left where they are.

    :hash_file_dict: dictionary, contains hash:files
    :dirname: location on where to transfer duplicates
    :returns: None
    """
    if not os.path.isdir(dirname):
        print("Invalid Directory {}. Aborted.".format(dirname))
        return
    dest_real = os.path.realpath(dirname)
    for files in hash_file_dict.values():
        if len(files) < 2:
            continue
        for dup in files:
            if os.path.realpath(os.path.dirname(dup) or os.curdir) == dest_real:
                continue
            # shutil.move also works across filesystems, where a plain
            # os.rename fails
            shutil.move(
                dup, _free_destination(dirname, os.path.basename(dup)))
    print ("All duplicate files are moved to {}.".format(dirname))
def open_file(filepath):
    """
    Opens file using default programs

    The path is handed to the opener as a single argument and never
    passes through a shell, so names containing spaces, quotes or
    shell metacharacters are opened as they are and cannot inject
    commands.

    :filepath: filepath of the file to be opened
    :returns: None
    """
    try:
        if sys.platform.startswith('darwin'):  # mac
            subprocess.call(["open", filepath])
        elif os.name == 'nt':  # windows
            # "start" is a cmd.exe builtin; os.startfile is its
            # shell-free equivalent
            os.startfile(filepath)
        elif os.name == 'posix':  # unix
            subprocess.call(["xdg-open", filepath])
    except OSError as e:
        # e.g. the opener program is not installed; os.system would not
        # have raised either, so report and carry on
        print(e)
def _prompt_choice(prompt, choices):
    """
    Ask until the user answers with exactly one character of choices

    :prompt: text shown to the user
    :choices: string of accepted (lower case) characters
    :returns: the chosen character, lower cased
    """
    while True:
        answer = input(prompt).lower()
        if len(answer) == 1 and answer in choices:
            return answer


def interactive_mode(hash_file_dict):
    """
    Interactively go through each of the duplicate files to choose action.

    For every group of duplicates the user may skip the group or take
    action. Interactive actions on individual files include:
        [s]kip the file
        [d]elete the file
        [o]pen the file with the default program
        [r]ename the file inside its directory
        [m]ove the file to another directory

    Renaming or moving never overwrites an existing file; the user is
    told about the conflict and asked for an action again.

    :hash_file_dict: dictionary, contains hash:files
    :returns: None
    """
    print("=" * 80)
    for files in hash_file_dict.values():
        if len(files) < 2:
            continue
        print ("Duplicate Files => {}".format(', '.join(files)))
        if _prompt_choice("[s]kip, take [a]ction > ", "sa") == "s":
            continue

        # action == "a", choose action for each of the duplicates
        for i, dup in enumerate(files):
            print("Duplicate {}: {}".format(i, dup))
            while True:
                action = _prompt_choice(
                    "[s]kip [d]elete [o]pen [r]ename [m]ove > ", "sdorm")
                if action == "s":
                    break
                if action == "d":
                    os.remove(dup)
                    break
                if action == "o":
                    open_file(dup)
                    # after opening, it is assumed that the user
                    # might want to take another action
                    continue
                if action == "r":
                    newname = input("new name > ")
                    target = os.path.join(os.path.dirname(dup), newname)
                    if not newname:
                        print("Empty name. Not renamed.")
                        continue
                    if os.path.lexists(target):
                        # os.rename would silently replace that file
                        print("{} already exists. Not renamed."
                              .format(target))
                        continue
                    os.rename(dup, target)
                    break
                # action == "m"
                while True:
                    destdir = input("directory name > ")
                    if os.path.isdir(destdir):
                        break
                target = os.path.join(destdir, os.path.basename(dup))
                if os.path.lexists(target):
                    print("{} already exists. Not moved.".format(target))
                    continue
                # shutil.move also works across filesystems
                shutil.move(dup, target)
                break
def get_duplicates(paths):
    """
    Build the duplicates dictionary for a list of paths

    All paths are collected into one size:files dictionary first, so
    duplicates are found across the given paths as well.

    :paths: list of paths to scan
    :returns: {key: hash, value: filenames}
    """
    files_by_size = collections.defaultdict(list)
    for root in paths:
        files_by_size = get_filesize_dict(root, files_by_size)
    return hash_dict_from_filesize_dict(files_by_size)
if __name__ == "__main__":
    # Command line interface: one or more paths plus optional actions
    parser = argparse.ArgumentParser(
        description="Python Duplicates - Find duplicates"
    )
    parser.add_argument(
        'paths', nargs='+', help="paths where to find duplicates")

    # Boolean switches, declared in the same order as they are shown
    # in the help output
    switches = (
        ('-d', '--delete', "delete all files with duplicates"),
        ('-i', '--interactive', "interactively manage duplicates"),
        ('-s', '--summary', "display summary of duplicate search"),
    )
    for short_flag, long_flag, help_text in switches:
        parser.add_argument(
            short_flag, long_flag, action='store_true', help=help_text)
    parser.add_argument(
        '-m', '--move', metavar='DIR',
        help="move all duplicates to another directory")

    args = parser.parse_args()

    # The duplicates are always listed first
    duplicates_dict = get_duplicates(args.paths)
    print_duplicates(duplicates_dict)

    # At most one action is carried out; the first one that is set
    # in the order delete, interactive, summary, move wins
    if args.delete:
        delete_all_duplicates(duplicates_dict)
    elif args.interactive:
        interactive_mode(duplicates_dict)
    elif args.summary:
        summary = summarize_duplicates(duplicates_dict)
        report = (
            "{dupcount} files have duplicates, having a total of {duptotal}"
            " duplicate files.\n{empty} files are empty."
        )
        print("** summary **")
        print(report.format(**summary))
    elif args.move:
        move_duplicates(duplicates_dict, args.move)