图片名称不同内容相同滤重
#!/usr/bin/env python#coding:utf8import osimport picklefrom hashlib import md5from collections import defaultdictfrom pprint import pprintfrom random import choiceIMG_EXTS = ['.jpg', '.gif...
·
#!/usr/bin/env python
# coding: utf8
#
# Remove image files that have different names but identical content.
#
# Every image below IMG_PATH is hashed with MD5, files sharing a digest are
# grouped together, one file per group is kept and the others are deleted.
# The digest -> paths mapping is cached in OBJ_FILE between runs.
import os
import pickle
from hashlib import md5
from collections import defaultdict
from pprint import pprint
from random import choice

# File extensions (lower case, leading dot) that count as images.
IMG_EXTS = ['.jpg', '.gif', '.jpeg', '.png']
# Pickle file caching the digest -> paths mapping.
OBJ_FILE = 'obj.pickle'
# images path
IMG_PATH = r'E:\creatism_data\beautiful people'


def rm(path):
    """Delete the file at *path*, announcing it on stdout first.

    Raises:
        OSError: if ``os.unlink`` fails (missing file, permissions, ...).
    """
    print('remove %s' % path)
    # Actually perform the deletion.
    os.unlink(path)


def remove_dup(dup_list):
    """Keep one randomly chosen file from *dup_list* and delete the rest.

    Only paths that still exist on disk are considered, so a stale entry
    can neither be picked as the survivor (which would delete every real
    copy) nor make ``os.unlink`` fail on a missing file.

    The list is updated in place to hold just the surviving path (or to be
    empty when none of the paths exists), so that the mapping persisted by
    ``main`` describes the files that are really left on disk.

    Args:
        dup_list: list of paths whose contents are identical.
    """
    candidates = [path for path in dup_list if os.path.exists(path)]
    if not candidates:
        # Nothing left on disk for this group (or the group was empty).
        dup_list[:] = []
        return

    keep = choice(candidates)
    print('Keep %s' % keep)
    for path in candidates:
        # Compare by value so a path listed twice never deletes the keeper.
        if path != keep:
            rm(path)
    dup_list[:] = [keep]


def store_obj(obj, fp=OBJ_FILE):
    """Pickle *obj* into the file *fp*, overwriting any existing file."""
    print('Dump obj to %s' % fp)
    with open(fp, 'wb') as fb:
        pickle.dump(obj, fb)
    print('Done')


def calc_md5(fp):
    """Return the hexadecimal MD5 digest of the file at path *fp*.

    The file is read in 4 KiB chunks so large files are never loaded into
    memory in one piece.
    """
    chunk = 4 * 1024
    m = md5()
    with open(fp, 'rb') as fb:
        # iter() with a sentinel stops at the first empty read (EOF).
        for content in iter(lambda: fb.read(chunk), b''):
            m.update(content)
    return m.hexdigest()


def get_files(path):
    """Yield the path of every image file found below the directory *path*.

    A file counts as an image when its extension, compared case
    insensitively, is listed in ``IMG_EXTS``.
    """
    for root, dirs, files in os.walk(path):
        for fn in files:
            if os.path.splitext(fn)[-1].lower() in IMG_EXTS:
                yield os.path.join(root, fn)


def get_obj(img_path=None, obj_file=None):
    """Return a mapping of MD5 digest -> list of image paths.

    When *obj_file* exists, the mapping is loaded from it and the image
    directory is not scanned. Otherwise every image below *img_path* is
    hashed and grouped by digest.

    Args:
        img_path: directory to scan; defaults to ``IMG_PATH``.
        obj_file: cache file to load from; defaults to ``OBJ_FILE``.

    Returns:
        The unpickled object when the cache exists, otherwise a
        ``defaultdict(list)`` keyed by hex digest.
    """
    if img_path is None:
        img_path = IMG_PATH
    if obj_file is None:
        obj_file = OBJ_FILE

    if os.path.exists(obj_file):
        print('Obj file exists, we can get result from that :)')
        # NOTE(security): pickle.load executes arbitrary code embedded in
        # the file; only use a cache file this script wrote itself.
        # NOTE: the cache is never invalidated here, so files added or
        # modified after it was written are not reflected in the result.
        with open(obj_file, 'rb') as fb:
            return pickle.load(fb)

    print('Calculating all the image files md5 value ...')
    dup_dl = defaultdict(list)
    for fp in get_files(img_path):
        dup_dl[calc_md5(fp)].append(fp)
    print('Done')
    return dup_dl


def main():
    """Delete duplicate images, then persist the updated mapping."""
    obj = get_obj()
    for key, value in obj.items():
        if len(value) >= 2:
            remove_dup(value)
    store_obj(obj)


if __name__ == "__main__":
    main()
更多推荐
已为社区贡献1条内容
所有评论(0)