#!/usr/bin/env python
#coding:utf8

import os
import pickle

from hashlib import md5
from collections import defaultdict
from pprint import pprint
from random import choice

# File extensions (lowercase, with leading dot) that get_files() treats as images.
IMG_EXTS = ['.jpg', '.gif', '.jpeg', '.png']
# Pickle file in which the md5 -> list-of-paths mapping is saved and reloaded.
OBJ_FILE = 'obj.pickle'

# Root directory that is walked recursively for image files.
IMG_PATH = r'E:\creatism_data\beautiful people'

def rm(path):
    """Print a notice and delete the file at *path* from disk."""
    notice = 'remove %s' % path
    print(notice)
    # This is the call that actually performs the deletion.
    os.remove(path)

def remove_dup(dup_list):
    """Keep one randomly chosen file from *dup_list* and delete the others.

    The list is updated in place: on return it contains only the path of the
    file that was kept. Previously it was left holding the deleted paths, so
    a caller that saved the list and processed it again later would try to
    delete files that no longer exist.

    Args:
        dup_list: list of paths to files considered duplicates of each other.
            An empty list is accepted and left untouched.
    """
    if not dup_list:
        # random.choice raises IndexError on an empty sequence.
        return

    keep = choice(dup_list)
    print('Keep %s' % keep)

    # Compare by value so a path that appears more than once is never
    # deleted when it is also the one chosen to be kept.
    doomed = [path for path in dup_list if path != keep]
    for path in doomed:
        rm(path)

    # Leave only the surviving file in the caller's list.
    dup_list[:] = [keep]

def store_obj(obj, fp=OBJ_FILE):
    """Pickle *obj* into the file *fp*, printing progress before and after.

    Args:
        obj: the object to serialize.
        fp: destination path; defaults to OBJ_FILE. The file is opened in
            binary write mode, so existing content is replaced.
    """
    announcement = 'Dump obj to %s' % fp
    print(announcement)
    with open(fp, 'wb') as sink:
        pickle.dump(obj, sink)
    print('Done')

def calc_md5(fp):
    """Return the hexadecimal MD5 digest of the file at *fp*.

    The file is opened in binary mode and fed to the hash in 4 KiB pieces,
    so it is never read into memory in one go.
    """
    block_size = 4 * 1024
    digest = md5()
    with open(fp, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''
        # (end of file).
        for piece in iter(lambda: stream.read(block_size), b''):
            digest.update(piece)
    return digest.hexdigest()

def get_files(path):
    """Yield the full path of every image file found under *path*.

    The directory tree is walked recursively with os.walk. A file counts as
    an image when its extension, lowercased, is listed in IMG_EXTS.
    """
    for folder, _subdirs, names in os.walk(path):
        for name in names:
            extension = os.path.splitext(name)[1]
            if extension.lower() not in IMG_EXTS:
                continue
            yield os.path.join(folder, name)

def get_obj():
    """Return the mapping of md5 digest -> list of image paths.

    If OBJ_FILE exists, the mapping is unpickled from it and returned as-is;
    nothing is rescanned. Otherwise every image under IMG_PATH is hashed and
    grouped by digest into a new defaultdict(list).
    """
    if not os.path.exists(OBJ_FILE):
        print('Calculating all the image files md5 value ...')
        groups = defaultdict(list)
        for image in get_files(IMG_PATH):
            digest = calc_md5(image)
            groups[digest].append(image)
        print('Done')
        return groups

    print('Obj file exists, we can get result from that :)')
    with open(OBJ_FILE, 'rb') as cache:
        return pickle.load(cache)

def main():
    """Delete duplicate images and save the resulting mapping.

    Obtains the digest -> paths mapping from get_obj(), passes every group
    with two or more paths to remove_dup(), then writes the mapping out with
    store_obj().
    """
    obj = get_obj()
    for paths in obj.values():
        if len(paths) < 2:
            # A single file has no duplicates to remove.
            continue
        remove_dup(paths)

    store_obj(obj)

# Run the duplicate cleanup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
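# NOTE: the lines below are leftover text from the web page this script was
# copied from; they are not part of the program.
# Logo
#
# CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!
#
# 更多推荐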