#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# $Id: deldupes,v 0.4 2009/10/07 18:34:14 stef Exp $
#
# Find duplicates (files with matching MD5 hashes) in a directory and
# optionally delete all except one of the duplicates.
#
# Copyright (c) Stefan Pettersson 2009, http://www.bigpointyteeth.se/
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#

import hashlib
import os
import sys

# Prompt function that works on both Python 2 (raw_input) and Python 3
# (input).  All print calls below take a single parenthesised argument so
# that they behave identically under both interpreters.
try:
    read_line = raw_input
except NameError:
    read_line = input

me = os.path.basename(sys.argv[0])

# Number of bytes read per iteration when hashing, so that large files are
# never loaded into memory in one piece.
CHUNK_SIZE = 1 << 16


def usage():
    """Print a one-line usage summary and exit with status 1."""
    print("Usage: %s FILE ..." % me)
    sys.exit(1)


def getyesno(question):
    """Ask the user a yes/no question.

    The question is repeated until the answer is one of "yes", "y", "no"
    or "n" (case-insensitive, surrounding whitespace ignored).

    Returns True for yes and False for no.
    """
    while True:
        answer = read_line(question + " (yes/no) ").strip().lower()
        if answer in ("yes", "y"):
            return True
        if answer in ("no", "n"):
            return False


def choose(msg, choices):
    """Let the user pick which one of several duplicate files to keep.

    The names in choices are printed in alphabetical order, numbered from
    0, followed by the extra option "(x) all of them".  The prompt msg is
    repeated until the user enters a valid number or "x".

    Returns the chosen filename, or -1 if the user entered "x" (keep all
    of the files).  The caller's list is not modified.
    """
    # sorted() rather than list.sort(): do not reorder the caller's list.
    ordered = sorted(choices)
    for index, name in enumerate(ordered):
        print("(%d) %s" % (index, name))
    print("(x) all of them")

    while True:
        answer = read_line(msg + " ").strip()
        try:
            number = int(answer)
        except ValueError:
            if answer == "x":
                return -1
            continue
        if 0 <= number < len(ordered):
            return ordered[number]


def hash_file(path):
    """Return the hexadecimal MD5 digest of the contents of path.

    The file is opened in binary mode (text mode would translate line
    endings on some platforms and cannot be hashed on Python 3) and read
    in CHUNK_SIZE pieces.  Errors from opening or reading propagate to
    the caller.
    """
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        while True:
            chunk = handle.read(CHUNK_SIZE)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicates(filenames):
    """Group the given files by content hash.

    Returns a list of groups; each group is a list of two or more names
    from filenames whose contents have the same MD5 digest.  Groups and
    the names inside them keep the order in which they were given.

    Names that cannot be opened or read (directories, missing files,
    permission problems) are skipped silently: shell globs routinely
    include such entries and they can never be duplicates.
    """
    groups = {}   # digest => list of names with that digest
    order = []    # digests in first-seen order, for stable output
    seen = set()  # resolved paths that have already been hashed

    for name in filenames:
        # The same file reached through two spellings ("a" and "./a") or
        # through a symlink must not be reported as a duplicate of itself;
        # deleting the "other copy" would destroy the only copy.
        real = os.path.realpath(name)
        if real in seen:
            continue
        seen.add(real)

        try:
            digest = hash_file(name)
        except (IOError, OSError):
            continue

        if digest not in groups:
            groups[digest] = []
            order.append(digest)
        groups[digest].append(name)

    return [groups[digest] for digest in order if len(groups[digest]) > 1]


def main(argv=None):
    """Run the interactive duplicate finder.

    argv is the full argument vector (program name first); it defaults to
    sys.argv.  With no file arguments the usage text is printed and the
    process exits with status 1.

    Returns the process exit status: 0 on normal completion, 1 if the
    user aborted (Ctrl-C or end of input) or a file could not be removed.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        usage()

    try:
        # files to delete
        delete = []

        # go through the groups of duplicates and choose which to keep
        for names in find_duplicates(argv[1:]):
            keep = choose("Which one do you want to keep?", names)
            if keep == -1:
                # the user wants to keep all the files
                continue
            delete.extend(name for name in names if name != keep)

        # did we find any duplicates?
        # were any of them chosen for deletion?
        if not delete:
            print("Done. Nothing removed.")
            return 0

        print("")
        # print the files that are to be deleted
        for name in delete:
            print(name)

        if not getyesno("Delete these files?"):
            print("Done. Nothing removed.")
            return 0

        removed = 0
        for name in delete:
            try:
                os.remove(name)
            except OSError as err:
                # Keep going: one undeletable file should not prevent the
                # rest from being removed or the summary from printing.
                sys.stderr.write("%s: cannot remove %s: %s\n"
                                 % (me, name, err))
            else:
                removed += 1

        print("Done. %d files removed." % removed)
        if removed != len(delete):
            return 1
        return 0

    except (KeyboardInterrupt, EOFError):
        # EOFError: input ended (Ctrl-D / closed pipe) while prompting.
        print("\nUser-abort.")
        return 1


if __name__ == "__main__":
    sys.exit(main())

# EOF