FileCmp
Compare Files
  1. #!/usr/bin/python
  2.  
  3. import filecmp
  4. import os
  5.  
  6. #Compare two files
  7. print os.stat('server.py');
  8. print os.stat('server2.py');
  9. print filecmp.cmp('server.py', 'server2.py', shallow = False);
  1. #Compare the assignments submitted by students
  2. #Compare all files from one student with all files from another student
  3. #!/usr/bin/python
  4.  
  5. import filecmp
  6. import os
  7. import sys
  8.  
  9. def getFiles(d):
  10. '''Get all files in a folder
  11. '''
  12. files = []
  13. for (dirpath, dirnames, filenames) in os.walk(d):
  14. for f in filenames:
  15. if f[0] != '.':
  16. files.append(os.path.join(dirpath, f));
  17. return files;
  18.  
  19. def getDir(d):
  20. '''Get all directories in a folder
  21. '''
  22. l = os.listdir(d);
  23. return [os.path.join(d, e) for e in l if os.path.isdir(os.path.join(d,e))];
  24.  
  25. def compareFolders(dir1, dir2):
  26. '''Compare the files in a directory with the files in another directory
  27. '''
  28. files_1 = getFiles(dir1);
  29. files_2 = getFiles(dir2);
  30. for f1 in files_1:
  31. for f2 in files_2:
  32. if filecmp.cmp(f1, f2, shallow = False):
  33. print f1, f2
  34.  
  35. if __name__ == '__main__':
  36. if len(sys.argv) != 2:
  37. print 'Usage: python comparison.py dir'
  38. sys.exit(1);
  39.  
  40. #Get all folders
  41. folders = getDir(sys.argv[1]);
  42.  
  43. for index, d1 in enumerate(folders):
  44. for d2 in range(index+1, len(folders)):
  45. compareFolders(d1, folders[d2]);
Check Similarity
  1. #Compare the assignments submitted by students
  2. #Compare all files from one student with all files from another student, and report file pairs that have high similarity
  3. #!/usr/bin/python
  4.  
  5. import filecmp
  6. import os
  7. import sys
  8. from sklearn.feature_extraction.text import TfidfVectorizer
  9.  
  10. def getFiles(d):
  11. '''Get all files in a folder
  12. '''
  13. files = []
  14. for (dirpath, dirnames, filenames) in os.walk(d):
  15. for f in filenames:
  16. if f[0] != '.':
  17. files.append(os.path.join(dirpath, f));
  18. return files;
  19.  
  20. def getDir(d):
  21. '''Get all directories in a folder
  22. '''
  23. l = os.listdir(d);
  24. return [os.path.join(d, e) for e in l if os.path.isdir(os.path.join(d,e))];
  25.  
  26. def compareFiles(f1, f2):
  27. '''Check the similarity of a pair of files
  28. '''
  29. with open(f1, 'r') as content_file:
  30. c1= content_file.read()
  31. with open(f2, 'r') as content_file:
  32. c2= content_file.read()
  33. documents = [c1, c2];
  34. tfidf = TfidfVectorizer().fit_transform(documents)
  35. pairwise_similarity = tfidf * tfidf.T
  36. return pairwise_similarity[0, 1]
  37.  
  38. def compareFolders(dir1, dir2):
  39. '''Compare the files in a directory with the files in another directory
  40. '''
  41. files_1 = getFiles(dir1);
  42. files_2 = getFiles(dir2);
  43. for f1 in files_1:
  44. for f2 in files_2:
  45. similarity = compareFiles(f1, f2);
  46. if similarity > 0.8:
  47. print f1, f2, similarity
  48.  
  49. if __name__ == '__main__':
  50. if len(sys.argv) != 2:
  51. print 'Usage: python comparison.py dir'
  52. sys.exit(1);
  53.  
  54. #Get all folders
  55. folders = getDir(sys.argv[1]);
  56.  
  57. for index, d1 in enumerate(folders):
  58. for d2 in range(index+1, len(folders)):
  59. compareFolders(d1, folders[d2]);
Compare Directories
  1. #import filecmp
  2.  
  3. dc = filecmp.dircmp(dir_left, dir_right);
  4. print dc.left_list # terms in left directory
  5. print dc.right_list # terms in right directory
  6. print dc.left_only # terms in left directory only
  7. print dc.right_only # terms in right directory only
  8. print dc.common_files # files in both directories
  9. print dc.diff_files # same file name but different content, compared with os.state() only
  10. print dc.same_files # same file name and same content
Reference