- #!/usr/bin/python
-
- import filecmp
- import os
-
# Compare two files
# Print each file's os.stat() record, then whether their contents are
# identical. shallow=False makes filecmp.cmp compare the contents instead of
# accepting matching stat signatures as proof of equality.
# print() is called with a single argument so the output is the same on
# Python 2 and Python 3.
print(os.stat('server.py'))
print(os.stat('server2.py'))
print(filecmp.cmp('server.py', 'server2.py', shallow=False))
-
- #Compare the assignments submitted by students
- #Compare all files from one student with all files from another student
- #!/usr/bin/python
-
- import filecmp
- import os
- import sys
-
def getFiles(d):
    '''Get all non-hidden files in a folder, searching it recursively.

    d: path of the directory to walk.

    Returns a sorted list of paths (each built by joining the walked
    directory path and the file name). Files whose name starts with '.' are
    skipped; hidden directories are still descended into. A path that does
    not exist yields an empty list because os.walk ignores listing errors
    by default.
    '''
    files = []
    for dirpath, _dirnames, filenames in os.walk(d):
        for name in filenames:
            if not name.startswith('.'):
                files.append(os.path.join(dirpath, name))
    # Sort so the result does not depend on the order the filesystem
    # happens to list entries in.
    return sorted(files)
-
def getDir(d):
    '''Get all directories directly inside a folder.

    d: path of the directory to list (not searched recursively).

    Returns a sorted list of paths, each being d joined with the name of
    one immediate subdirectory. Entries that are not directories are left
    out. os.listdir raises OSError if d cannot be listed.
    '''
    # Build each joined path once and reuse it for both the test and the
    # result.
    candidates = [os.path.join(d, entry) for entry in sorted(os.listdir(d))]
    return [path for path in candidates if os.path.isdir(path)]
-
def compareFolders(dir1, dir2):
    '''Compare the files in a directory with the files in another directory.

    dir1, dir2: paths of the two directories; the files of each are
    collected with getFiles().

    Every file from dir1 is compared with every file from dir2 using
    filecmp.cmp(shallow=False), i.e. by content. Each pair that compares
    equal is printed on its own line as "<path1> <path2>". Nothing is
    returned.
    '''
    files_1 = getFiles(dir1)
    files_2 = getFiles(dir2)
    for f1 in files_1:
        for f2 in files_2:
            if filecmp.cmp(f1, f2, shallow=False):
                # One pre-formatted argument keeps the output identical
                # under both the Python 2 print statement and the
                # Python 3 print function.
                print('%s %s' % (f1, f2))
-
if __name__ == '__main__':
    # Expect exactly one argument: the folder holding one sub-folder per
    # submission.
    if len(sys.argv) != 2:
        print('Usage: python comparison.py dir')
        sys.exit(1)

    root = sys.argv[1]
    if not os.path.isdir(root):
        # Fail with a readable message instead of an os.listdir traceback.
        print('Not a directory: %s' % root)
        sys.exit(1)

    # Get all folders
    folders = getDir(root)

    # Compare every unordered pair of folders exactly once: each folder is
    # paired only with the folders that come after it in the list.
    for index, d1 in enumerate(folders):
        for d2 in folders[index + 1:]:
            compareFolders(d1, d2)
-
- #Compare the assignments submitted by students
- #Compare all files from one student with all files from another student, and report file pairs that have high similarity
- #!/usr/bin/python
-
- import filecmp
- import os
- import sys
- from sklearn.feature_extraction.text import TfidfVectorizer
-
def getFiles(d):
    '''Get all non-hidden files in a folder, searching it recursively.

    d: path of the directory to walk.

    Returns a sorted list of paths (each built by joining the walked
    directory path and the file name). Files whose name starts with '.' are
    skipped; hidden directories are still descended into. A path that does
    not exist yields an empty list because os.walk ignores listing errors
    by default.
    '''
    files = []
    for dirpath, _dirnames, filenames in os.walk(d):
        for name in filenames:
            if not name.startswith('.'):
                files.append(os.path.join(dirpath, name))
    # Sort so the result does not depend on the order the filesystem
    # happens to list entries in.
    return sorted(files)
-
def getDir(d):
    '''Get all directories directly inside a folder.

    d: path of the directory to list (not searched recursively).

    Returns a sorted list of paths, each being d joined with the name of
    one immediate subdirectory. Entries that are not directories are left
    out. os.listdir raises OSError if d cannot be listed.
    '''
    # Build each joined path once and reuse it for both the test and the
    # result.
    candidates = [os.path.join(d, entry) for entry in sorted(os.listdir(d))]
    return [path for path in candidates if os.path.isdir(path)]
-
def compareFiles(f1, f2):
    '''Check the similarity of a pair of files.

    f1, f2: paths of the two files to compare.

    Both files are read whole and vectorised with a TfidfVectorizer fitted
    on just these two documents. The returned score is entry [0, 1] of
    tfidf * tfidf.T; with the vectorizer's default L2 row normalisation
    this is the cosine similarity of the two documents.

    Returns 0.0 when the vectorizer cannot build a vocabulary from the
    pair (for example when both files are empty), so one unscorable pair
    does not abort a whole comparison run.
    '''
    documents = []
    for path in (f1, f2):
        # Read raw bytes and let the vectorizer decode them: with
        # decode_error='replace' a binary or non-UTF-8 file no longer
        # raises UnicodeDecodeError.
        with open(path, 'rb') as content_file:
            documents.append(content_file.read())
    try:
        tfidf = TfidfVectorizer(decode_error='replace').fit_transform(documents)
    except ValueError:
        # fit_transform raises ValueError when no tokens are found in
        # either document (empty vocabulary); there is nothing to compare.
        return 0.0
    pairwise_similarity = tfidf * tfidf.T
    return pairwise_similarity[0, 1]
-
def compareFolders(dir1, dir2, threshold=0.8):
    '''Compare the files in a directory with the files in another directory.

    dir1, dir2: paths of the two directories; the files of each are
    collected with getFiles().
    threshold: a pair is reported only when its score is strictly greater
    than this value. Defaults to 0.8, the cut-off that used to be
    hard-coded.

    Every file from dir1 is scored against every file from dir2 with
    compareFiles(). Each reported pair is printed on its own line as
    "<path1> <path2> <similarity>". Nothing is returned.
    '''
    files_1 = getFiles(dir1)
    files_2 = getFiles(dir2)
    for f1 in files_1:
        for f2 in files_2:
            similarity = compareFiles(f1, f2)
            if similarity > threshold:
                # One pre-formatted argument keeps the output identical
                # under both the Python 2 print statement and the
                # Python 3 print function.
                print('%s %s %s' % (f1, f2, similarity))
-
if __name__ == '__main__':
    # Expect exactly one argument: the folder holding one sub-folder per
    # submission.
    if len(sys.argv) != 2:
        print('Usage: python comparison.py dir')
        sys.exit(1)

    root = sys.argv[1]
    if not os.path.isdir(root):
        # Fail with a readable message instead of an os.listdir traceback.
        print('Not a directory: %s' % root)
        sys.exit(1)

    # Get all folders
    folders = getDir(root)

    # Compare every unordered pair of folders exactly once: each folder is
    # paired only with the folders that come after it in the list.
    for index, d1 in enumerate(folders):
        for d2 in folders[index + 1:]:
            compareFolders(d1, d2)
-
- #import filecmp
-
# Summarise how two directories differ. dir_left and dir_right are the two
# directory paths; they are not defined in this snippet and must be set
# before it runs.
# Note: dircmp compares common files shallowly by default, so two files
# whose os.stat() signatures match are reported as the same without their
# contents being read.
dc = filecmp.dircmp(dir_left, dir_right)
print(dc.left_list)     # entries in left directory
print(dc.right_list)    # entries in right directory
print(dc.left_only)     # entries in left directory only
print(dc.right_only)    # entries in right directory only
print(dc.common_files)  # files in both directories
print(dc.diff_files)    # same file name but different content
print(dc.same_files)    # same file name, judged identical (shallow, see note above)
-