How can I check whether the files in two zip files are exactly the same?

I have two zip files. I want to check whether everything in them (the file names and the contents of each file) is the same.

There is a similar question. But the answer does not support zip files.

Does anyone have a good idea?

3 answers

  • answered 2021-03-08 06:13 jasonshu

    It seems that zip archives can have different hashes even when you zip identical items. We can split the task into two parts: first unzip both archives, then compare the extracted folders.

    import os
    import filecmp
    import zipfile
    
    def are_dir_trees_equal(dir1, dir2):
        """Return True if two directory trees hold the same entries and file contents.

        The comparison recurses into every subdirectory that exists on both
        sides.  Names on ``filecmp.dircmp``'s default ignore/hide lists are
        skipped by ``dircmp`` itself and so do not take part in the comparison.

        Args:
            dir1: Path of the first directory.
            dir2: Path of the second directory.

        Returns:
            False as soon as an entry exists on only one side, an entry has a
            different type on each side (or cannot be inspected), or a common
            file differs or cannot be compared; True otherwise.
        """
        dirs_cmp = filecmp.dircmp(dir1, dir2)
        # common_funny holds names present on both sides whose types differ
        # (e.g. a file vs. a directory) or that could not be stat'ed.  Such
        # names appear in neither common_files nor common_dirs, so without this
        # check they would be skipped and the trees wrongly reported as equal.
        if (dirs_cmp.left_only or dirs_cmp.right_only
                or dirs_cmp.funny_files or dirs_cmp.common_funny):
            return False

        # shallow=False makes cmpfiles compare file contents rather than
        # trusting matching os.stat() signatures.
        _, mismatch, errors = filecmp.cmpfiles(
            dir1, dir2, dirs_cmp.common_files, shallow=False)
        if mismatch or errors:
            return False

        for common_dir in dirs_cmp.common_dirs:
            new_dir1 = os.path.join(dir1, common_dir)
            new_dir2 = os.path.join(dir2, common_dir)
            if not are_dir_trees_equal(new_dir1, new_dir2):
                return False
        return True
    
    
    BASE_PATH = '/Users/Documents/'
    model1 = os.path.join(BASE_PATH, 'test1.zip')
    model2 = os.path.join(BASE_PATH, 'test2.zip')

    # Both archives are unpacked into BASE_PATH.  The comparison below relies
    # on each archive producing a top-level folder named after the archive
    # itself (test1/ and test2/).
    with zipfile.ZipFile(model1, "r") as zip_ref:
        zip_ref.extractall(BASE_PATH)
    with zipfile.ZipFile(model2, "r") as zip_ref:
        zip_ref.extractall(BASE_PATH)

    # Strip only the final extension; splitting on the first '.' would cut the
    # path short whenever a directory name in it contains a dot.
    folder1 = os.path.splitext(model1)[0]
    folder2 = os.path.splitext(model2)[0]

    is_equal = are_dir_trees_equal(folder1, folder2)
    print(is_equal)
    

  • answered 2021-03-08 06:46 BrainCity

    I tried using Python's built-in zipfile module.

    from zipfile import ZipFile
    
    def compare(file1, file2):
        """Check whether two zip archives hold the same members with the same data.

        Members are matched on name, uncompressed size and the CRC-32 recorded
        in each archive's directory.  Member order, compression method and
        timestamps are not compared, so two archives built differently from
        identical files still compare equal.  Member data is not read; equality
        of contents rests on the stored size and CRC-32.

        Args:
            file1: Path of the first zip archive.
            file2: Path of the second zip archive.

        Returns:
            True if the archives match, False if they do not.  If either path
            does not exist, an error-message string is returned instead; note
            that this string is truthy, so test the result with ``is True``.
        """
        try:
            with ZipFile(file1, 'r') as archive:
                members1 = sorted(
                    (info.filename, info.file_size, info.CRC)
                    for info in archive.infolist())
            with ZipFile(file2, 'r') as archive:
                members2 = sorted(
                    (info.filename, info.file_size, info.CRC)
                    for info in archive.infolist())
        except FileNotFoundError:
            return f"Either file at {file1} or {file2} does not exist"
        return members1 == members2
    
    
    # Example call: compare() returns a bool, or an error-message string when
    # either archive path does not exist, so the result is simply printed.
    f = compare(file1='storage/1.zip', file2='storage/2.zip')
    print(f)
    

    I don't know whether this is the correct approach (if it is not, please correct me).

  • answered 2021-03-09 03:12 RootTwo

    Here's my stab at it. It may be sufficient to just make sure the ZipFiles contain the same items and that the items have matching CRC32s. (What is the chance that two ZipFiles being compared have files with the same name and same CRC32 but are different files?) If that is good enough, omit the loop that compares the file contents.

    from zipfile import ZipFile
    
    # Number of bytes read from each member per comparison step.
    BUFSIZE = 1024


    def _index_members(archive):
        """Split an open ZipFile's members into files and directories.

        Returns a ``(files, dirs)`` pair.  ``files`` maps each file member's
        name to its ZipInfo; for duplicate names the later member wins, just as
        a later file overwrites an earlier one when extracting.  ``dirs`` is the
        set of directory paths (without trailing slash) that are either stored
        explicitly or implied by the path of any member.
        """
        files = {}
        dirs = set()
        for info in archive.infolist():
            path = info.filename.rstrip('/')
            if info.is_dir():
                if path:
                    dirs.add(path)
            else:
                files[info.filename] = info
            # Every ancestor of a member exists after extraction, whether or
            # not the archive stores an explicit entry for it.
            parent = path.rpartition('/')[0]
            while parent:
                dirs.add(parent)
                parent = parent.rpartition('/')[0]
        return files, dirs


    def are_equivalent(filename1, filename2):
        """Compare two ZipFiles to see if they would expand into the same directory
        structure, without actually extracting the files.

        Archives that differ only in whether parent directories are stored as
        explicit entries are treated as equivalent, because they extract to the
        same tree.  An explicit entry for an otherwise empty directory does
        count.

        Args:
            filename1: Path (or file-like object) of the first zip archive.
            filename2: Path (or file-like object) of the second zip archive.

        Returns:
            True if both archives contain the same directories and the same
            files with identical contents, otherwise False.
        """
        with ZipFile(filename1, 'r') as zip1, ZipFile(filename2, 'r') as zip2:
            files1, dirs1 = _index_members(zip1)
            files2, dirs2 = _index_members(zip2)

            # Do some simple checks first.
            # Do the ZipFiles contain the same files and directories?
            if files1.keys() != files2.keys() or dirs1 != dirs2:
                return False

            # Do the files have the same size and CRC? (The CRC is a 32-bit
            # checksum of the uncompressed item, so a match is a strong hint
            # but not proof that the contents are identical.)
            if any(files1[name].file_size != files2[name].file_size
                   or files1[name].CRC != files2[name].CRC
                   for name in files1):
                return False

            # Skip/omit this loop if matching names, sizes and CRCs is good
            # enough.  Open the corresponding files and compare them.
            for name in files1:
                # 'ZipFile.open()' returns a ZipExtFile instance, whose 'read()'
                # accepts a max number of bytes, so members are compared in
                # chunks.  'ZipFile.read()' would load the whole member at once.
                with zip1.open(files1[name]) as file1, zip2.open(files2[name]) as file2:
                    while True:
                        buffer1 = file1.read(BUFSIZE)
                        buffer2 = file2.read(BUFSIZE)

                        if buffer1 != buffer2:
                            return False

                        # Both buffers are equal here, so an empty one means
                        # both members reached their end together.
                        if not buffer1:
                            break

            return True