Script for comparing two CIS PDF files (#15307)

2024-11-06 00:45:19 +00:00 · 2023-12-06 09:21:12 -05:00 · 2023-12-06 09:21:12 -05:00 · d40555e7cd
commit d40555e7cd
parent f89d78d065
1 changed files with 130 additions and 0 deletions
--- a/tools/cis/CIS-Benchmark-diff.py
+++ b/tools/cis/CIS-Benchmark-diff.py
@ -0,0 +1,130 @@
+# This script takes two CIS Benchmark PDFs as input and diffs them.
+# For example, it can generate a diff of the Windows 10 and Windows 11 benchmarks.
+# Requires the PyMuPDF dependency (pip3 install PyMuPDF).
+# Command-line example: python3 ./CIS-Benchmark-diff.py File1.pdf File2.pdf
+
+import fitz  # PyMuPDF
+import re
+import difflib
+import sys
+from datetime import datetime
+
+def is_start_of_new_item(line):
+    """
+    Tell whether a line opens a new numbered item.
+
+    Surrounding whitespace is ignored. The pattern is anchored only at the
+    start of the line, so any text that begins with a digit qualifies, for
+    example '1', '1.1 Foo' or '100.7.32 Bar'.
+    """
+    candidate = line.strip()
+    item_number = re.match(r'\d{1,3}(?:\.\d{1,2}){0,2}', candidate)
+    return item_number is not None
+
+def remove_trailing_whitespace(text):
+    """
+    Strip whitespace from the right-hand end of every line of the text.
+
+    Lines are delimited by newline characters only, and the number of lines
+    is preserved.
+    """
+    stripped_lines = map(str.rstrip, text.split('\n'))
+    return '\n'.join(stripped_lines)
+
+def correct_word_wrapping(text):
+    """
+    Merge wrapped continuation lines into the numbered item they belong to.
+
+    A line recognised by is_start_of_new_item() begins a new output line.
+    Every other non-blank line is treated as the wrapped remainder of the
+    previous output line and is appended to it, separated by exactly one
+    space. Blank lines are dropped.
+
+    Args:
+        text: Newline-separated text extracted from the PDF.
+
+    Returns:
+        The text with one output line per item, joined with newlines.
+    """
+    corrected_lines = []
+    for line in text.split('\n'):
+        if not line.strip():
+            # Blank lines (for example those left where a page footer was
+            # removed) carry no text; appending them would inject extra
+            # spaces into an item and show up as spurious differences.
+            continue
+        if corrected_lines and not is_start_of_new_item(line):
+            # Normalise the join to a single space regardless of whitespace
+            # at the edges of either fragment.
+            corrected_lines[-1] = corrected_lines[-1].rstrip() + ' ' + line.strip()
+        else:
+            corrected_lines.append(line)
+    return '\n'.join(corrected_lines)
+
+def extract_recommendations_fitz(pdf_path, start_phrase, end_phrase):
+    """
+    Extract one section of a PDF file as cleaned, line-oriented text.
+
+    Text blocks are collected from the first block containing start_phrase
+    (inclusive) up to, but not including, the first later block containing
+    end_phrase. Scanning stops there, so a later reappearance of
+    start_phrase is ignored. If end_phrase never appears, everything from
+    start_phrase to the end of the document is collected.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+        start_phrase: Text that marks the first block of the section.
+        end_phrase: Text that marks the first block after the section.
+
+    Returns:
+        The section text after cleanup: "Page <number>" markers, dot leaders
+        followed by numbers, and 2 to 4 digit numbers at the end of lines
+        are removed, wrapped lines are merged and trailing whitespace is
+        stripped. An empty string if start_phrase is never found.
+    """
+    captured_blocks = []
+    capture = False
+    finished = False
+
+    # Open the document as a context manager so it is closed even when text
+    # extraction raises.
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            for block in page.get_text("blocks"):
+                block_text = block[4].strip()  # Index 4 holds the block's text
+                if not block_text:
+                    continue
+
+                if not capture:
+                    if start_phrase in block_text:
+                        capture = True
+                elif end_phrase in block_text:
+                    finished = True
+                    break
+
+                if capture:
+                    captured_blocks.append(block_text)
+
+            if finished:
+                # The section is complete; skip the remaining pages.
+                break
+
+    recommendations = ''.join(block_text + "\n" for block_text in captured_blocks)
+
+    # Cleanup process
+    recommendations_cleaned = re.sub(r'Page\s+\d{1,3}', '', recommendations)  # Remove "Page <number>" markers
+    recommendations_cleaned = re.sub(r'\.{2,}\s*\d+', '', recommendations_cleaned)  # Remove periods followed by page numbers
+    recommendations_cleaned = re.sub(r'\s+\d{2,4}\s*$', '', recommendations_cleaned, flags=re.MULTILINE)  # Remove 2 to 4 digit numbers at the end of lines
+    recommendations_corrected = correct_word_wrapping(recommendations_cleaned)  # Correct word wrapping
+    final_recommendations = remove_trailing_whitespace(recommendations_corrected)  # Remove trailing whitespace
+
+    return final_recommendations
+
+def create_custom_diff(text1, text2):
+    """
+    Create a unified diff of two texts with 'file1: ' / 'file2: ' labels.
+
+    Lines present only in text1 are prefixed with 'file1: ' and lines present
+    only in text2 with 'file2: '. The '--- file1' / '+++ file2' header and
+    the '@@ ... @@' hunk markers are passed through unchanged.
+
+    Args:
+        text1: The first text.
+        text2: The second text.
+
+    Returns:
+        The relabelled diff as a single newline-joined string, or an empty
+        string when the two texts have identical lines.
+    """
+    text1_lines = text1.splitlines()
+    text2_lines = text2.splitlines()
+
+    # Generate a diff without additional context lines
+    diff = difflib.unified_diff(text1_lines, text2_lines, lineterm='',
+                                fromfile='file1', tofile='file2',
+                                n=0)  # 'n=0' for no context lines
+
+    # Customizing diff output to replace '-' and '+' with 'file1' and 'file2'
+    custom_diff = []
+    for index, line in enumerate(diff):
+        if index < 2:
+            # The first two lines are always the '---' / '+++' file headers.
+            # They also start with '-' / '+', so keep them out of the
+            # relabelling below or they turn into 'file1: -- file1'.
+            custom_diff.append(line)
+        elif line.startswith('-'):
+            custom_diff.append('file1: ' + line[1:])
+        elif line.startswith('+'):
+            custom_diff.append('file2: ' + line[1:])
+        else:
+            custom_diff.append(line)
+
+    return '\n'.join(custom_diff)
+
+def main(file1, file2):
+    """
+    Compare the recommendation sections of two CIS Benchmark PDFs.
+
+    The text between the "Recommendations ..." and
+    "Appendix: Summary Table ..." markers is extracted from each PDF. The
+    cleaned text of both files is written to cleaned.txt, and the diff,
+    preceded by a generation timestamp, to cis_benchmarks_diff.txt. Both
+    files are created in the current working directory and overwritten if
+    they already exist.
+
+    Args:
+        file1: Path of the first PDF.
+        file2: Path of the second PDF.
+    """
+    # Start and end phrases for the extraction
+    start_phrase = "Recommendations ..."
+    end_phrase = "Appendix: Summary Table ..."
+
+    # Extract recommendations from both PDFs
+    recommendations_file1 = extract_recommendations_fitz(file1, start_phrase, end_phrase)
+    recommendations_file2 = extract_recommendations_fitz(file2, start_phrase, end_phrase)
+
+    # Write the cleaned and corrected data to a file.
+    # Text extracted from a PDF can contain characters that the platform's
+    # default encoding cannot represent, so write UTF-8 explicitly.
+    with open('cleaned.txt', 'w', encoding='utf-8') as file:
+        file.write("Cleaned Data from file 1 PDF:\n\n")
+        file.write(recommendations_file1)
+        file.write("\n\nCleaned Data from file 2 PDF:\n\n")
+        file.write(recommendations_file2)
+    print("Cleaned data file created: cleaned.txt")
+
+    # Perform the custom diff
+    diff_result = create_custom_diff(recommendations_file1, recommendations_file2)
+
+    # Write the diff result to a file with a timestamp
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    with open('cis_benchmarks_diff.txt', 'w', encoding='utf-8') as file:
+        file.write(f"Diff generated on: {timestamp}\n\n")
+        file.write(diff_result)
+    print("Diff file created: cis_benchmarks_diff.txt")
+
+if __name__ == "__main__":
+    # Exactly two arguments are expected: the paths of the PDFs to compare.
+    if len(sys.argv) == 3:
+        file1, file2 = sys.argv[1], sys.argv[2]
+        main(file1, file2)
+    else:
+        print("Usage: python script.py <path_to_cis_benchmark_1_pdf> <path_to_cis_benchmark_2_pdf>")
+        sys.exit(1)
+