#!/usr/bin/env python3
# Similarity measure calculation of web pages
# GPL v. 3
# Jesper Bergman (jesperbe@dsv.su.se)
"""Score the similarity of same-named files found in two directories.

For every regular file in the first ("cw") directory, the file with the same
name in the second ("dw") directory is read, both texts are turned into
character-count vectors, and a cosine and a Jaccard score are appended to a
results file.
"""

import logging
import os
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

# Default output locations used when the script is run from the command line.
RESULTS_PATH = "/tmp/similarity_score_results.txt"
LOG_PATH = "/tmp/similarity_score_calculation.log"


def write_results(entry, handle):
    """Write *entry* to the already-open, writable file object *handle*."""
    logging.info("Results output file created.")
    handle.write(entry)


def calculate_sim_score(cw, dw):
    """Return similarity scores for two text documents.

    Both documents are vectorized together into per-character count vectors
    (case-sensitive), so the two vectors share one vocabulary.

    Args:
        cw: Text of the first document.
        dw: Text of the second document.

    Returns:
        A two-element list ``[cosine_score, j_score]``. The cosine score is
        returned as a string (as callers have always received it); the
        Jaccard score is the value returned by ``jaccard_score``.

    Raises:
        ValueError: Propagated from the vectorizer when no vocabulary can be
            built (e.g. both documents are empty).
    """
    corpus = [cw, dw]

    # `lowercase` must be a real boolean: the string 'false' is truthy and is
    # rejected by the parameter validation in recent scikit-learn releases.
    vectorizer = CountVectorizer(lowercase=False, analyzer='char', decode_error='ignore')
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    js_x = X[0].toarray()[0]
    js_y = X[1].toarray()[0]

    # cosine_similarity returns a 1x1 matrix here; np.mean collapses it to a scalar.
    cosine_score = str(np.mean(cosine_similarity(X[0], X[1])))
    # NOTE(review): this feeds the raw per-character counts to jaccard_score as
    # if they were labels; it is presumably not a set-based Jaccard index of the
    # two character sets. Confirm this is the intended measure.
    j_score = jaccard_score(js_x, js_y, average="micro")

    # Debug
    print(feature_names)
    print(js_x)
    print(js_y)
    print("\n| -- Cosine: " , cosine_score)
    print("\n| -- Jaccard: " , j_score)
    # Other metrics (e.g. 'manhattan', 'l2', 'cosine') can be inspected with
    # pairwise_distances(X, metric=...) when debugging.

    return [cosine_score, j_score]


def read_files(cw_path, dw_path, results_path=RESULTS_PATH):
    """Compare every file in *cw_path* with its namesake in *dw_path*.

    One entry per compared pair is appended to *results_path*. Files that have
    no counterpart in *dw_path*, or that cannot be vectorized, are reported on
    stdout and skipped.

    Args:
        cw_path: Directory holding the first set of pages.
        dw_path: Directory holding the second set of pages.
        results_path: File the score entries are appended to.

    Returns:
        None. If either directory cannot be listed the error is printed and
        the function returns without comparing anything.
    """
    try:
        cw_entries = os.listdir(cw_path)
        # Listed only to verify that the second directory is readable.
        os.listdir(dw_path)
    except OSError as e:
        print("Exception: ", e)
        return

    logging.info(f'{cw_path} is the clear web pages input directory.')
    logging.info(f'{dw_path} is the dark web pages input directory.')

    # Paths are joined explicitly instead of calling os.chdir(): changing the
    # working directory made relative input paths stop resolving after the
    # first switch.
    cw_abs = os.path.abspath(cw_path)
    dw_abs = os.path.abspath(dw_path)

    file_name_list = []
    for entry in cw_entries:
        if os.path.isfile(os.path.join(cw_path, entry)):
            file_name_list.append(entry)
            print("Appending basename: ", entry)

    for file_name in file_name_list:
        with open(os.path.join(cw_path, file_name), 'r') as cw_stream:
            cw_file_content = cw_stream.read()
        print("CW Current file: ", cw_abs, "/", file_name)

        try:
            with open(os.path.join(dw_path, file_name), 'r') as dw_stream:
                dw_file_content = dw_stream.read()
        except FileNotFoundError as e:
            # No counterpart in the second directory: report and move on.
            print("|-- In " , dw_abs, e)
            continue
        print("DW Current file: ", dw_abs, "/" , file_name)

        try:
            result = calculate_sim_score(cw_file_content, dw_file_content)
        except ValueError as e:
            # Raised when no vocabulary can be built (e.g. both files empty);
            # one bad pair should not abort the whole run.
            print("|-- Skipping ", file_name, e)
            continue

        cw_file_name = "cw_scrape/" + file_name
        dw_file_name = "dw_scrape/" + file_name
        output_string = (
            f"{cw_file_name} vs. {dw_file_name}"
            f"\n Cosine score: {result[0]!s} Jaccard score: {result[1]!s}\n\n"
        )
        with open(results_path, 'a') as f:
            f.write(output_string)


def main():
    """Configure logging, read the two directory arguments and run the comparison."""
    # Without an explicit level the root logger stays at WARNING and every
    # logging.info() call in this script is silently discarded.
    logging.basicConfig(
        filename=LOG_PATH,
        filemode='a',
        level=logging.INFO,
        format='%(name)s - %(levelname)s - %(message)s',
    )
    logging.info("Program started.")

    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <cw_directory> <dw_directory>", file=sys.stderr)
        sys.exit(2)

    # Input directories
    cw_path = sys.argv[1]
    dw_path = sys.argv[2]

    # Read and calculate sim.
    read_files(cw_path, dw_path)


# Start
if __name__ == "__main__":
    main()