DIDECT2S/D3-Collector/similarity_measure.py

114 lines
3.7 KiB
Python

# Similarity measure calculation of web pages
# GPL v. 3
# Jesper Bergman (jesperbe@dsv.su.se)
#! /bin/python
import sys
import os
import logging
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import jaccard_score
def write_results(entry, handle):
    """Write one result entry to an already-open output handle.

    Args:
        entry: Text handed unchanged to ``handle.write``.
        handle: Any object exposing a ``write`` method (e.g. an open file).
    """
    message = "Results output file created."
    logging.info(message)
    handle.write(entry)
def calculate_sim_score(cw, dw):
    """Compute character-level similarity scores between two page texts.

    Both texts are vectorised together with a character-level
    ``CountVectorizer`` so they share one vocabulary; the cosine similarity
    and a micro-averaged Jaccard score are then computed on the two rows.

    Args:
        cw: Text of the first page (the caller passes the clear-web page).
        dw: Text of the second page (the caller passes the dark-web page).

    Returns:
        A two-element list ``[cosine_score, j_score]``. ``cosine_score`` is
        the cosine similarity converted to ``str``; ``j_score`` is whatever
        ``jaccard_score`` returns.

    Raises:
        ValueError: Propagated from scikit-learn when no vocabulary can be
            built, e.g. when both texts are empty.
    """
    # ``lowercase`` must be the boolean False: the string 'false' is truthy
    # (so it would not disable case folding) and is rejected outright by
    # scikit-learn releases that validate parameter types.
    vectorizer = CountVectorizer(lowercase=False, analyzer='char', decode_error='ignore')
    X = vectorizer.fit_transform([cw, dw])
    feature_names = vectorizer.get_feature_names_out()

    # Dense per-character counts for each document, computed once and reused.
    cw_counts = X[0].toarray()[0]
    dw_counts = X[1].toarray()[0]

    # cosine_similarity yields a 1x1 matrix here; np.mean collapses it to a scalar.
    cosine_score = str(np.mean(cosine_similarity(X[0], X[1])))

    # NOTE(review): jaccard_score receives raw character counts, so it
    # presumably treats each distinct count value as a class label rather
    # than comparing character sets -- confirm this is the intended metric.
    j_score = jaccard_score(cw_counts, dw_counts, average="micro")

    # Debug
    print(feature_names)
    print(cw_counts)
    print(dw_counts)
    print("\n| -- Cosine: " , cosine_score)
    print("\n| -- Jaccard: " , j_score)

    return [cosine_score, j_score]
def read_files(cw_path, dw_path, results_path="/tmp/similarity_score_results.txt"):
    """Score every clear-web file against its same-named dark-web file.

    Each regular file in ``cw_path`` is paired with the file of the same name
    in ``dw_path``. Both are read, passed to ``calculate_sim_score``, and the
    scores are appended to ``results_path``. A clear-web file without a
    dark-web counterpart is reported on stdout and skipped.

    The working directory is never changed, so relative input paths work and
    the caller's cwd is left untouched.

    Args:
        cw_path: Directory containing the clear-web pages.
        dw_path: Directory containing the dark-web pages.
        results_path: File the scores are appended to. Defaults to the
            location the script has always used.

    Raises:
        OSError: If either input directory cannot be listed (missing, not a
            directory, permission denied). The error is logged and printed
            before being re-raised.
    """
    try:
        cw_entries = os.listdir(cw_path)
        # Listed only to fail early if the second directory is unusable.
        os.listdir(dw_path)
    except OSError as e:
        logging.error("Cannot read input directory: %s", e)
        print("Exception: ", e)
        # Continuing without a directory listing would only fail later with
        # a confusing unbound-variable error, so surface the real cause.
        raise

    logging.info(f'{cw_path} is the clear web pages input directory.')
    logging.info(f'{dw_path} is the dark web pages input directory.')

    # Absolute paths replace the original os.chdir() calls.
    cw_dir = os.path.abspath(cw_path)
    dw_dir = os.path.abspath(dw_path)

    # Collect regular files only; sorted so the output order is reproducible.
    file_name_list = []
    for entry in sorted(cw_entries):
        if os.path.isfile(os.path.join(cw_dir, entry)):
            file_name_list.append(entry)
            print("Appending basename: ", entry)

    for file_name in file_name_list:
        # errors='ignore' drops undecodable bytes so that one badly encoded
        # page does not abort the whole batch.
        with open(os.path.join(cw_dir, file_name), 'r', errors='ignore') as cw_file_stream:
            cw_file_content = cw_file_stream.read()
        print("CW Current file: ", cw_dir, "/", file_name)
        cw_file_name = "cw_scrape/" + file_name

        try:
            with open(os.path.join(dw_dir, file_name), 'r', errors='ignore') as dw_file_stream:
                dw_file_content = dw_file_stream.read()
        except FileNotFoundError as e:
            # No dark-web counterpart for this page: report it and move on.
            print("|-- In " , dw_dir, e)
            continue
        dw_file_name = "dw_scrape/" + file_name
        print("DW Current file: ", dw_dir, "/" , file_name)

        result = calculate_sim_score(cw_file_content, dw_file_content)
        output_string = (
            cw_file_name + " vs. " + dw_file_name
            + "\n Cosine score: " + str(result[0])
            + " Jaccard score: " + str(result[1]) + "\n\n"
        )
        # Register the scores for this pair of input files.
        with open(results_path, 'a') as f:
            f.write(output_string)
def main():
    """Command-line entry point.

    Expects two positional arguments: the clear-web input directory followed
    by the dark-web input directory. Configures file logging and hands both
    paths to ``read_files``.

    Raises:
        SystemExit: With a usage message when fewer than two directory
            arguments are supplied.
    """
    # Validate the command line first so a usage error gives a clear message
    # instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <clear_web_dir> <dark_web_dir>")

    # Start logging. The level must be set explicitly: the root logger
    # defaults to WARNING, which would silently drop every logging.info call.
    logging.basicConfig(
        filename='/tmp/similarity_score_calculation.log',
        filemode='a',
        format='%(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
    logging.info("Program started.")

    # Input directories
    cw_path = sys.argv[1]
    dw_path = sys.argv[2]

    # Read and calculate sim.
    read_files(cw_path, dw_path)
# Run the program only when executed as a script, not when imported.
if __name__ == "__main__":
    main()