#!/usr/bin/env python3
# Similarity measure calculation of web pages
# GPL v. 3
# Jesper Bergman (jesperbe@dsv.su.se)
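
# Usage (assumed invocation; the script file name is illustrative):
#   python3 similarity_score.py <clear_web_pages_dir> <dark_web_pages_dir>
# The two directories are expected to contain files with matching names.
# Scores are appended to /tmp/similarity_score_results.txt and log messages
# go to /tmp/similarity_score_calculation.log.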

import sys
import os
import logging

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import jaccard_score


def write_results(entry, handle):
    """Write a single result entry to an already opened file handle.

    Note: not currently called; read_files() writes its results directly.
    """
    logging.info("Results output file created.")
    handle.write(entry)


def calculate_sim_score(cw, dw):
    """Return [cosine similarity, Jaccard score] for two page contents."""
    # Build a two-document corpus: clear web page vs. dark web page
    corpus = [cw, dw]

    # Character-level count vectors; lowercase=False keeps case information
    vectorizer = CountVectorizer(lowercase=False, analyzer='char', decode_error='ignore')
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    # Feature-count table (useful for inspection; not otherwise used)
    df = pd.DataFrame(data=X.toarray(), columns=feature_names)

    # Dense count vectors for the Jaccard score
    js_x = X[0].toarray()[0]
    js_y = X[1].toarray()[0]

    cosine_score = np.mean(cosine_similarity(X[0], X[1]))
    j_score = jaccard_score(js_x, js_y, average="micro")

    # Debug
    print(feature_names)
    print(js_x)
    print(js_y)
    print("\n| -- Cosine: ", cosine_score)
    print("\n| -- Jaccard: ", j_score)

    #print("|---- Manhattan score X[0]: ", pairwise_distances(X[0], X[1], metric='manhattan'))
    #print("|---- Manhattan score: ", pairwise_distances(X, metric='manhattan'))
    #print("|---- L2 score: ", pairwise_distances(X, metric='l2'))
    #print("|---- Cosine score: ", pairwise_distances(X, metric='cosine'))

    return [cosine_score, j_score]


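# Illustrative call of calculate_sim_score() on two hypothetical strings,
# e.g. from a Python shell:
#   >>> calculate_sim_score("onion routing", "onion routing!")
# prints the character features and returns [cosine similarity, Jaccard score]
# computed on the two character-count vectors.

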
def read_files(cw_path, dw_path):
    """Pair up clear web and dark web files by name and score each pair."""
    # Resolve to absolute paths so the os.chdir() calls below keep working
    # even when relative paths are given on the command line.
    cw_path = os.path.abspath(cw_path)
    dw_path = os.path.abspath(dw_path)

    try:
        cw_directory = os.listdir(cw_path)
        dw_directory = os.listdir(dw_path)
        logging.info(f'{cw_path} is the clear web pages input directory.')
        logging.info(f'{dw_path} is the dark web pages input directory.')
    except Exception as e:
        print("Exception: ", e)
        return

    file_name_list = []

    # Loop through CW files and collect their basenames
    for cw_file in cw_directory:
        os.chdir(cw_path)
        if os.path.isfile(cw_file):
            file_name_list.append(os.path.basename(cw_file))
            print("Appending basename: ", os.path.basename(cw_file))

    # Compare each CW file with the DW file of the same name
    for file_name in file_name_list:
        os.chdir(cw_path)
        with open(file_name, 'r') as cw_file_stream:
            cw_file_content = cw_file_stream.read()
        print("CW Current file: ", os.getcwd(), "/", file_name)
        cw_file_name = "cw_scrape/" + file_name

        os.chdir(dw_path)
        try:
            with open(file_name, 'r') as dw_file_stream:
                dw_file_content = dw_file_stream.read()
            dw_file_name = "dw_scrape/" + file_name

            print("DW Current file: ", os.getcwd(), "/", file_name)

            # Register the scores for this file pair
            with open("/tmp/similarity_score_results.txt", 'a') as f:
                result = calculate_sim_score(cw_file_content, dw_file_content)
                output_string = (cw_file_name + " vs. " + dw_file_name +
                                 "\n Cosine score: " + str(result[0]) +
                                 " Jaccard score: " + str(result[1]) + "\n\n")
                f.write(output_string)

        except FileNotFoundError as e:
            print("|-- In ", os.getcwd(), e)


def main():
    # Start logging (level=INFO so the logging.info() calls are recorded)
    logging.basicConfig(filename='/tmp/similarity_score_calculation.log', filemode='a',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.INFO)
    logging.info("Program started.")

    # Input directories: clear web pages and dark web pages
    if len(sys.argv) < 3:
        print("Usage: ", sys.argv[0], "<clear_web_pages_dir> <dark_web_pages_dir>")
        sys.exit(1)
    cw_path = sys.argv[1]
    dw_path = sys.argv[2]

    # Read the files and calculate similarity scores
    read_files(cw_path, dw_path)


# Start
if __name__ == "__main__":
    main()