#coding=utf-8 import xlrd import distance from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer import numpy as np from scipy.linalg import norm workbook = xlrd.open_workbook(u'工程師問答.xls') sheet_names= workbook.sheet_names() ls = [] for sheet_name in sheet_names: sheet1 = workbook.sheet_by_name(sheet_name) for i in range(1, 3858): row = sheet1.row_values(i) ls.append(row[0]) # print len(ls) target = u'D90的發動機熱效率是多少?' print u'目標語句:' + target # 編輯距離計算 def edit_distance(s1, s2): return distance.levenshtein(s1, s2) results = list(filter(lambda x: edit_distance(x, target) <= 5, ls)) print u'1)編輯距離計算,閾值為5' for i in results: print i # 傑卡德係數計算 def jaccard_similarity(s1, s2): def add_space(s): return ' '.join(list(s)) # 將字中間加入空格 s1, s2 = add_space(s1), add_space(s2) # 轉化為TF矩陣 cv = CountVectorizer(tokenizer=lambda s: s.split()) corpus = [s1, s2] vectors = cv.fit_transform(corpus).toarray() # 求交集 numerator = np.sum(np.min(vectors, axis=0)) # 求並集 denominator = np.sum(np.max(vectors, axis=0)) # 計算傑卡德係數 return 1.0 * numerator / denominator results = list(filter(lambda x: jaccard_similarity(x, target) > 0.6, ls)) print u'2)傑卡德係數計算,閾值為0.6' for i in results: print i # TF 計算 def tf_similarity(s1, s2): def add_space(s): return ' '.join(list(s)) # 將字中間加入空格 s1, s2 = add_space(s1), add_space(s2) # 轉化為TF矩陣 cv = CountVectorizer(tokenizer=lambda s: s.split()) corpus = [s1, s2] vectors = cv.fit_transform(corpus).toarray() # 計算TF係數 return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1])) results = list(filter(lambda x: tf_similarity(x, target) > 0.7, ls)) print u'3)TF 計算,閾值為0.7' for i in results: print i # TFIDF 係數 def tfidf_similarity(s1, s2): def add_space(s): return ' '.join(list(s)) # 將字中間加入空格 s1, s2 = add_space(s1), add_space(s2) # 轉化為TF矩陣 cv = TfidfVectorizer(tokenizer=lambda s: s.split()) corpus = [s1, s2] vectors = cv.fit_transform(corpus).toarray() # 計算TF係數 return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1])) results = list(filter(lambda x: tfidf_similarity(x, target) > 0.6, ls)) print u'4)TFIDF 係數,閾值為0.6' for i in results: print i