python 相似語句匹配(非機器學習)

右介發表於2018-11-20
#coding=utf-8

import xlrd
import distance
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
from scipy.linalg import norm

# Load the candidate sentences: the first column of every data row of every
# sheet in the workbook is collected into ``ls``.
workbook = xlrd.open_workbook(u'工程師問答.xls')
sheet_names = workbook.sheet_names()

ls = []
for sheet_name in sheet_names:
    sheet = workbook.sheet_by_name(sheet_name)
    # Row 0 is skipped (presumably a header row -- confirm against the
    # workbook). Iterate up to the sheet's real row count instead of a
    # hard-coded 3858 so shorter sheets do not raise IndexError and longer
    # sheets are not silently truncated.
    for row_index in range(1, sheet.nrows):
        ls.append(sheet.row_values(row_index)[0])

# Query sentence that every candidate in ``ls`` is compared against.
target = u'D90的發動機熱效率是多少?'
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(u'目標語句:' + target)


# Edit-distance computation
def edit_distance(s1, s2):
    """Return ``distance.levenshtein(s1, s2)``.

    Thin wrapper so the edit-distance metric has the same two-argument
    call shape as the other similarity helpers in this script.
    """
    levenshtein_value = distance.levenshtein(s1, s2)
    return levenshtein_value

# Keep the candidates whose edit distance to the target is at most 5.
results = [candidate for candidate in ls if edit_distance(candidate, target) <= 5]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(u'1)編輯距離計算,閾值為5')
for i in results:
    print(i)

# Jaccard coefficient computation
def jaccard_similarity(s1, s2):
    """Return the character-level Jaccard coefficient of ``s1`` and ``s2``.

    Each string is split into single characters (whitespace is dropped) and
    the characters are counted with ``CountVectorizer``. The coefficient is
    the sum of the per-character minimum counts divided by the sum of the
    per-character maximum counts. ``CountVectorizer`` lowercases its input
    by default, so the comparison is case-insensitive.

    Args:
        s1: First text string.
        s2: Second text string.

    Returns:
        The coefficient as a float; ``0.0`` when either string contains no
        non-whitespace characters.
    """
    def add_space(s):
        return ' '.join(list(s))

    # Put a space between characters so that each one becomes its own token
    s1, s2 = add_space(s1), add_space(s2)
    # A blank string shares nothing with the other one. Returning early also
    # avoids the "empty vocabulary" ValueError that CountVectorizer raises
    # when both strings are blank.
    if not s1.split() or not s2.split():
        return 0.0
    # Build the term-count matrix. token_pattern=None because the default
    # pattern is unused once a tokenizer is supplied, and leaving it set
    # makes recent scikit-learn versions warn on every call.
    cv = CountVectorizer(tokenizer=lambda s: s.split(), token_pattern=None)
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Intersection size: per-character minimum of the two counts
    numerator = np.sum(np.min(vectors, axis=0))
    # Union size: per-character maximum of the two counts
    denominator = np.sum(np.max(vectors, axis=0))
    # Jaccard coefficient; 1.0 * forces true division on Python 2
    return 1.0 * numerator / denominator

# Keep the candidates whose Jaccard coefficient with the target exceeds 0.6.
results = [candidate for candidate in ls if jaccard_similarity(candidate, target) > 0.6]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(u'2)傑卡德係數計算,閾值為0.6')
for i in results:
    print(i)


# TF (term-frequency) similarity computation
def tf_similarity(s1, s2):
    """Return the cosine similarity of the character-count vectors.

    Each string is split into single characters (whitespace is dropped) and
    the characters are counted with ``CountVectorizer``. The result is the
    dot product of the two count vectors divided by the product of their
    Euclidean norms. ``CountVectorizer`` lowercases its input by default,
    so the comparison is case-insensitive.

    Args:
        s1: First text string.
        s2: Second text string.

    Returns:
        The cosine similarity as a float; ``0.0`` when either string
        contains no non-whitespace characters.
    """
    def add_space(s):
        return ' '.join(list(s))

    # Put a space between characters so that each one becomes its own token
    s1, s2 = add_space(s1), add_space(s2)
    # A blank string yields an all-zero vector whose norm is 0, which would
    # turn the division below into nan (with a RuntimeWarning); two blank
    # strings would make CountVectorizer raise "empty vocabulary".
    if not s1.split() or not s2.split():
        return 0.0
    # Build the term-count matrix. token_pattern=None because the default
    # pattern is unused once a tokenizer is supplied, and leaving it set
    # makes recent scikit-learn versions warn on every call.
    cv = CountVectorizer(tokenizer=lambda s: s.split(), token_pattern=None)
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Cosine similarity of the two count vectors
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))

# Keep the candidates whose TF cosine similarity to the target exceeds 0.7.
results = [candidate for candidate in ls if tf_similarity(candidate, target) > 0.7]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(u'3)TF 計算,閾值為0.7')
for i in results:
    print(i)


# TF-IDF similarity computation
def tfidf_similarity(s1, s2):
    """Return the cosine similarity of the character-level TF-IDF vectors.

    Each string is split into single characters (whitespace is dropped) and
    vectorized with a ``TfidfVectorizer`` fitted on just these two strings.
    The result is the dot product of the two vectors divided by the product
    of their Euclidean norms.

    Args:
        s1: First text string.
        s2: Second text string.

    Returns:
        The cosine similarity as a float; ``0.0`` when either string
        contains no non-whitespace characters.
    """
    def add_space(s):
        return ' '.join(list(s))

    # Put a space between characters so that each one becomes its own token
    s1, s2 = add_space(s1), add_space(s2)
    # A blank string yields an all-zero vector whose norm is 0, which would
    # turn the division below into nan (with a RuntimeWarning); two blank
    # strings would make TfidfVectorizer raise "empty vocabulary".
    if not s1.split() or not s2.split():
        return 0.0
    # Build the TF-IDF matrix. token_pattern=None because the default
    # pattern is unused once a tokenizer is supplied, and leaving it set
    # makes recent scikit-learn versions warn on every call.
    cv = TfidfVectorizer(tokenizer=lambda s: s.split(), token_pattern=None)
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Cosine similarity of the two TF-IDF vectors
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))

# Keep the candidates whose TF-IDF cosine similarity to the target exceeds 0.6.
results = [candidate for candidate in ls if tfidf_similarity(candidate, target) > 0.6]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(u'4)TFIDF 係數,閾值為0.6')
for i in results:
    print(i)

 

相關文章