1# −∗− coding : utf −8 −∗−2import j i e b a
3import j i e b a . analyse
4import numpy
5import numpy as np
6import pandas as pd
7import openpyxl as op
8import xlwt
9import math
class SimHash(object):
    """Compute SimHash fingerprints of text and compare them.

    A fingerprint is a string of ``'0'``/``'1'`` characters (64 of them for
    any text that yields keywords).  Two fingerprints are compared with the
    Hamming distance via :meth:`getDistance`.
    """

    # Width, in bits, of the per-keyword hash produced by ``string_hash``.
    _HASH_BITS = 64

    # ``simHash`` used to re-read the stop-word file on every call; the file
    # name is constant, so it is loaded only once per process.
    _stop_words_loaded = False

    def simHash(self, content):
        """Return the SimHash fingerprint of ``content``.

        The text is segmented with jieba, the top 10 TF-IDF keywords are
        extracted, and every keyword votes ``+weight``/``-weight`` on each
        bit of its 64-bit hash.  Bits whose total vote is positive become
        ``'1'``, all others ``'0'``.

        Args:
            content: The text to fingerprint.

        Returns:
            A 64-character bit string, or ``'00'`` when no keyword could be
            extracted from ``content``.
        """
        seg = jieba.cut(content)
        if not SimHash._stop_words_loaded:
            jieba.analyse.set_stop_words('stopwords.txt')
            SimHash._stop_words_loaded = True
        keyWords = jieba.analyse.extract_tags(
            "|".join(seg), topK=10, withWeight=True)

        # Nothing to vote with: bail out before building any arrays.
        if not keyWords:
            return '00'

        keyList = []
        for feature, weight in keyWords:
            # Scale the TF-IDF weight to a positive integer vote.
            weight = int(weight * 10) + 1
            binstr = self.string_hash(feature)
            keyList.append(
                [weight if c == '1' else -weight for c in binstr])

        # Column-wise sum of all keyword votes.
        listSum = np.sum(np.array(keyList), axis=0)
        return ''.join('1' if i > 0 else '0' for i in listSum)

    def string_hash(self, source):
        """Hash ``source`` to a 64-character bit string.

        Args:
            source: The string (keyword) to hash.

        Returns:
            The low 64 bits of the hash as a zero-padded binary string.
            An empty ``source`` hashes to 64 zeros, so the result is always
            a string that callers can iterate bit by bit.
        """
        if source == "":
            return '0' * self._HASH_BITS

        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        # x is non-negative here (masked, then XOR-ed with a length), so
        # formatting the low 64 bits needs no sign handling.
        return format(x & ((1 << self._HASH_BITS) - 1),
                      '0{}b'.format(self._HASH_BITS))

    def getDistance(self, hashstr1, hashstr2):
        """Compute the Hamming distance between two simhash strings.

        Fingerprints of different lengths (``simHash`` returns ``'00'`` for
        text without keywords) are left-padded with ``'0'`` to a common
        width, so the short one is treated as an all-zero fingerprint
        instead of raising ``IndexError`` or being compared on 2 bits only.

        Args:
            hashstr1: First fingerprint, a string of ``'0'``/``'1'``.
            hashstr2: Second fingerprint, a string of ``'0'``/``'1'``.

        Returns:
            The number of bit positions in which the fingerprints differ.
        """
        width = max(len(hashstr1), len(hashstr2))
        bits1 = hashstr1.zfill(width)
        bits2 = hashstr2.zfill(width)
        return sum(1 for a, b in zip(bits1, bits2) if a != b)
if __name__ == '__main__':
    # Script entry point: score the candidate questions in tagsimilar.xlsx
    # against one question picked from question.xlsx and write the scores
    # to Search_result.xls.
    simhash = SimHash()
    file = pd.read_excel("tagsimilar.xlsx")
    questions = pd.read_excel("question.xlsx")
    mid5 = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = mid5.add_sheet('Sheet', cell_overwrite_ok=True)

    # Index column and header row of the result sheet.
    for i in range(1, 101):
        sheet.write(i, 0, i)
    sheet.write(0, 0, 'INDEX')
    sheet.write(0, 1, 'SUBJECT')
    sheet.write(0, 2, 'SIMILARITY')

    # Weights of the four similarity components.
    text_w = 10
    pic_w = 2
    typ_w = 15
    bac_w = 5

    # Base scores of the three categorical components.
    pic_stand = 1 / (math.sqrt(14028))
    typ_stand = 1 / (math.sqrt(6339))
    bac_stand = 1 / (math.sqrt(8085))

    target = int(input("请输入需要检测相似度的题号:"))

    # Question under test, with whitespace stripped.
    str1 = questions.values[target - 1][1].replace("\n", "").replace(
        "\t", "").replace(" ", "")
    pic = questions.values[target - 1][2]
    typ = questions.values[target - 1][3]
    bac = questions.values[target - 1][4]

    # The fingerprint of the question under test does not change between
    # candidates, so compute it once instead of once per row.
    s1 = simhash.simHash(str1)

    # At most 100 candidates; never read past the end of a shorter sheet.
    for i in range(0, min(100, len(file.values))):
        # Candidate question for this output row, whitespace stripped.
        str2 = file.values[i][1].replace("\r", "").replace("\n", "").replace(
            "\t", "").replace(" ", "")
        sheet.write(i + 1, 1, str2)

        s2 = simhash.simHash(str2)
        text_num = (64 - simhash.getDistance(s1, s2)) / 64 / 6

        # A matching category doubles that component's base score.
        if file.values[i][2] == pic:
            pic_num = pic_stand * 2
        else:
            pic_num = pic_stand

        # Uses typ_stand: the original reused pic_stand here by mistake,
        # which left typ_stand unused.
        if file.values[i][3] == typ:
            typ_num = typ_stand * 2
        else:
            typ_num = typ_stand

        if file.values[i][4] == bac:
            bac_num = bac_stand * 2
        else:
            bac_num = bac_stand

        similarity = (text_num * text_w + pic_num * pic_w
                      + typ_num * typ_w + bac_num * bac_w)

        sheet.write(i + 1, 2, similarity)

    print("您要查找与该题相似的题目:")
    print(str1)
    print("----------------------------")
    print("查询成功, 请查看文件Search_result.xls!")
    print("----------------------------")
    save_path = "Search_result.xls"
    mid5.save(save_path)