all (414)

# 6030048821 (16.12) 1 (2021-02-26 21:58)
def get_unique(words):
    """Return the distinct items of `words`, keeping first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient (shared words / all distinct words).

    Repeated words are counted once, and two empty lists give 0.0 instead
    of raising ZeroDivisionError.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    intersect = [w for w in get_unique(words_1) if w in words_2]
    return len(intersect) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Tweets sharing no word with the query are left out; equal coefficients
    are ordered by ascending tweet id.
    """
    top = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            top.append([tweet_id, jac])
    top.sort(key=lambda s: (-s[1], s[0]))
    return top[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a '#id (coef)' header and the wrapped tweet text.

    Text lines carry a two-space indent and stay within `print_width`
    characters unless a single word is longer than the available room.
    """
    cont = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    s = ' '
    for word in cont:
        # An empty line always accepts the next word; without this guard an
        # over-long word could never be placed and the loop would not end.
        if s != ' ' and len(s + ' ' + word) > print_width:
            print(s)
            s = ' '
        s += ' ' + word
    print(s)
#--------------------------------------------
# 6030239321 (13.76) 2 (2021-03-01 22:40)
def get_unique(words):
    """Return the distinct items of `words` in the order they first appear."""
    uniq = []
    for word in words:
        if word not in uniq:
            uniq.append(word)
    return uniq


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count; 0.0 when both lists are empty."""
    first, second = set(words_1), set(words_2)
    shared = len(first & second)
    total = len(first) + len(second) - shared
    return shared / total if total else 0.0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs ranked by similarity.

    Only tweets with a coefficient above zero are reported; ties are
    ordered by ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            ranked.append([tweet_id, jac])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a '#id (coef)' header, then the wrapped tweet text.

    Text lines are indented by two spaces and kept within `print_width`
    characters unless a single word is longer than that.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    width = print_width - 2  # room left after the two-space indent
    lines = []
    current = []
    count = 0
    for word in tweet_content.split(' '):
        extra = len(word) if not current else len(word) + 1
        if current and count + extra > width:
            lines.append(current)
            current, count = [word], len(word)
        else:
            current.append(word)
            count += extra
    lines.append(current)
    for parts in lines:
        print("  " + " ".join(parts))
#--------------------------------------------
# 6030380021 (12.13) 3 (2021-03-01 23:56)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall).

    Returns 0.0 when both lists are empty rather than dividing by zero.
    """
    everything = get_unique(words_1 + words_2)
    if not everything:
        return 0.0
    common = [w for w in get_unique(words_1) if w in words_2]
    return len(common) / len(everything)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the `n` most similar tweets as [tweet_id, coefficient] pairs.

    Pairs are sorted by descending coefficient, then ascending id; tweets
    with a coefficient of zero are omitted.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        jaccards = jaccard(tweet, norm_query)
        if jaccards > 0:
            top_n.append([i, jaccards])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped content.

    Wrapping is done per line (two-space indent, at most `print_width`
    characters) instead of by absolute character offset in the tweet.
    """
    output = ["", "#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")"]
    line = ""
    for word in tweet_content.split(" "):
        if not line:
            line = "  " + word
        elif len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            output.append(line)
            line = "  " + word
    output.append(line)
    print("\n".join(output))
#--------------------------------------------
# 6030924521 (15.15) 4 (2021-03-01 16:55)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the unrounded Jaccard coefficient of the two word lists.

    Rounding is left to the display code so ranking is not distorted;
    two empty lists give 0.0.
    """
    w = get_unique(words_1 + words_2)
    if not w:
        return 0.0
    s = sum(1 for i in get_unique(words_1) if i in words_2)
    return s / len(w)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, highest first.

    Tweets with a zero coefficient are never reported, but matches already
    found are kept when fewer than `n` tweets match.
    """
    scores = [jaccard(tweet, norm_query) for tweet in norm_tweets]
    order = sorted(range(len(scores)), key=lambda idx: (-scores[idx], idx))
    return [[idx, scores[idx]] for idx in order[:n] if scores[idx] > 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped content.

    Empty tokens produced by repeated spaces are dropped; text lines are
    indented by two spaces and limited to `print_width` characters.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = [w for w in tweet_content.split(' ') if w != '']
    line = ''
    for w in words:
        candidate = '  ' + w if line == '' else line + ' ' + w
        if line != '' and len(candidate) > print_width:
            print(line)
            line = '  ' + w
        else:
            line = candidate
    if line != '':
        print(line)
#--------------------------------------------
# 6130097621 (14.89) 5 (2021-03-01 22:56)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words += [i]
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall).

    The union is built once, regardless of which list is longer, so lists
    of equal length are no longer counted twice. Empty input gives 0.0.
    """
    y = get_unique(words_1 + words_2)
    if not y:
        return 0.0
    x = [i for i in get_unique(words_1) if i in words_2]
    return len(x) / len(y)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    Pairs are ordered by descending coefficient and, for ties, ascending
    tweet id. Fewer than `n` matches simply yields a shorter list.
    """
    top = []
    for tweet_id, tweet in enumerate(norm_tweets):
        x = jaccard(tweet, norm_query)
        if x > 0:
            top += [[tweet_id, x]]
    top.sort(key=lambda pair: (-pair[1], pair[0]))
    return top[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print one blank line, the '#id (coef)' header and the wrapped text.

    Words are split on any whitespace; each text line has a two-space
    indent and is at most `print_width` characters when the words allow.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    ans = ""
    for i in tweet_content.split():
        if ans == "":
            ans = "  " + i
        elif len(ans) + 1 + len(i) <= print_width:
            ans += " " + i
        else:
            print(ans)
            ans = "  " + i
    if ans != "":
        print(ans)
#--------------------------------------------
# 6130917221 (9.98) 6 (2021-03-01 23:19)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall); 0.0 if empty."""
    words_1 = get_unique(words_1)
    words_2 = get_unique(words_2)
    y = len(get_unique(words_1 + words_2))
    if y == 0:
        return 0.0
    x = sum(1 for i in words_1 if i in words_2)
    return x / y


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, most similar first.

    The sort key is the coefficient (descending) and then the tweet id
    (ascending); tweets with a zero coefficient are skipped.
    """
    scored = []
    for i, tweet in enumerate(norm_tweets):
        x = jaccard(tweet, norm_query)
        if x > 0:
            # Negating the coefficient lets a plain ascending sort rank the
            # best match first while keeping ids ascending within ties.
            scored.append([-x, i])
    scored.sort()
    return [[i, -neg] for neg, i in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header, then the wrapped text.

    The header is emitted before any text line; text lines are indented by
    two spaces and limited to `print_width` characters.
    """
    words = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    s = ' '
    for i in words:
        x = s + ' ' + i
        if s != ' ' and len(x) > print_width:
            print(s)
            s = '  ' + i
        else:
            s = x
    print(s)
#--------------------------------------------
# 6130924621 (7.65) 7 (2021-03-01 23:58)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for x in words:
        if x not in unique_words:
            unique_words.append(x)
    return unique_words


def jaccard(words_1, words_2):
    """Return the exact Jaccard coefficient as a float in [0, 1].

    The value is not rounded (rounding to an integer collapsed every
    result to 0 or 1); two empty lists give 0.0.
    """
    words_1 = get_unique(words_1)
    words_2 = get_unique(words_2)
    y = len(get_unique(words_1 + words_2))
    if y == 0:
        return 0.0
    x = 0
    for i in words_1:
        if i in words_2:
            x += 1
    return x / y


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, most similar first.

    Zero-coefficient tweets are excluded and ties are ordered by ascending
    tweet id. Asking for more pairs than exist returns what is available.
    """
    jaccard_norm = []
    for i, tweet in enumerate(norm_tweets):
        jc_co = jaccard(tweet, norm_query)
        if jc_co > 0:
            jaccard_norm.append([jc_co, i])
    jaccard_norm.sort(key=lambda k: (k[0], -k[1]))
    best_first = jaccard_norm[::-1]
    return [[i, jc_co] for jc_co, i in best_first[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header, then the wrapped text.

    The header comes first; text lines use a two-space indent and are
    limited to `print_width` characters.
    """
    words = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    p = ' '
    for i in words:
        x = p + ' ' + i
        if p != ' ' and len(x) > print_width:
            print(p)
            p = '  ' + i
        else:
            p = x
    print(p)
#--------------------------------------------
# 6230041021 (17.00) 8 (2021-02-28 23:52)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count; 0.0 when both are empty."""
    x1 = get_unique(words_1)
    x2 = get_unique(words_2)
    st = sum(1 for i in x1 if i in x2)
    E_st = len(x1) + len(x2) - st
    if E_st == 0:
        return 0.0
    return st / E_st


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id. When fewer
    than `n` tweets match, the shorter list is returned instead of raising
    IndexError.
    """
    fake_all = []
    for i, tweet in enumerate(norm_tweets):
        coefficient = jaccard(tweet, norm_query)
        if coefficient > 0:
            # -i makes the descending sort put smaller ids first in a tie.
            fake_all.append([coefficient, -i])
    fake_all.sort(reverse=True)
    return [[-neg_id, coefficient] for coefficient, neg_id in fake_all[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the '#id (coef)' header and the wrapped text.

    Each text line starts with two spaces; every word is followed by one
    space, and a line is flushed before a word that would exceed
    `print_width`.
    """
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    word_lines = '  '
    for word in tweet_content.split(' '):
        if len(word_lines) + len(word) > print_width:
            print(word_lines)
            word_lines = '  '
        word_lines += word + ' '
    if word_lines != '  ':
        print(word_lines)
#--------------------------------------------
# 6230092021 (18.01) 9 (2021-02-28 12:54)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (shared + only-in-1 + only-in-2) for two word lists.

    Both lists are expected to hold distinct words. Returns 0.0 when both
    lists are empty.
    """
    c = 0
    only_1 = 0
    for word in words_1:
        if word in words_2:
            c += 1
        else:
            only_1 += 1
    only_2 = sum(1 for word in words_2 if word not in words_1)
    all_num = only_1 + only_2 + c
    if all_num == 0:
        return 0.0
    return c / all_num


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Tweets with no word in common with the query are excluded; ties are
    ordered by ascending tweet id.
    """
    a = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            a.append([i, coef])
    a.sort(key=lambda pair: (-pair[1], pair[0]))
    return a[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Each text line starts with two spaces and every word is followed by a
    single space; a line is flushed before a word that would not fit.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    content = '  '
    for i in tweet_content.split(' '):
        if len(content) + len(i) <= print_width:
            content += i + ' '
        else:
            print(content)
            content = '  ' + i + ' '
    print(content)
#--------------------------------------------
# 6230131921 (18.01) 10 (2021-03-01 21:23)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order.

    The input list is left untouched.
    """
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words
#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall).

    Works for lists that contain repeats and returns 0.0 when both lists
    are empty (the sorted-merge version indexed element 0 unconditionally).
    """
    first = set(words_1)
    second = set(words_2)
    b = first | second
    if not b:
        return 0.0
    c = first & second
    return len(c) / len(b)
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Zero-coefficient tweets are excluded; ties are ordered by ascending id.
    """
    ranked = []
    for i, tweet in enumerate(norm_tweets):
        value = jaccard(tweet, norm_query)
        if value > 0:
            ranked.append([-value, i])
    ranked.sort()
    return [[i, -neg_value] for neg_value, i in ranked[:n]]
#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text lines start with two spaces, each word is followed by one space,
    and a line is flushed before a word that would exceed `print_width`.
    """
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = "  "
    for word in tweet_content.split(" "):
        if len(line) + len(word) > print_width:
            print(line)
            line = "  "
        line += word + " "
    if line != "  ":
        print(line)
#--------------------------------------------
# 6230133121 (20.00) 11 (2021-03-01 16:54)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    words_sum = get_unique(words_1 + words_2)
    if not words_sum:
        return 0.0
    words_repeat = [e for e in get_unique(words_1) if e in words_2]
    return len(words_repeat) / len(words_sum)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    The tweet id is the position in `norm_tweets`; it comes from
    enumeration, so two tweets with identical word lists keep their own
    ids. Ordered by descending coefficient, then ascending id.
    """
    jc = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            jc.append([-jac, tweet_id])
    jc.sort()
    return [[b, -a] for a, b in jc[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the '#id (coef)' header and the wrapped text.

    Text lines are indented by two spaces; a word is moved to a new line
    when appending it would exceed `print_width`.
    """
    print(" ")
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    tweet = ' '
    for e in tweet_content.split(' '):
        if len(tweet + ' ' + e) <= print_width:
            tweet += ' ' + e
        else:
            print(tweet)
            tweet = '  ' + e
    print(tweet)
#--------------------------------------------
# 6230153721 (17.95) 12 (2021-03-01 18:38)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall); 0.0 if empty."""
    d = []
    y = []
    for i in words_1:
        if i in words_2 and i not in d:
            d.append(i)
        if i not in y:
            y.append(i)
    for i in words_2:
        if i not in y:
            y.append(i)
    if not y:
        return 0.0
    return len(d) / len(y)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Every tweet is de-duplicated before comparison. Tweets with a zero
    coefficient are excluded, and ties are ordered by ascending tweet id
    through the sort key rather than by patching indices afterwards.
    """
    ranked = []
    for i, tweet in enumerate(norm_tweets):
        x = jaccard(norm_query, get_unique(tweet))
        if x > 0:
            ranked.append([i, x])
    ranked.sort(key=lambda item: (-item[1], item[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Words are split on any whitespace; text lines are indented by two
    spaces and limited to `print_width` characters. Nothing is printed
    after the header when the tweet has no words.
    """
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    b = ' '
    for word in tweet_content.split():
        if b != ' ' and len(b) + len(' ' + word) > print_width:
            print(b)
            b = ' '
        b += ' ' + word
    if b != ' ':
        print(b)
#--------------------------------------------
# 6230154321 (18.01) 13 (2021-03-01 20:27)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for n in words:
        if n not in unique_words:
            unique_words.append(n)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall); 0.0 if empty."""
    c = get_unique(words_1 + words_2)
    if not c:
        return 0.0
    a = [e for e in get_unique(words_1) if e in words_2]
    return len(a) / len(c)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Each tweet is de-duplicated first. Zero-coefficient tweets are left
    out, ties go to the smaller tweet id, and `n == 0` yields an empty
    list (a negative-index slice used to return everything in that case).
    """
    e = []
    for h, tweet in enumerate(norm_tweets):
        d = jaccard(get_unique(tweet), norm_query)
        if d > 0:
            e.append([d, -h])
    e.sort(reverse=True)
    return [[-neg_h, d] for d, neg_h in e[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the '#id (coef)' header and the wrapped text.

    Text lines start with two spaces and each word is followed by one
    space; a line is flushed before a word that would exceed `print_width`.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    e = ' ' + ' '
    for i in tweet_content.split(' '):
        if len(e + i) > print_width:
            print(e)
            e = ' ' + ' ' + i + ' '
        else:
            e += i + ' '
    print(e)
#--------------------------------------------
# 6230444321 (16.94) 14 (2021-03-01 21:39)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count for two word lists.

    Both lists are expected to hold distinct words. Returns 0.0 when both
    lists are empty.
    """
    same = 0
    total = len(words_1)
    for word in words_2:
        if word in words_1:
            same += 1
        else:
            total += 1
    if total == 0:
        return 0.0
    return same / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    The coefficient is computed once per tweet. Ordered by descending
    coefficient, then ascending tweet id; fewer than `n` matches returns a
    shorter list instead of raising IndexError.
    """
    data = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            data.append([-coef, i])
    data.sort()
    return [[i, -neg_coef] for neg_coef, i in data[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Words are split on any whitespace; lines are indented by two spaces
    and limited to `print_width`. The last line is always printed, even
    when its text equals an earlier line.
    """
    print("\n#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    content = []
    message = " "
    for word in tweet_content.split():
        if message == " " or len(message + " " + word) <= print_width:
            message = message + " " + word
        else:
            content.append(message)
            message = "  " + word
    content.append(message)
    for e in content:
        print(e)
#--------------------------------------------
# 6230585121 (9.33) 15 (2021-03-01 23:57)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for c in words:
        if c not in unique_words:
            unique_words.append(c)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall); 0.0 if empty."""
    b = get_unique(words_1 + words_2)
    if not b:
        return 0.0
    shared = [c for c in get_unique(words_1) if c in words_2]
    return len(shared) / len(b)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Tweet ids are zero-based positions in `norm_tweets`. Zero-coefficient
    tweets are skipped and ties are ordered by ascending tweet id.
    """
    d = []
    for i, tweet in enumerate(norm_tweets):
        b = jaccard(tweet, norm_query)
        if b > 0:
            d.append([i, b])
    d.sort(key=lambda pair: (-pair[1], pair[0]))
    return d[0:n:1]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    The text is wrapped at word boundaries: every line has a two-space
    indent and stays within `print_width` characters when possible.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = ""
    for word in tweet_content.split(" "):
        if not line:
            line = "  " + word
        elif len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#--------------------------------------------
# 6231004021 (20.00) 16 (2021-02-27 13:06)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall).

    Returns 0.0 when both lists are empty instead of raising
    ZeroDivisionError.
    """
    total_unique_word = len(get_unique(words_1 + words_2))
    if total_unique_word == 0:
        return 0.0
    unique_words_2 = get_unique(words_2)
    total_duplicate_word = sum(
        1 for word in get_unique(words_1) if word in unique_words_2
    )
    return total_duplicate_word / total_unique_word


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    Pairs are sorted by descending coefficient; the sort is stable, so
    equal coefficients stay in ascending tweet-id order.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([i, coef])
    return sorted(top_n, key=lambda x: x[1], reverse=True)[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text is streamed word by word: each line starts with two spaces, each
    word is followed by one space, and a new line is started before a word
    that would push the line past `print_width`.
    """
    print()
    print(f'#{tweet_id} ({round(jc_coef, 2)})')
    width = 0  # characters already written after the indent
    print('  ', end='')
    for word in tweet_content.split(' '):
        if len(word) + width > print_width - 2:
            print(f'\n  {word} ', end='')
            width = len(word) + 1
        else:
            print(f'{word} ', end='')
            width += len(word) + 1
    print()
# --------------------------------------------
# 6231008621 (19.48) 17 (2021-03-01 20:55)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for w in words:
        if w not in unique_words:
            unique_words.append(w)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall).

    Each list is de-duplicated before counting, so repeated words cannot
    inflate the intersection. Two empty lists give 0.0.
    """
    nUnion = len(get_unique(words_1 + words_2))
    if nUnion == 0:
        return 0.0
    nIntersect = len(get_unique(words_1)) + len(get_unique(words_2)) - nUnion
    return nIntersect / nUnion


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.
    """
    top_n = []
    for i, t in enumerate(norm_tweets):
        j = jaccard(t, norm_query)
        if j != 0:
            top_n.append([i, j])
    top_n.sort(key=lambda t: (-t[1], t[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    The coefficient is shown as `round(jc_coef, 2)` (e.g. 0.5, not 0.50).
    Text lines are indented by two spaces; the separating space is included
    when checking a word against `print_width`.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    buffer = " "
    wordCount = 0
    for w in tweet_content.split(" "):
        if wordCount != 0 and len(buffer) + 1 + len(w) > print_width:
            print(buffer)
            buffer = " "
            wordCount = 0
        buffer += " " + w
        wordCount += 1
    if wordCount != 0:
        print(buffer)
#--------------------------------------------
# 6231012021 (20.00) 18 (2021-02-28 22:34)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (shared + only-in-1 + only-in-2) for two word lists.

    Both lists are expected to hold distinct words. Returns 0.0 when both
    lists are empty.
    """
    count_repeat_words = 0
    num_words_1 = 0
    for word in words_1:
        if word in words_2:
            count_repeat_words += 1
        else:
            num_words_1 += 1
    num_words_2 = sum(1 for word in words_2 if word not in words_1)
    all_num = num_words_1 + num_words_2 + count_repeat_words
    if all_num == 0:
        return 0.0
    return count_repeat_words / all_num


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    The coefficient is computed once per tweet. Ordered by descending
    coefficient, then ascending tweet id.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            x.append([i, coef])
    x.sort(key=lambda pair: (-pair[1], pair[0]))
    return x[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the '#id (coef)' header and the wrapped text.

    Text lines start with two spaces and each word is followed by one
    space; a line is flushed before a word that would exceed `print_width`.
    """
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    y = "  "
    for i in tweet_content.split(" "):
        if len(y) + len(i) <= print_width:
            y += i + " "
        else:
            print(y)
            y = "  " + i + " "
    print(y)
#--------------------------------------------
# 6231019521 (20.00) 19 (2021-03-01 17:58)
def get_unique(words):
    """Return the distinct items of `words` in sorted order.

    The caller's list is not modified (no in-place sort, no sentinel
    element inserted into it).
    """
    ordered = sorted(words)
    unique_words = []
    for word in ordered:
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall); 0.0 if empty."""
    a = get_unique(words_1 + words_2)
    if not a:
        return 0.0
    b = [m for m in get_unique(words_2) if m in words_1]
    return len(b) / len(a)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.
    """
    top = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top.append([-coef, i])
    top.sort()
    return [[i, -neg_coef] for neg_coef, i in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text lines start with two spaces and each word is followed by one
    space; a line is flushed before a word that would exceed `print_width`.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    a = '  '
    for b in tweet_content.split(' '):
        if len(a) + len(b) <= print_width:
            a += b + ' '
        else:
            print(a)
            a = '  ' + b + ' '
    print(a)
#--------------------------------------------
# 6231205921 (14.00) 20 (2021-03-01 20:20)
def get_unique(words):
    """Return the distinct items of `words` in sorted order.

    Works on a sorted copy, so the caller's list keeps its order.
    """
    return sorted(set(words))


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count; 0.0 when both are empty."""
    first = set(words_1)
    second = set(words_2)
    c = len(first & second)
    a = len(first) + len(second) - c
    if a == 0:
        return 0.0
    return c / a


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Every ranked entry is converted to [id, coefficient] (the old loop
    kept only the last one). Zero-coefficient tweets are excluded and ties
    are ordered by ascending tweet id.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            x.append([-coef, i])
    x.sort()
    return [[s[1], -s[0]] for s in x[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text lines start with two spaces and each word is followed by one
    space; a line is flushed before a word that would exceed `print_width`.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    a = "  "
    for word in tweet_content.split(" "):
        if len(a) + len(word) > print_width:
            print(a)
            a = "  "
        a += word + " "
    print(a)
#--------------------------------------------
# 6231207121 (17.00) 21 (2021-02-28 02:13)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count for two word lists.

    The shared count does not depend on how the two lists compare to each
    other. Two empty lists give 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    nsame = sum(1 for e in first if e in second)
    nunion = len(first) + len(second) - nsame
    if nunion == 0:
        return 0.0
    return nsame / nunion


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Zero-coefficient tweets are excluded, ties are ordered by ascending
    tweet id, and fewer than `n` candidates returns a shorter list instead
    of raising IndexError.
    """
    y = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            y.append([-coef, i])
    y.sort()
    return [[i, -neg_coef] for neg_coef, i in y[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text lines start with two spaces and each word is followed by one
    space. `c` tracks the characters used after the indent, trailing space
    included, which is why it is compared against `print_width - 1`.
    """
    width = print_width - 1
    print("")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    c = 0
    p = "  "
    for e in tweet_content.split(" "):
        c += len(e) + 1
        if c <= width:
            p += e + " "
        else:
            print(p)
            p = "  " + e + " "
            c = len(e) + 1
    print(p)
#--------------------------------------------
# 6231213921 (16.94) 22 (2021-03-01 23:55)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count; 0.0 when both are empty."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    similarity_words = [word for word in first if word in second]
    total = len(first) + len(second) - len(similarity_words)
    if total == 0:
        return 0.0
    return len(similarity_words) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Ids come from enumeration, so identical tweets keep distinct ids.
    Zero-coefficient tweets are excluded, ties go to the smaller id, and a
    short candidate list no longer raises IndexError.
    """
    mix = []
    for x, e in enumerate(norm_tweets):
        coef = jaccard(e, norm_query)
        if coef > 0:
            mix.append([coef, -x])
    mix.sort(reverse=True)
    return [[-neg_x, coef] for coef, neg_x in mix[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Words are split on any whitespace. Every text line, including wrapped
    ones, is indented by two spaces and limited to `print_width`.
    """
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    z = []
    y = ''
    for word in tweet_content.split():
        if y == '':
            y = '  ' + word
        elif len(y) + len(word) + 1 <= print_width:
            y += ' ' + word
        else:
            z.append(y)
            y = '  ' + word
    if y != '':
        z.append(y)
    for line in z:
        print(line)
#--------------------------------------------
# 6231214521 (18.50) 23 (2021-03-01 23:27)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count; 0.0 when both are empty."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    x = sum(1 for e in first if e in second)
    c = len(first) + len(second) - x
    if c == 0:
        return 0.0
    return x / c


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs with coefficient > 0.

    The coefficient is computed once per tweet. The list is built in id
    order and then sorted by descending coefficient; the sort is stable,
    so ties stay in ascending id order.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([i, coef])
    top_n.sort(reverse=True, key=lambda pair: pair[1])
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text lines start with two spaces and each word is followed by one
    space; a line is flushed before a word that would exceed `print_width`.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    pp = "  "
    for word in tweet_content.split(" "):
        if len(pp) + len(word) > print_width:
            print(pp)
            pp = "  "
        pp += word + " "
    print(pp)
#--------------------------------------------------------
# 6231220221 (14.73) 24 (2021-03-01 23:55)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / total-distinct word count; 0.0 when both are empty."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    similar_words = [word for word in first if word in second]
    total = len(first) + len(second) - len(similar_words)
    if total == 0:
        return 0.0
    return len(similar_words) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Ids come from enumeration, so identical tweets keep distinct ids.
    Zero-coefficient tweets are excluded, ties go to the smaller id, and a
    short candidate list no longer raises IndexError.
    """
    mix = []
    for k, a in enumerate(norm_tweets):
        coef = jaccard(a, norm_query)
        if coef > 0:
            mix.append([coef, -k])
    mix.sort(reverse=True)
    return [[-neg_k, coef] for coef, neg_k in mix[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Words are split on any whitespace. A line may be exactly
    `print_width` characters long (the comparison is inclusive), and every
    line is indented by two spaces.
    """
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    c = []
    b = ''
    for word in tweet_content.split():
        if b == '':
            b = '  ' + word
        elif len(b) + len(word) + 1 <= print_width:
            b += ' ' + word
        else:
            c.append(b)
            b = '  ' + word
    if b != '':
        c.append(b)
    for line in c:
        print(line)
#--------------------------------------------
# 6231222521 (18.01) 25 (2021-03-01 15:36)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order."""
    result = []
    for w in words:
        if w not in result:
            result.append(w)
    return result


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall); 0.0 if empty."""
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    intersect = [i for i in get_unique(words_1) if i in words_2]
    return len(intersect) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Tweets whose coefficient is zero are dropped; ties are ordered by
    ascending tweet id.
    """
    q = [[-jaccard(tweet, norm_query), i] for i, tweet in enumerate(norm_tweets)]
    q = [e for e in q if e[0] < 0]  # negated coefficient: < 0 means a match
    q.sort()
    return [[e[1], -e[0]] for e in q[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    The first word always opens the first line. Text lines start with two
    spaces and each word is followed by one space.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    e = "  " + words[0] + " "
    for word in words[1:]:
        if len(e) + len(word) <= print_width:
            e += word + " "
        else:
            print(e)
            e = "  " + word + " "
    print(e)
#--------------------------------------------
# 6231223121 (15.55) 26 (2021-02-27 20:46)
def get_unique(words):
    """Return the distinct items of `words` in sorted order.

    The caller's list is not sorted or otherwise modified.
    """
    unique_words = []
    for word in sorted(words):
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct shared words) / (distinct words overall); 0.0 if empty."""
    down = len(get_unique(words_1 + words_2))
    if down == 0:
        return 0.0
    up = sum(1 for word in get_unique(words_1) if word in words_2)
    return up / down


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Coefficients are returned unrounded. Zero-coefficient tweets are
    excluded and ties are ordered by ascending tweet id.
    """
    t = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            t.append([i, coef])
    t.sort(key=lambda pair: (-pair[1], pair[0]))
    return t[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text lines are indented by two spaces and limited to `print_width`.
    A word that is too long for any line is printed alone on its own line,
    so the function always terminates.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    p1 = []
    c = 1  # line length so far: indent counted as 1, plus ' ' + word each
    for word in tweet_content.split(' '):
        if p1 and c + len(' ' + word) > print_width:
            print('  ' + ' '.join(p1))
            p1 = []
            c = 1
        p1.append(word)
        c += len(' ' + word)
    print('  ' + ' '.join(p1))
#--------------------------------------------
# 6231224821 (0.00) 27 (2021-03-01 23:58)
def get_unique(words):
    """Return the distinct items of `words`, preserving first-seen order.

    Operates on the argument it is given; nothing is read from stdin or
    printed.
    """
    return list(dict.fromkeys(words))


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Uses the arguments passed in (no `input()` calls) and returns 0.0 when
    both lists are empty.
    """
    unw1 = set(words_1)
    unw2 = set(words_2)
    unn = unw1.union(unw2)
    if not unn:
        return 0.0
    its = unw1.intersection(unw2)
    return len(its) / len(unn)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to `n` [tweet_id, coefficient] pairs, best match first.

    Zero-coefficient tweets are excluded; ties are ordered by ascending
    tweet id.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            x.append([-coef, i])
    x.sort()
    return [[e[1], -e[0]] for e in x[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the '#id (coef)' header and the wrapped text.

    Text lines are indented by two spaces and limited to `print_width`
    characters unless a single word is longer than that.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    x = ''
    for word in tweet_content.split(" "):
        if x == '':
            x = '  ' + word
        elif len(x) + 1 + len(word) > print_width:
            print(x)
            x = '  ' + word
        else:
            x += ' ' + word
    print(x)
#--------------------------------------------
# 6231510221 (18.01) 28 (2021-02-28 11:37)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists.

    Each list is de-duplicated first so repeated words cannot inflate the
    overlap; two empty lists give 0.0.
    """
    unique_1 = get_unique(words_1)
    unique_2 = get_unique(words_2)
    merged = get_unique(unique_1 + unique_2)
    if not merged:
        return 0.0
    # Every shared word appears twice in the concatenation, once in the union.
    shared = len(unique_1) + len(unique_2) - len(merged)
    return shared / len(merged)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are dropped. The sort is stable, so equal
    coefficients stay in ascending tweet-id order.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jaccard_coef = jaccard(tweet, norm_query)
        if jaccard_coef > 0:
            top_n.append([tweet_id, jaccard_coef])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Lines are collected first and printed afterwards; each has a two-space
    indent, no trailing blank, and fits in ``print_width`` when possible.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    lines = []
    line = '  '
    has_word = False
    for word in tweet_content.split(' '):
        if not has_word:
            line += word
            has_word = True
        elif len(line) + 1 + len(word) > print_width:
            lines.append(line)
            line = '  ' + word
        else:
            line += ' ' + word
    lines.append(line)
    for text in lines:
        print(text)
#--------------------------------------------
# 6231511921 (18.01) 29 (2021-03-01 10:11)
def get_unique(words):
    """Return the distinct words of ``words`` in ascending order.

    A sorted copy is scanned once, so the caller's list is untouched and a
    list holding a single distinct word yields that word rather than ``[]``.
    """
    ordered = sorted(words)
    unique_words = ordered[:1]
    for previous, current in zip(ordered, ordered[1:]):
        if previous != current:
            unique_words.append(current)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    same = 0
    for word in first:
        if word in second:
            same += 1
    total = len(first) + len(second) - same
    if total == 0:
        return 0.0
    return same / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; the stable sort keeps ties in
    ascending tweet-id order.
    """
    top_n = []
    for tweet_id in range(len(norm_tweets)):
        jc = jaccard(norm_tweets[tweet_id], norm_query)
        if jc > 0:
            top_n.append([tweet_id, jc])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Each text line is indented two spaces and fits in ``print_width``
    characters unless a single word is longer than that.
    """
    print("")
    print("#" + str(tweet_id) + " " + '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    start = 0
    while start < len(words):
        width = 2 + len(words[start])
        stop = start + 1
        # Extend the line while the next word (plus its separator) still fits.
        while stop < len(words) and width + 1 + len(words[stop]) <= print_width:
            width += 1 + len(words[stop])
            stop += 1
        print("  " + " ".join(words[start:stop]))
        start = stop
#--------------------------------------------
# 6231707621 (20.00) 30 (2021-02-26 13:59)
def get_unique(words):
    """Return the distinct words of ``words`` in ascending order.

    Sorting is done on a copy, so callers (including ``jaccard``) never see
    their lists reordered.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered) - 1):
        if ordered[i] != ordered[i + 1]:
            unique_words.append(ordered[i])
    if ordered:
        # The last element of a run is never emitted by the loop above.
        unique_words.append(ordered[-1])
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    combined = get_unique(words_1) + get_unique(words_2)
    union = get_unique(combined)
    if not union:
        return 0.0
    # Words present in both lists occur twice in ``combined``.
    return (len(combined) - len(union)) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    The coefficient of each tweet is computed once.
    """
    scored = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            scored.append([coef, -i])
    # Descending on [coef, -id] means highest coefficient, then lowest id.
    scored.sort(reverse=True)
    return [[-negated_id, coef] for coef, negated_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Every text line, including the last one, is indented by two spaces and
    kept within ``print_width`` characters when the words allow it.
    """
    words = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = '  ' + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) > print_width:
            print(line)
            line = '  ' + word
        else:
            line += ' ' + word
    print(line)
#--------------------------------------------
# 6231709921 (13.33) 31 (2021-02-26 23:36)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    unique_1 = get_unique(words_1)
    unique_2 = get_unique(words_2)
    shared = len([word for word in unique_1 if word in unique_2])
    total = len(get_unique(unique_1 + unique_2))
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    A single keyed sort is used, so the function always terminates, even
    when fewer than ``n`` tweets match.
    """
    candidates = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            candidates.append([tweet_id, coef])
    candidates.sort(key=lambda pair: (-pair[1], pair[0]))
    return candidates[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent. Each pass consumes at least one
    word, so a word longer than the line is printed alone instead of
    stalling the loop.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    i = 0
    while i < len(words):
        line = '  ' + words[i]
        i += 1
        while i < len(words) and len(line) + 1 + len(words[i]) <= print_width:
            line += ' ' + words[i]
            i += 1
        print(line)
#--------------------------------------------
# 6330170421 (18.01) 32 (2021-03-01 01:58)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    unique_1 = get_unique(words_1)
    unique_2 = get_unique(words_2)
    common = [word for word in unique_1 if word in unique_2]
    # Union = everything in the first list plus what only the second has.
    combined = unique_1 + [word for word in unique_2 if word not in unique_1]
    if not combined:
        return 0.0
    return len(common) / len(combined)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -negated] for negated, tweet_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters;
    no empty line is produced when a word fills or exceeds the whole width.
    """
    words = tweet_content.split(' ')
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    pending = []
    for word in words:
        joined = ' '.join(pending + [word])
        if pending and len(joined) > print_width - 2:
            print('  ' + ' '.join(pending))
            pending = [word]
        else:
            pending.append(word)
    print('  ' + ' '.join(pending))
#--------------------------------------------
# 6330171021 (18.01) 33 (2021-03-01 14:18)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists.

    The empty-union case returns 0 right away instead of falling through
    to the division.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    intersect = len([word for word in first if word in second])
    union = len(first) + len(second) - intersect
    if union == 0:
        return 0
    return intersect / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(norm_query, tweet)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    top_n = []
    for negated, tweet_id in keyed[:n]:
        top_n.append([tweet_id, -negated])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters
    unless a single word is longer than that.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    parts = tweet_content.split(' ')
    line = '  ' + parts[0]
    for part in parts[1:]:
        if len(line) + 1 + len(part) > print_width:
            print(line)
            line = '  ' + part
        else:
            line += ' ' + part
    print(line)
#--------------------------------------------
# 6330172721 (20.00) 34 (2021-02-27 19:03)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words += [word]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists.

    Repeated words count once; two empty lists give 0.0.
    """
    seen = []
    shared = 0
    for word in words_1:
        if word in seen:
            continue  # a repeated word must not be counted twice
        seen += [word]
        if word in words_2:
            shared += 1
    for word in words_2:
        if word not in seen:
            seen += [word]
    if not seen:
        return 0.0
    return shared / len(seen)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; the stable sort leaves ties in
    ascending tweet-id order.
    """
    def coefficient(pair):
        return pair[1]

    matches = []
    for tweet_id in range(len(norm_tweets)):
        jacc = jaccard(norm_tweets[tweet_id], norm_query)
        if jacc > 0:
            matches += [[tweet_id, jacc]]
    return sorted(matches, key=coefficient, reverse=True)[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent; a word too long for the line is
    printed alone without a preceding blank line.
    """
    print("\n" + '#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = []
    for txt in tweet_content.split(' '):
        if line and len("  " + " ".join(line + [txt])) > print_width:
            print("  " + " ".join(line))
            line = [txt]
        else:
            line += [txt]
    print("  " + " ".join(line))
#--------------------------------------------
# 6330173321 (19.68) 35 (2021-02-27 02:12)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order.

    The input list is only read; it is no longer emptied as a side effect.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    ins = 0
    for word in first:
        if word in second:
            ins += 1
    uni = len(first) + len(second) - ins
    if uni == 0:
        return 0.0
    return ins / uni


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            keyed.append([-coef, i])
    keyed.sort()
    return [[tweet_id, -negated] for negated, tweet_id in keyed][0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters
    unless a single word is longer than that.
    """
    rounded = round(jc_coef, 2)
    print(" ")
    print("#" + str(tweet_id) + " (" + str(rounded) + ")")
    content = tweet_content.split(' ')
    line = [content[0]]
    used = 2 + len(content[0])  # indent plus the words placed so far
    for word in content[1:]:
        if used + 1 + len(word) <= print_width:
            line.append(word)
            used += 1 + len(word)
        else:
            print("  " + " ".join(line))
            line = [word]
            used = 2 + len(word)
    print("  " + " ".join(line))
#--------------------------------------------
# 6330174021 (19.15) 36 (2021-02-28 13:46)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = set(words_1)
    second = set(words_2)
    tot_words = len(first | second)
    if tot_words == 0:
        return 0.0
    similar_words = len(first & second)
    return similar_words / tot_words


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; the stable sort keeps ties in
    ascending tweet-id order.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jacc = jaccard(tweet, norm_query)
        if jacc == 0:
            continue
        top_n.append([tweet_id, jacc])
    return sorted(top_n, reverse=True, key=lambda pair: pair[1])[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent. Empty words (from consecutive
    spaces) contribute their real width of zero characters plus separator.
    """
    print(end='\n')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = [words[0]]
    chars = len(words[0])  # characters of the words on the line, no spaces
    for word in words[1:]:
        # indent (2) + words + one space between each pair + the new word
        if 2 + chars + len(line) + len(word) > print_width:
            print('  ' + ' '.join(line))
            line = []
            chars = 0
        line.append(word)
        chars += len(word)
    print('  ' + ' '.join(line))
#--------------------------------------------
# 6330176221 (18.01) 37 (2021-03-01 13:47)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    inter = [word for word in first if word in second]
    union = len(first) + len(second) - len(inter)
    if union == 0:
        return 0.0
    return len(inter) / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            keyed.append([jac, -tweet_id])
    # Largest [coef, -id] first: highest coefficient, then lowest id.
    keyed.sort(reverse=True)
    return [[-negated_id, jac] for jac, negated_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters;
    an over-long word is printed alone rather than after an empty line.
    """
    words = tweet_content.split(' ')
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    current = []
    for word in words:
        if current and len(' '.join(current)) + 1 + len(word) > print_width - 2:
            print('  ' + ' '.join(current))
            current = [word]
        else:
            current.append(word)
    print('  ' + ' '.join(current))
#--------------------------------------------
# 6330177921 (20.00) 38 (2021-02-27 03:07)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    everything = get_unique(first + list(words_2))
    if not everything:
        return 0.0
    common = 0
    for word in first:
        if word in words_2:
            common += 1
    return common / len(everything)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    Asking for more results than there are tweets simply returns fewer.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        keyed.append([jaccard(norm_tweets[i], norm_query), -i])
    keyed.sort(reverse=True)
    top_n = []
    # Slicing (rather than indexing up to n) is safe when n > len(keyed).
    for coef, negated_id in keyed[:n]:
        if coef <= 0:
            break
        top_n.append([-negated_id, coef])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Lines are gathered first, then printed; each has a two-space indent and
    no trailing blank.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    show = []
    line = "  " + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) > print_width:
            show.append(line)
            line = "  " + word
        else:
            line += " " + word
    show.append(line)
    for text in show:
        print(text)
#--------------------------------------------
# 6330178521 (18.47) 39 (2021-03-01 00:58)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists.

    Both lists are de-duplicated first; two empty lists give 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    sim = 0
    for word in first:
        if word in second:
            sim += 1
    total = len(first) + len(second) - sim
    if total == 0:
        return 0.0
    return sim / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            keyed.append([coef, -i])
    keyed.sort(reverse=True)
    top_n = [[-negated_id, coef] for coef, negated_id in keyed]
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The content is split on single spaces so runs of spaces in the tweet
    are preserved; text lines have a two-space indent.
    """
    words = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = '  ' + words[0]
    for word in words[1:]:
        piece = ' ' + word
        if len(line) + len(piece) <= print_width:
            line += piece
        else:
            print(line)
            line = ' ' + piece
    print(line)
#--------------------------------------------
# 6330179121 (15.02) 40 (2021-02-27 22:27)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    shared = []
    union = []
    for word in words_1:
        if word in union:
            continue  # each distinct word is counted once
        union.append(word)
        if word in words_2:
            shared.append(word)
    for word in words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    Works for an empty tweet list as well.
    """
    pairs = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            pairs.append([i, coef])
    pairs.sort(key=lambda pair: (-pair[1], pair[0]))
    return pairs[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The content is split on single spaces; text lines have a two-space
    indent and each pass prints at least one word.
    """
    words = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    while words:
        line = '  ' + words[0]
        taken = 1
        while taken < len(words) and len(line) + 1 + len(words[taken]) <= print_width:
            line = line + ' ' + words[taken]
            taken += 1
        print(line)
        words = words[taken:]
#--------------------------------------------
# 6330180721 (17.00) 41 (2021-02-26 11:05)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    total = get_unique(first + second)
    if not total:
        return 0.0
    shared = 0
    for word in second:
        if word in first:
            shared += 1
    return shared / len(total)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    scored = []
    for i in range(len(norm_tweets)):
        jac = jaccard(norm_tweets[i], norm_query)
        if jac > 0:
            scored.append([i, jac])
    # Stable descending sort: equal coefficients keep ascending id order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters
    unless a single word is longer than that.
    """
    words = tweet_content.split(' ')
    shown = str(round(jc_coef, 2))
    print('')
    print('#' + str(tweet_id) + ' (' + shown + ')')
    row = [words[0]]
    for word in words[1:]:
        if len('  ' + ' '.join(row) + ' ' + word) <= print_width:
            row.append(word)
        else:
            print('  ' + ' '.join(row))
            row = [word]
    print('  ' + ' '.join(row))
#--------------------------------------------
# 6330181321 (20.00) 42 (2021-03-01 16:14)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    w1 = get_unique(words_1)
    w2 = get_unique(words_2)
    shared = 0
    for word in w1:
        if word in w2:
            shared += 1
    total = len(w1) + len(w2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    One keyed sort replaces the repeated neighbour-swap passes.
    """
    matches = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters;
    an over-long first word no longer produces an empty text line.
    """
    words = tweet_content.split(' ')
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    kept = []
    for word in words:
        attempt = '  ' + ' '.join(kept + [word])
        if kept and len(attempt) > print_width:
            print('  ' + ' '.join(kept))
            kept = [word]
        else:
            kept.append(word)
    print('  ' + ' '.join(kept))
#--------------------------------------------
# 6330182021 (14.12) 43 (2021-03-01 09:52)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists.

    Each list is de-duplicated before counting, so repeats inside one list
    are not mistaken for shared words; two empty lists give 0.0.
    """
    total = get_unique(words_1) + get_unique(words_2)
    distinct = get_unique(total)
    if not distinct:
        return 0.0
    return (len(total) - len(distinct)) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            keyed.append([coef, -tweet_id])
    best = sorted(keyed, reverse=True)[:n]
    top_n = []
    for coef, negated_id in best:
        top_n.append([-negated_id, coef])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and no trailing blank. Every pass
    consumes at least one word, so an over-long word cannot stall the loop.
    """
    words = tweet_content.split(' ')
    print(" ")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    i = 0
    while i < len(words):
        line = "  " + words[i]
        i = i + 1
        while i < len(words) and len(line) + 1 + len(words[i]) <= print_width:
            line += " " + words[i]
            i = i + 1
        print(line)
#--------------------------------------------
# 6330183621 (16.94) 44 (2021-02-25 23:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    bottom = len(get_unique(first + second))
    if bottom == 0:
        return 0.0
    # The overlap is symmetric, so one direction of the membership test suffices.
    common = sum(1 for word in first if word in second)
    return common / bottom


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    Requesting more results than there are matches returns just the matches.
    """
    found = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            found.append([tweet_id, coef])
    found.sort(key=lambda pair: (-pair[1], pair[0]))
    return found[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The content is split on single spaces (empty content prints an empty
    indented line instead of failing); text lines have a two-space indent.
    """
    print('')
    header = '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')'
    print(header)
    words = tweet_content.split(' ')
    line = '  ' + words[0]
    for word in words[1:]:
        if len(line) + len(word) + 1 > print_width:
            print(line)
            line = '  ' + word
        else:
            line = line + ' ' + word
    print(line)
#--------------------------------------------
# 6330184221 (20.00) 45 (2021-03-01 14:38)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    common = 0
    union = list(first)
    for word in first:
        if word in second:
            common += 1
    for word in second:
        if word not in first:
            union.append(word)
    if not union:
        return 0.0
    return common / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    Each tweet's coefficient is computed once.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            keyed.append([-coef, i])
    keyed.sort()
    return [[tweet_id, -negated] for negated, tweet_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters;
    a word too long for the line is printed alone.
    """
    print("")
    print("#" + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    text = words[0]
    for word in words[1:]:
        # 2 for the indent, 1 for the space in front of the new word
        if 2 + len(text) + 1 + len(word) <= int(print_width):
            text += ' ' + word
        else:
            print("  " + text)
            text = word
    print("  " + text)
#--------------------------------------------
# 6330185921 (20.00) 46 (2021-02-27 19:21)
def get_unique(words):
    """Return the distinct words of ``words`` in ascending order.

    Works on a sorted copy: the caller's list is neither reordered nor
    extended with a sentinel value.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered)):
        # Keep the last element of every run of equal words.
        if i == len(ordered) - 1 or ordered[i] != ordered[i + 1]:
            unique_words.append(ordered[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    union = get_unique(first + second)
    if not union:
        return 0.0
    shared = 0
    for word in first:
        if word in second:
            shared += 1
    return float(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            keyed.append([-coef, i])
    keyed.sort()
    top_n = [[tweet_id, -negated] for negated, tweet_id in keyed]
    return top_n[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The final line is flushed once after the loop, so a word that also
    occurs earlier in the tweet never triggers an early print.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = ' ' + ' ' + words[0]
    for i in range(1, len(words)):
        if len(line) + len(words[i]) + 1 <= print_width:
            line = line + ' ' + words[i]
        else:
            print(line)
            line = ' ' + ' ' + words[i]
    print(line)
#--------------------------------------------
# 6330186521 (17.75) 47 (2021-03-01 22:30)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    un1 = get_unique(words_1)
    un2 = get_unique(words_2)
    divisor = len(get_unique(un1 + un2))
    if divisor == 0:
        return 0.0
    repeated = [word for word in un1 if word in un2]
    return len(repeated) / divisor


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Every tweet is scored before the list is cut to ``n`` entries, and ids
    come from the position in ``norm_tweets`` (so identical tweets keep
    their own ids). Coefficient 0 is skipped; ties go to the smaller id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -negated] for negated, tweet_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters
    unless a single word is longer than that.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    para = tweet_content.split(' ')
    held = []
    for word in para:
        text = ' '.join(held + [word])
        if held and len(text) > print_width - 2:
            print('  ' + ' '.join(held))
            held = [word]
        else:
            held.append(word)
    print('  ' + ' '.join(held))
#--------------------------------------------
# 6330187121 (16.67) 48 (2021-02-27 23:05)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    nu = 0
    for word in first:
        if word in second:
            nu += 1
    de = len(first) + len(second) - nu
    if de == 0:
        return 0.0
    return nu / de
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero coefficients are filtered out before the list is cut to ``n``, so
    the cut only ever discards real matches of lower rank. Ties go to the
    smaller tweet id.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        negated = -jaccard(norm_tweets[i], norm_query)
        if negated < 0:
            keyed.append([negated, i])
    keyed.sort()
    top_n = []
    for negated, tweet_id in keyed[:n]:
        top_n.append([tweet_id, -negated])
    return top_n
#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent and fit in ``print_width`` characters
    unless a single word is longer than that.
    """
    print('')
    shown = round(jc_coef, 2)
    print('#' + str(tweet_id), '(' + str(shown) + ')')
    words = tweet_content.split(' ')
    finished = []
    line = words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) > print_width - 2:
            finished.append(line)
            line = word
        else:
            line += ' ' + word
    for text in finished:
        print('  ' + text)
    print('  ' + line)
#--------------------------------------------
# 6330188821 (16.00) 49 (2021-03-01 23:44)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    intersect = 0
    for word in first:
        if word in second:
            intersect += 1
    total = (len(first) + len(second)) - intersect
    if total == 0:
        return 0.0
    return intersect / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            keyed.append([-coef, i])
    keyed.sort()
    return [[tweet_id, -negated] for negated, tweet_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The running width of the current line is tracked explicitly so the
    text really wraps at ``print_width``; lines have a two-space indent.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    content = tweet_content.split(' ')
    text = '  ' + content[0]
    width = len(text)
    for word in content[1:]:
        if width + 1 + len(word) <= print_width:
            text += ' ' + word
        else:
            print(text)
            text = '  ' + word
        width = len(text)
    print(text)
#--------------------------------------------
# 6330189421 (18.44) 50 (2021-02-28 23:28)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    everything = get_unique(list(words_1) + list(words_2))
    if not everything:
        return 0.0
    both = 0
    for word in everything:
        if word in words_1 and word in words_2:
            both += 1
    return both / len(everything)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    ``n`` may exceed the number of tweets.
    """
    keyed = []
    for i in range(len(norm_tweets)):
        keyed.append([-jaccard(norm_tweets[i], norm_query), i])
    keyed.sort()
    top_n = []
    for negated, tweet_id in keyed[:n]:
        if -negated > 0:
            top_n.append([tweet_id, -negated])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    The content is split on single spaces so runs of spaces are preserved;
    text lines have a two-space indent.
    """
    print(" ")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(' ')
    line = "  " + words[0]
    for word in words[1:]:
        longer = line + " " + word
        if len(longer) > print_width:
            print(line)
            line = "  " + word
        else:
            line = longer
    print(line)
#--------------------------------------------
# 6330190021 (16.12) 51 (2021-03-01 01:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order.

    A new list is always returned and the argument is left as it was.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    union = get_unique(list(words_1) + list(words_2))
    if not union:
        return 0.0
    top = [word for word in get_unique(words_1) if word in words_2]
    return len(top) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    """
    info = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_query, norm_tweets[tweet_id])
        if coef > 0:
            info.append([-coef, tweet_id])
    info.sort()
    return [[tweet_id, -negated] for negated, tweet_id in info[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent. The first word of every line is
    taken unconditionally, so an over-long word cannot stall the loop.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    info = tweet_content.split(" ")
    position = 0
    while position < len(info):
        line = "  " + info[position]
        position += 1
        while position < len(info) and len(line + " " + info[position]) <= print_width:
            line += " " + info[position]
            position += 1
        print(line)
#--------------------------------------------
# 6330191621 (14.18) 52 (2021-03-01 14:02)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    # dict keys are unique and keep insertion order.
    return list({word: 0 for word in words})


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    totalword = get_unique(list(words_1) + list(words_2))
    if not totalword:
        return 0.0
    shared = [word for word in get_unique(words_1) if word in words_2]
    return len(shared) / len(totalword)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    The filter looks at the coefficient (not the tweet id), so tweet 0 is
    kept when it matches. Ties go to the smaller tweet id.
    """
    scored = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            scored.append([i, coef])
    scored.sort(key=lambda item: (-item[1], item[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    Wrapping happens between words (never inside one) and every text line
    carries a two-space indent.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    newlist = []
    current = []
    for word in tweet_content.split(' '):
        if current and 2 + len(' '.join(current + [word])) > print_width:
            newlist.append('  ' + ' '.join(current))
            current = [word]
        else:
            current.append(word)
    newlist.append('  ' + ' '.join(current))
    print('\n'.join(newlist))
#--------------------------------------------
# 6330192221 (19.10) 53 (2021-03-01 23:59)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    numer = 0
    for word in first:
        if word in second:
            numer += 1
    denom = len(first) + len(second) - numer
    if denom == 0:
        return 0.0
    return numer / denom


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above 0 are kept; ties go to the smaller tweet id.
    """
    top_n = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            top_n.append([i, coef])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The last word goes through the same width check as the others, and a
    single-word tweet is handled like any other. Lines are indented by two
    spaces.
    """
    print("\n" + "#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(' ')
    line = "  " + words[0]
    printed = len(line)
    for word in words[1:]:
        if printed + 1 + len(word) > print_width:
            print(line)
            line = "  " + word
        else:
            line += " " + word
        printed = len(line)
    print(line)
#--------------------------------------------
# 6330193921 (18.01) 54 (2021-02-27 12:37)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order.

    The result is a new list; the argument is not rotated or shortened.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = 0
    for word in second:
        if word in first:
            shared += 1
    total = (len(first) + len(second)) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are skipped; ties go to the smaller tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -negated] for negated, tweet_id in keyed][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Text lines have a two-space indent, no trailing blank, and fit in
    ``print_width`` characters unless a single word is longer than that.
    """
    print()
    print('#' + str(tweet_id), "(" + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = "  " + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#--------------------------------------------
# 6330194521 (17.89) 55 (2021-03-01 01:26)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    return list(dict.fromkeys(words))


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    first, second = set(words_1), set(words_2)
    everything = first | second
    if not everything:
        return 0.0
    return len(first & second) / len(everything)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets that share no word with the query are excluded. Equal
    coefficients are ordered by ascending tweet id.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print ``#id (coef)`` followed by the content wrapped to ``print_width``.

    Content lines carry a two-space indent that counts towards the width.
    """
    print()
    # The header starts with '#', as in the other submissions of this file.
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line_words = []
    used = 2  # the indent
    for word in tweet_content.split(" "):
        needed = len(word) + 1 if line_words else len(word)
        if line_words and used + needed > print_width:
            print("  " + " ".join(line_words))
            line_words = []
            used = 2
            needed = len(word)
        line_words.append(word)
        used += needed
    print("  " + " ".join(line_words))
#--------------------------------------------
# 6330197421 (17.01) 56 (2021-02-28 13:06)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty)."""
    left, right = set(words_1), set(words_2)
    numerator = len(left & right)
    # Union size = shared words + words found on one side only.
    denominator = numerator + len(left - right) + len(right - left)
    if denominator == 0:
        return 0.0
    return numerator / denominator


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero-coefficient tweets are skipped. Ties are ordered by ascending tweet
    id for any number of tied tweets.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    The final line is flushed after the loop, so a word that merely equals
    the last word is not mistaken for the end of the tweet.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(' ')
    line = []
    for word in words:
        candidate = line + [word]
        if line and len("  " + " ".join(candidate)) > print_width:
            print("  " + " ".join(line))
            line = [word]
        else:
            line = candidate
    print("  " + " ".join(line))
#--------------------------------------------
# 6330198021 (0.00) 57 (2021-03-01 23:53)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared) as a float.

    Both lists are expected to be duplicate-free. Returns 0.0 when both are
    empty instead of dividing by zero.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total
# 6330199721 (17.33) 58 (2021-03-01 22:04)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    Both lists are expected to be duplicate-free. Returns 0.0 when both are
    empty instead of dividing by zero.
    """
    shared = 0
    for word in words_1:
        if word in words_2:
            shared += 1
    total = len(words_1) + len(words_2) - shared
    return shared / total if total else 0.0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept; ties go to the lower tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # evaluated once per tweet
        if coef > 0:
            # Negated id so a descending sort breaks ties by ascending id.
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    Wrapping works on whole words, so a word longer than the width is printed
    on its own line. The two-space indent counts towards ``print_width``.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    width = print_width - 2  # room left after the indent
    rows = [[]]
    for word in tweet_content.split(' '):
        current = rows[-1]
        if current and len(' '.join(current)) + 1 + len(word) > width:
            rows.append([word])
        else:
            current.append(word)
    for row in rows:
        print('  ' + ' '.join(row))
#--------------------------------------------
# 6330200621 (17.40) 59 (2021-03-01 14:39)
def get_unique(words):
    """Return the distinct items of ``words``, ordered by last occurrence."""
    unique_words = []
    for word in words:
        if word in unique_words:
            # Move a repeated word to the end so the latest position wins.
            unique_words.remove(word)
        unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    Both lists are expected to be duplicate-free. Returns 0.0 when both are
    empty instead of dividing by zero.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept; ties go to the lower tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # evaluated once per tweet
        if coef > 0:
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    The two-space indent is part of the width budget, and the content is
    split on single spaces so the original spacing is preserved.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    budget = print_width - 2
    pending = []
    length = 0  # length of ' '.join(pending)
    for word in tweet_content.split(' '):
        extra = len(word) + 1 if pending else len(word)
        if pending and length + extra > budget:
            print('  ' + ' '.join(pending))
            pending = []
            length = 0
            extra = len(word)
        pending.append(word)
        length += extra
    print('  ' + ' '.join(pending))
#--------------------------------------------
# 6330201221 (12.94) 60 (2021-03-01 23:20)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the unrounded Jaccard coefficient of two duplicate-free lists.

    The value is not rounded here: rounding would create false ties in the
    ranking, and ``show_tweet`` already rounds for display. Returns 0.0 when
    both lists are empty.
    """
    count_same = 0
    for word in words_1:
        if word in words_2:
            count_same += 1
    denominator = len(words_1) + len(words_2) - count_same
    if denominator == 0:
        return 0.0
    return count_same / denominator


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept. Ties are ordered by ascending
    tweet id, and fewer than ``n`` pairs are returned when fewer match.
    """
    candidates = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            candidates.append([tweet_id, coef])
    candidates.sort(key=lambda pair: (-pair[1], pair[0]))
    return candidates[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and every word of the content, wrapped.

    Content lines are indented by two spaces and kept within
    ``print_width`` columns where the words allow it.
    """
    print()
    print('#' + str(tweet_id), "(" + str(round(jc_coef, 2)) + ')')
    line = "  "
    for word in tweet_content.split(' '):
        # len(line) includes the separator that follows the previous word.
        if len(line) > 2 and len(line) + len(word) > print_width:
            print(line[:-1])
            line = "  "
        line += word + " "
    print(line[:-1])
#--------------------------------------------
# 6330202921 (16.93) 61 (2021-03-01 20:57)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The argument is left untouched; a sorted copy is scanned instead.
    """
    rtrn = []
    for w in sorted(words):
        # In a sorted sequence duplicates are adjacent.
        if not rtrn or w != rtrn[-1]:
            rtrn.append(w)
    return rtrn


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The intersection size is derived from the size of the merged unique
    list. Returns 0.0 when both lists are empty.
    """
    lmrg = len(set(words_1) | set(words_2))
    if lmrg == 0:
        return 0.0
    same = len(words_1) + len(words_2) - lmrg
    return same / lmrg


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept; ties go to the lower tweet id.
    """
    l1 = []
    for i, tweet in enumerate(norm_tweets):
        j = -jaccard(tweet, norm_query)
        if j < 0:
            l1.append([j, i])
    l1.sort()
    return [[tweet_id, -neg] for neg, tweet_id in l1[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    Every content line starts with a two-space indent, which counts towards
    ``print_width``.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    txt = indent
    for tweet in tweet_content.split(' '):
        # txt carries one trailing separator after each word already placed.
        if len(txt) > len(indent) and len(txt) + len(tweet) > print_width:
            print(txt[:-1])
            txt = indent
        txt += tweet + ' '
    print(txt[:-1])
#--------------------------------------------
# 6330203521 (19.62) 62 (2021-02-28 21:13)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list keeps its order.
    """
    ordered = sorted(words)
    unique_words = []
    for i, word in enumerate(ordered):
        # Keep the last element of every run of equal neighbours.
        if i == len(ordered) - 1 or word != ordered[i + 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0 if union is empty)."""
    shared = 0
    union = list(words_1)
    for word in words_2:
        if word in words_1:
            shared += 1
        else:
            union.append(word)
    if not union:
        return 0
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept; ties go to the lower tweet id.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        j = jaccard(tweet, norm_query)
        if j > 0:
            x.append([i, j])
    x.sort(key=lambda pair: (-pair[1], pair[0]))
    return x[0:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    Each line is cut at the last space that still fits. When a single word
    is wider than the line it is printed whole instead of ending the output.
    """
    p = tweet_content + " "  # sentinel space: every word ends at a space
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    u = 0
    a = print_width - 2
    while u < len(p):
        # rfind returns -1 when no space lies inside the window.
        index = p.rfind(" ", u, u + a + 1)
        if index == -1:
            # Over-long word: cut at its end; the sentinel guarantees a hit.
            index = p.find(" ", u)
        print("  " + p[u:index])
        u = index + 1
#--------------------------------------------
# 6330205821 (18.01) 63 (2021-03-01 13:44)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return common / (len(words_1) + len(words_2) - common).

    Both lists are expected to be duplicate-free. Returns 0.0 when both are
    empty instead of dividing by zero.
    """
    common = [e for e in words_2 if e in words_1]
    total = len(words_1) + len(words_2) - len(common)
    if total == 0:
        return 0.0
    return len(common) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are excluded; ties go to the lower id.
    """
    t = []
    for y, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            t.append([-jac, y])
    t.sort()
    return [[y, -neg_jac] for neg_jac, y in t[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``."""
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    tt = tweet_content.split(" ")
    d = '  '
    k = 0
    while k < len(tt):
        # d holds the indent plus every placed word followed by one space.
        if len(d) > 2 and len(d) + len(tt[k]) > print_width:
            print(d[:-1])
            d = '  '
        d += tt[k] + ' '
        k += 1
    print(d[:-1])
#--------------------------------------------
# 6330206421 (14.16) 64 (2021-02-28 13:21)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the unrounded Jaccard coefficient of two word lists.

    Rounding is left to ``show_tweet`` so close scores are not turned into
    ties. Returns 0.0 when both lists are empty.
    """
    count_same = 0
    for word in words_1:
        if word in words_2:
            count_same += 1
    unique_words = []
    for item in words_1 + words_2:
        if item not in unique_words:
            unique_words.append(item)
    if not unique_words:
        return 0.0
    return count_same / len(unique_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are excluded; ties go to the lower id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in ranked[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    A word wider than the line is printed on a line of its own, so the loop
    always advances.
    """
    jc = round(jc_coef, 2)
    print()
    print("#" + str(tweet_id) + " (" + str(jc) + ")")
    line_words = []
    used = 2  # two-space indent
    for word in tweet_content.split(" "):
        needed = len(word) + 1 if line_words else len(word)
        if line_words and used + needed > print_width:
            print("  " + " ".join(line_words))
            line_words = []
            used = 2
            needed = len(word)
        line_words.append(word)
        used += needed
    print("  " + " ".join(line_words))
#--------------------------------------------
# 6330207021 (0.00) 65 (2021-03-01 23:25)
def jaccard(words_1, words_2):
    """Return a / (len(words_1) + len(words_2) - a), a = number of shared words.

    Both lists are expected to be duplicate-free. Returns 0.0 when both are
    empty instead of dividing by zero.
    """
    a = 0  # initialised once, outside the loop
    for word in words_1:
        if word in words_2:
            a += 1
    total = len(words_1) + len(words_2) - a
    if total == 0:
        return 0.0
    return a / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only tweets with a coefficient above zero are listed; ties go to the
    lower tweet id.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            # Negated coefficient so an ascending sort puts the best first.
            x.append([-coef, i])
    x.sort()
    top_n = [[i, -neg_coef] for neg_coef, i in x[:max(n, 0)]]
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    rows = [[]]
    for word in tweet_content.split(' '):
        current = rows[-1]
        # +2 for the indent, +1 for the space before the new word.
        if current and 2 + len(' '.join(current)) + 1 + len(word) > print_width:
            rows.append([word])
        else:
            current.append(word)
    for row in rows:
        print('  ' + ' '.join(row))
#--------------------------------------------
# 6330208721 (20.00) 66 (2021-02-28 10:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return len(intersection) / len(union) for two word lists.

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    intersection = [i for i in words_1 if i in words_2]
    union = []
    for i in words_1 + words_2:
        if i not in union:
            union.append(i)
    if not union:
        return 0.0
    return len(intersection) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    The best ``n`` tweets are selected first and zero-coefficient entries
    are then dropped; ties go to the lower tweet id.
    """
    j = [[-jaccard(tweet, norm_query), i] for i, tweet in enumerate(norm_tweets)]
    j.sort()
    return [[i, -neg] for neg, i in j[:max(n, 0)] if neg != 0.0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    A first word wider than the line is printed as-is, without an empty
    indented line in front of it.
    """
    list_tweet = tweet_content.split(' ')
    c = []
    length = 0
    print_width -= 2  # room left after the two-space indent
    print()
    print('#' + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    for word in list_tweet:
        if c and length + len(word) > print_width:
            print("  " + " ".join(c))
            c = []
            length = 0
        c.append(word)
        length += len(word) + 1
    print("  " + " ".join(c))
#--------------------------------------------
# 6330209321 (17.00) 67 (2021-03-01 17:31)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return inter / len(union) for two word lists (0.0 if both are empty)."""
    inter = 0
    for word in words_1:
        if word in words_2:
            inter += 1
    union = []
    for word in list(words_1) + list(words_2):
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    return inter / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept. Ties are ordered by ascending
    tweet id, and asking for more tweets than exist is allowed.
    """
    jaclist = []
    for i, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            jaclist.append([i, jac])
    sorted_jaclist = sorted(jaclist, key=lambda x: (-x[1], x[0]))
    return sorted_jaclist[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``."""
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    temps = " "  # every word is appended as " " + word, giving a 2-space indent
    has_word = False
    for word in words:
        if has_word and len(temps) + len(word) + 1 > print_width:
            print(temps)
            temps = " "
        temps += " " + word
        has_word = True
    print(temps)
#--------------------------------------------
# 6330210921 (14.01) 68 (2021-03-01 20:01)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard similarity coefficient of two word lists.

    Neither argument is modified. Returns 0.0 when both lists are empty.
    """
    # top: number of words present in both lists
    top = 0
    for e in words_1:
        if e in words_2:
            top += 1
    # bottom: number of distinct words overall, built on a copy
    sum_words = list(words_1)
    for e in words_2:
        if e not in sum_words:
            sum_words.append(e)
    bottom = len(sum_words)
    if bottom == 0:
        return 0.0
    jaccard_coef = top / bottom
    return jaccard_coef


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are excluded; ties go to the lower id.
    """
    all_words = []
    for i, now_list in enumerate(norm_tweets):
        coef = jaccard(now_list, norm_query)
        if coef > 0:
            all_words.append([i, coef])
    all_words.sort(key=lambda pair: (-pair[1], pair[0]))
    return all_words[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``."""
    jc_coef = str(round(jc_coef, 2))
    print("\n#" + str(tweet_id) + " (" + jc_coef + ")")
    n = " "  # each word is added as " " + word, giving a 2-space indent
    placed = False
    for e in tweet_content.split(" "):
        # len(n) + 1 + len(e) <= print_width  <=>  len(n) + len(e) < print_width
        if placed and len(n) + len(e) >= print_width:
            print(n)
            n = " "
        n += " " + e
        placed = True
    print(n)
#--------------------------------------------
# 6330211521 (11.35) 69 (2021-03-01 16:20)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words += [word]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty).

    The union is collected word by word (``b += [word]``; ``b += word`` would
    add the characters of the string) and ``words_2`` is not modified.
    """
    a = 0
    b = []
    for word in words_1:
        if word in words_2:
            a += 1
        if word not in b:
            b += [word]
    for word in words_2:
        if word not in b:
            b += [word]
    if not b:
        return 0.0
    return a / len(b)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Each tweet is scored independently and ``norm_query`` is not modified.
    Only coefficients above zero are kept; ties go to the lower tweet id.
    """
    e = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            e += [[-coef, tweet_id]]
    e.sort()
    top_n = [[tweet_id, -neg_coef] for neg_coef, tweet_id in e[:max(n, 0)]]
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)``, then the content wrapped to width.

    Content lines are indented by two spaces that count towards
    ``print_width``.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line_words = []
    used = 2
    for word in tweet_content.split(' '):
        needed = len(word) + 1 if line_words else len(word)
        if line_words and used + needed > print_width:
            print('  ' + ' '.join(line_words))
            line_words = []
            used = 2
            needed = len(word)
        line_words.append(word)
        used += needed
    print('  ' + ' '.join(line_words))
#--------------------------------------------
# 6330212121 (16.16) 70 (2021-02-25 22:51)
def get_unique(words):
    """Return the distinct items of ``words``, ordered by last occurrence."""
    unique_words = []
    for i, word in enumerate(words):
        # Keep a word only at its final position in the list.
        if word not in words[i + 1:] and word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the unrounded Jaccard coefficient of two word lists.

    Rounding is left to ``show_tweet`` so close scores are not turned into
    ties. Returns 0.0 when both lists are empty.
    """
    set_1, set_2 = set(words_1), set(words_2)
    a_union_b = len(set_1 | set_2)
    if a_union_b == 0:
        return 0.0
    a_sec_b = len(set_1 & set_2)
    return a_sec_b / a_union_b


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    The sort key is defined inline, so no external helper is needed.
    Zero-coefficient tweets are excluded; ties go to the lower tweet id.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc > 0:
            x.append([i, jc])
    x.sort(key=lambda pair: (-pair[1], pair[0]))
    return x[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the content wrapped to width."""
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    lt = '  '
    for word in tweet_content.split(' '):
        # lt carries one trailing separator after each word already placed.
        if len(lt) > 2 and len(lt) + len(word) > print_width:
            print(lt[:-1])
            lt = '  '
        lt += word + ' '
    print(lt[:-1])
#--------------------------------------------
# 6330213821 (13.03) 71 (2021-03-01 21:38)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for w in words:
        if w not in unique_words:
            unique_words.append(w)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty).

    Duplicates inside either list are ignored.
    """
    words_1 = set(words_1)
    words_2 = set(words_2)
    i = len(words_1 | words_2)
    if i == 0:
        return 0.0
    k = len(words_1 & words_2)
    return k / i


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept. Ties are ordered by ascending
    tweet id, and fewer than ``n`` pairs are returned when fewer match.
    """
    a = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            a.append([tweet_id, coef])
    a.sort(key=lambda pair: (-pair[1], pair[0]))
    return a[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print ``#id (coef)`` and the content wrapped to ``print_width``.

    The header has no spaces inside the parentheses. The content is split on
    single spaces so the original spacing survives.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(float(jc_coef), 2)) + ')')
    c = 2  # columns used on the current line, indent included
    row = []
    for e in tweet_content.split(' '):
        step = len(e) + 1 if row else len(e)
        if row and c + step > print_width:
            print('  ' + ' '.join(row))
            row = []
            c = 2
            step = len(e)
        row.append(e)
        c += step
    print('  ' + ' '.join(row))
#--------------------------------------------
# 6330214421 (18.50) 72 (2021-02-28 01:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return len(a) / (len(words_1) + len(words_2) - len(a)).

    ``a`` holds the words common to both lists, found by scanning the
    shorter list. Both lists are expected to be duplicate-free. Returns 0.0
    when both are empty.
    """
    if len(words_1) < len(words_2):
        shorter, longer = words_1, words_2
    else:
        shorter, longer = words_2, words_1
    a = [word for word in shorter if word in longer]
    b = len(words_1) + len(words_2) - len(a)
    if b == 0:
        return 0.0
    return len(a) / b


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero-coefficient tweets are filtered while the list is built, so none
    of them can slip through. Ties go to the lower tweet id.
    """
    b = []
    for tweet_id, tweet in enumerate(norm_tweets):
        a = jaccard(tweet, norm_query)
        if a > 0:
            b.append([a, -tweet_id])
    b.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in b[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the content wrapped to width."""
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    b = print_width - 2  # room left after the two-space indent
    c = 0                # length of " ".join(x)
    x = []
    for word in tweet_content.split(" "):
        step = len(word) + 1 if x else len(word)
        if x and c + step > b:
            print("  " + " ".join(x))
            x = []
            c = 0
            step = len(word)
        x.append(word)
        c += step
    print("  " + " ".join(x))
#--------------------------------------------
# 6330215021 (17.95) 73 (2021-03-01 01:43)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The argument itself is not reordered.
    """
    if len(words) == 0:
        return []
    ordered = sorted(words)
    unique_words = [ordered[0]]
    for previous, current in zip(ordered, ordered[1:]):
        if previous != current:
            unique_words.append(current)
    return unique_words


def jaccard(words_1, words_2):
    """Return len(intersection) / len(union) for two word lists.

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    union = set(words_1) | set(words_2)
    if not union:
        return 0.0
    intersec = [word for word in words_1 if word in words_2]
    return len(intersec) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are excluded; ties go to the lower id.
    """
    top = []
    for i, tweet in enumerate(norm_tweets):
        jaccard_coef = jaccard(tweet, norm_query)
        if jaccard_coef > 0:
            top.append([-jaccard_coef, i])
    top.sort()
    return [[i, -neg_coef] for neg_coef, i in top[0:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``."""
    jc_coef = round(jc_coef, 2)
    print("")
    print("#" + str(tweet_id) + " (" + str(jc_coef) + ")")
    content = " "  # words are appended as " " + word, giving a 2-space indent
    started = False
    for e in tweet_content.split(" "):
        if started and len(content) + 1 + len(e) > print_width:
            print(content)
            content = " "
        content += " " + e
        started = True
    print(content)
#--------------------------------------------
# 6330216721 (20.00) 74 (2021-02-27 01:47)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |A & B| / |A | B| for two word lists, or 0 for an empty union.

    Sets make the intersection linear and keep the result correct even if a
    list contains duplicates.
    """
    w1, w2 = set(words_1), set(words_2)
    n_union = len(w1 | w2)
    return len(w1 & w2) / n_union if n_union != 0 else 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only similarities above zero are listed, sorted by descending score and
    then by ascending tweet index.
    """
    top = [
        [tweet_id, jaccard(tweet, norm_query)]
        for tweet_id, tweet in enumerate(norm_tweets)
    ]
    top = [pair for pair in top if pair[1] > 0]  # similarity must be > 0
    top.sort(key=lambda x: (-x[1], x[0]))
    return top[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    Empty tokens (from runs of spaces) are skipped at the start of a
    wrapped line.
    """
    indent = "  "
    print()
    print("#{} ({})".format(tweet_id, round(jc_coef, 2)))
    content = tweet_content.split(' ')
    # Start the first line with the first token.
    current_line = indent + content[0]
    for tok in content[1:]:
        line_is_empty = current_line == indent
        # Do not append an empty token to an empty line.
        if line_is_empty and len(tok) == 0:
            continue
        # Look ahead: does the token still fit on this line?
        if len(current_line) + 1 + len(tok) <= print_width:
            current_line += ("" if line_is_empty else " ") + tok
        else:
            print(current_line.rstrip())
            current_line = indent + tok
    # Show the leftover line unless it is empty.
    if current_line != indent:
        print(current_line)
#--------------------------------------------
# 6330217321 (20.00) 75 (2021-02-27 06:32)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return k / (number of distinct words), k = words of list 1 found in list 2.

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    k = 0
    for e in words_1:
        if e in words_2:
            k += 1
    # A set removes duplicates from the merged list in one pass.
    w = set(words_1) | set(words_2)
    if not w:
        return 0.0
    return k / len(w)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only non-zero coefficients are kept; ties go to the lower tweet id.
    """
    a = []
    for i, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac != 0:
            a.append([-jac, i])
    a.sort()
    return [[i, -neg_jac] for neg_jac, i in a[0:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``."""
    w = tweet_content.split(" ")
    print("")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    a = 0   # length of the pieces collected for the current line
    s = []  # pieces, each stored with its own leading space
    for e in w:
        piece = " " + e
        # One extra column is taken by the leading space printed below.
        if s and a + len(piece) > print_width - 1:
            print(" " + "".join(s))
            s = []
            a = 0
        s.append(piece)
        a += len(piece)
    print(" " + "".join(s))
#--------------------------------------------
# 6330219621 (20.00) 76 (2021-02-27 13:01)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty).

    Duplicates are ignored and neither argument is modified.
    """
    first = set(words_1)
    merged = set(words_2)
    c = len(first & merged)
    merged |= first  # now holds the union
    if not merged:
        return 0.0
    return c / len(merged)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept; ties go to the lower tweet id.
    """
    tn = []
    for i, tweet in enumerate(norm_tweets):
        jac = jaccard(norm_query, tweet)
        if jac > 0:
            tn.append([-jac, i])
    tn.sort()
    return [[i, -neg_jac] for neg_jac, i in tn[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    Splitting on single spaces keeps runs of spaces as empty tokens, so the
    original spacing is reproduced without separate bookkeeping.
    """
    print('\n#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    s = 0   # length of the text collected for the current line
    f = []  # words of the current line
    for word in tweet_content.split(' '):
        step = len(word) + 1 if f else len(word)
        # The text may use print_width - 2 columns; the indent takes the rest.
        if f and s + step >= print_width - 1:
            print(' ', ' '.join(f))
            f = []
            s = 0
            step = len(word)
        f.append(word)
        s += step
    print(' ', ' '.join(f))
#--------------------------------------------
# 6330221821 (18.33) 77 (2021-03-01 02:08)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The argument is not reordered, and a list made of one repeated word
    yields that word once.
    """
    unique_words = []
    for word in sorted(words):
        # Compare with the previously kept word only, never across the ends.
        if not unique_words or word != unique_words[-1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The intersection size is the number of words lost when the merged list
    is de-duplicated. Returns 0.0 when both lists are empty.
    """
    merged = list(words_1) + list(words_2)
    a = len(set(merged))
    if a == 0:
        return 0.0
    return (len(merged) - a) / a


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Coefficients are returned exactly as computed. Only values above zero
    are kept; ties go to the lower tweet id.
    """
    a = []
    for e, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc > 0:
            a.append([e, jc])
    a.sort(key=lambda pair: (-pair[1], pair[0]))
    return a[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the content wrapped to width."""
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    w = tweet_content.split(' ')
    i = 0
    while i < len(w):
        # ' '.join([' ', word, ...]) produces the two-space indent.
        ln = [' ', w[i]]
        wc = 2 + len(w[i])
        i += 1
        while i < len(w) and wc + 1 + len(w[i]) <= print_width:
            ln.append(w[i])
            wc += 1 + len(w[i])
            i += 1
        print(' '.join(ln))
#--------------------------------------------
# 6330222421 (13.34) 78 (2021-02-28 12:16)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    A sorted copy is scanned, so the caller's list keeps its order.
    """
    ordered = sorted(words)
    unique_words = []
    for i, word in enumerate(ordered):
        if i == 0 or word != ordered[i - 1]:
            unique_words += [word]
    return unique_words


def jaccard(words_1, words_2):
    """Return the unrounded Jaccard coefficient of two word lists.

    Duplicates inside a list are ignored. Rounding is left to
    ``show_tweet``. Returns 0.0 when both lists are empty.
    """
    words_1 = set(words_1)
    words_2 = set(words_2)
    c = len(words_1 | words_2)
    if c == 0:
        return 0.0
    # |A & B| = |A| + |B| - |A | B|
    f = len(words_1) + len(words_2) - c
    return f / c


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are excluded; ties go to the lower id.
    """
    top_n1 = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n1.append([-coef, i])
    top_n1.sort()
    return [[i, -neg_coef] for neg_coef, i in top_n1[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header (coefficient to 2 decimals) and the wrapped content."""
    a = '#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')'
    print()
    print(a)
    tw_c = []
    for word in tweet_content.split(' '):
        candidate = tw_c + [word]
        # Budget: print_width minus the two-space indent.
        if tw_c and len(' '.join(candidate)) > print_width - 2:
            print('  ' + ' '.join(tw_c))
            tw_c = [word]
        else:
            tw_c = candidate
    print('  ' + ' '.join(tw_c))
#--------------------------------------------
# 6330223021 (20.00) 79 (2021-02-25 23:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return len(common) / len(distinct words of both lists).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    x = [i for i in words_2 if i in words_1]
    distinct = set(words_1) | set(words_2)
    if not distinct:
        return 0.0
    return len(x) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only coefficients above zero are kept; ties go to the lower tweet id.
    """
    top_reverse = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # evaluated once per tweet
        if coef > 0:
            top_reverse.append([-coef, tweet_id])
    top_reverse.sort()
    top = [[tweet_id, -neg_coef] for neg_coef, tweet_id in top_reverse]
    return top[:max(n, 0)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    Each printed line is stripped of surrounding spaces before the
    two-space indent is added.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    t = []  # words of the current line
    m = 0   # sum of len(word) + 1 over the current line
    for i in tweet_content.split(' '):
        if t and len(i) + m + 2 > print_width:
            print(' ', ' '.join(t).strip())
            t = []
            m = 0
        t.append(i)
        m += len(i) + 1
    print(' ', ' '.join(t).strip())
#--------------------------------------------
# 6330224721 (14.00) 80 (2021-02-28 01:15)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return up / down: shared distinct words over all distinct words.

    Returns 0.0 when both lists are empty.
    """
    words11 = set(words_1)
    words22 = set(words_2)
    up = len(words11 & words22)
    down = len(words11 | words22)
    if down == 0:
        return 0.0
    return up / down


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` lists, best match first.

    Only coefficients above zero are kept and ties go to the lower tweet
    id. Fewer than ``n`` entries are returned when fewer tweets match.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(norm_query, tweet)
        if coef > 0:
            # Negated id so a descending sort breaks ties by ascending id.
            x += [[coef, -i]]
    x.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in x[:max(n, 0)]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Build the whole display text, then print it in one call."""
    s = "\n"
    s += "#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")"
    s += "\n"
    s += "  "
    countWord = 2
    for i, word in enumerate(tweet_content.split(" ")):
        if i == 0:
            s += word
            countWord += len(word)
        elif countWord + len(word) + 1 > print_width:
            s += "\n" + "  " + word
            countWord = 2 + len(word)
        else:
            s += " " + word
            countWord += len(word) + 1
    print(s)
#--------------------------------------------
# 6330225321 (18.01) 81 (2021-03-01 14:38)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy so the caller's list is left untouched, and the
    first element is always kept (the original compared it with the last
    element and returned ``[]`` for a single-item list).
    """
    ordered = sorted(words)
    unique_words = []
    for index, word in enumerate(ordered):
        if index == 0 or word != ordered[index - 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of the longer list that occur in the shorter
    one (``words_2`` is scanned when the lengths are equal).
    """
    if len(words_1) > len(words_2):
        probe, pool = words_1, words_2
    else:
        probe, pool = words_2, words_1
    shared = sum(1 for word in probe if word in pool)
    return shared / (len(words_1) + len(words_2) - shared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, score]`` pairs.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    scored = [[jaccard(tweet, norm_query), -tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    scored.sort()
    # Walk the sorted list backwards, taking the n largest entries.
    return [[-negated_id, score] for score, negated_id in scored[-1:-n - 1:-1]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        if len(line) + len(word) < print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    # Flush once after the loop; testing ``word == last_word`` inside the loop
    # printed early whenever the final word also appeared earlier in the tweet.
    print(line)
#--------------------------------------------
# 6330226021 (20.00) 82 (2021-03-01 00:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    result = []
    for item in words:
        if item not in result:
            result.append(item)
    return result


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` found in ``words_2``) / (# distinct items overall)."""
    shared = sum(1 for item in words_1 if item in words_2)
    return shared / len(get_unique(words_1 + words_2))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([score, -tweet_id])
    scored.sort()
    ranked = [[-negated_id, score] for score, negated_id in scored]
    # Read the ascending list from the back to obtain the n best entries.
    return ranked[:-n - 1:-1]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    chunks = [' ']
    used = 2  # width budget already spent on the indent
    for word in tweet_content.split(' '):
        if used == 2:
            chunks.append(word)
            used += len(word)
        elif 2 < used + len(word) + 1 <= print_width:
            chunks.append(' ' + word)
            used += len(word) + 1
        else:
            chunks.append('\n' + ' ' + word)
            used = 2 + len(word)
    body = ''.join(chunks)
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    print(body)
#--------------------------------------------
# 6330227621 (19.10) 83 (2021-03-01 22:37)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#............................................................
def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` that occur in ``words_2``.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / (len(words_1) + len(words_2) - shared)
# ...........................................................
def top_n_similarity(norm_tweets, norm_query, n):
    """Return ``[tweet_id, score]`` pairs for the ``n`` best tweets, minus zero scores.

    Tweets are ranked by descending score, ties broken by ascending tweet id.
    The ``n`` cut is applied first and zero-score entries are dropped after.
    """
    scored = [[jaccard(tweet, norm_query), tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    # One keyed sort replaces the reverse-then-regroup pass that needed
    # string sentinels appended to the list.
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, score] for score, tweet_id in scored[:n] if score != 0]
#.............................................................
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header, then the wrapped tweet text."""
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    limit = print_width - 2
    used = 0
    line = ' '
    for word in tweet_content.split(' '):
        if len(word) >= limit:
            # Over-long word gets its own line. Flush what is pending first;
            # the original overwrote the pending line and lost those words.
            if used:
                print(line)
            print(' ' + word)
            line = ' '
            used = 0
        elif len(word) > limit - used:
            print(line)
            line = ' ' + word + ' '
            used = len(word) + 1
        else:
            line += word + ' '
            used += len(word) + 1
    print(line)
#--------------------------------------------
# 6330228221 (18.01) 84 (2021-02-28 23:20)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    kept = []
    for word in words:
        if word not in kept:
            kept.append(word)
    return kept


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` found in ``words_2``) / (# distinct items overall)."""
    total = len(get_unique(words_1 + words_2))
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, score]`` pairs (zero scores included).

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    scored = [[jaccard(tweet, norm_query), -tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    scored.sort()
    # Ascending on (score, -id); reversing yields best score first, smallest id first.
    ranked = [[-negated_id, score] for score, negated_id in reversed(scored)]
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``" "`` line, a ``#id (score)`` header, then the wrapped tweet text."""
    print(" ")
    print("#" + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = " "
    for word in tweet_content.split(" "):
        if len(line + word) <= print_width:
            line = line + word + " "
        else:
            print(line)
            line = " " + word + " "
    print(line)
#--------------------------------------------
# 6330229921 (20.00) 85 (2021-02-28 13:46)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` that occur in ``words_2``.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / (len(words_1) + len(words_2) - shared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-score, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    last = len(words) - 1
    line = ' '
    for index, word in enumerate(words):
        fits = len(line) + len(word) <= print_width
        if index != last:
            if fits:
                line += word
                # Add the separator only while there is still room for it.
                if len(line) < print_width:
                    line += ' '
            else:
                print(line)
                line = ' ' + word + ' '
        elif fits:
            line += word
            print(line)
        else:
            print(line)
            line = ' ' + word
            print(line)
#--------------------------------------------
# 6330230421 (18.01) 86 (2021-02-26 13:00)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |distinct common words| / |distinct words overall|."""
    common = []
    for word in words_1:
        if word in words_2 and word not in common:
            common.append(word)
    # Same members as the original's hand-built list: common words first, then the rest.
    everything = get_unique(common + words_1 + words_2)
    return len(common) / len(everything)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, score]`` pairs (zero scores included).

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = [[-jaccard(tweet, norm_query), tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    words = tweet_content.split(' ')
    print("\n" + "#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = ' '
    for word in words:
        addition = ' ' + word
        if len(line) + len(addition) <= print_width:
            line += addition
        else:
            print(line)
            line = addition
    print(line)
#--------------------------------------------
# 6330232721 (20.00) 87 (2021-03-01 00:56)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_2`` that occur in ``words_1``.
    """
    shared = sum(1 for word in words_2 if word in words_1)
    return shared / (len(words_1) + len(words_2) - shared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-score, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked][0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    words = tweet_content.split(' ')
    print('\n' + '#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in words:
        line += ' ' + word
        if len(line) > print_width:
            # Emit the line without the word that overflowed, then restart with it.
            print(line[0:-(len(word)):1])
            line = ' ' + word
    print(line)
#--------------------------------------------
# 6330233321 (18.05) 88 (2021-03-01 20:03)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_2`` found in ``words_1``) / (# distinct items overall)."""
    total = len(get_unique(words_1 + words_2))
    shared = sum(1 for word in words_2 if word in words_1)
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-score, tweet_id])
    # The original's sort / reverse-sort / reverse sequence nets out to one ascending sort.
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text.

    Every word is visited exactly once. The original looped
    ``len(words) + len(text) // 48`` times with a sliding index, which raised
    IndexError on long tweets that needed few wraps and dropped trailing
    words on tweets that needed many.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = []
    used = 2  # width budget reserved for the indent
    for word in tweet_content.split():
        if line and used + len(word) > print_width:
            print(' ' + ' '.join(line))
            line = []
            used = 2
        line.append(word)
        used += len(word) + 1
    if line:
        print(' ' + ' '.join(line))
#--------------------------------------------
# 6330234021 (14.33) 89 (2021-03-01 17:20)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return len(common) / len(union) for two word lists.

    ``common`` holds the items of ``words_1`` found in ``words_2``; ``union``
    is the items exclusive to either list followed by ``common``.
    """
    common = [word for word in words_1 if word in words_2]
    union = [word for word in words_1 if word not in words_2]
    union += [word for word in words_2 if word not in words_1]
    union += common
    return len(common) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    Slicing replaces ``range(n)`` indexing, so asking for more entries than
    there are matches no longer raises IndexError.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-score, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header, then the wrapped tweet text."""
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        # Measure against the running line; the original recomputed the width
        # from the current word alone, so the text never wrapped.
        if len(line) + len(word) <= print_width:
            line += word + ' '
        else:
            print(line)
            line = ' ' + word + ' '
    print(line)
#--------------------------------------------
# 6330235621 (18.50) 90 (2021-03-01 03:08)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return len(common) / (len(common) + len(exclusive)) for two word lists.

    The shorter list is scanned first (``words_1`` on a tie).
    """
    if len(words_1) <= len(words_2):
        short, long_ = list(words_1), list(words_2)
    else:
        short, long_ = list(words_2), list(words_1)
    common = [word for word in short if word in long_]
    exclusive = [word for word in short if word not in long_]
    exclusive += [word for word in long_ if word not in short]
    return len(common) / (len(exclusive) + len(common))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    Slicing replaces ``range(n)`` indexing, so asking for more entries than
    there are matches no longer raises IndexError.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-score, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``" "`` line, a ``#id (score)`` header, then the wrapped tweet text."""
    print(" ")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = " "
    used = 2  # width budget already spent on the indent
    for word in tweet_content.split(" "):
        used += len(word)
        if used <= print_width:
            line += word + " "
            used += 1
        else:
            print(line)
            line = " " + word + " "
            used = len(line)
    print(line)
#--------------------------------------------
# 6330236221 (17.97) 91 (2021-02-28 22:45)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_2`` that occur in ``words_1``.
    """
    shared = len([word for word in words_2 if word in words_1])
    return shared / (len(words_1) + len(words_2) - shared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([score, tweet_id])
    # One keyed sort gives the same order as the original's
    # sort / reverse / negate-id / sort / reverse sequence.
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, score] for score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    line = " "
    print("\n" + "#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    for word in tweet_content.split(" "):
        candidate = line + " " + word
        if len(candidate) < print_width:
            line = candidate
        else:
            print(line)
            line = " " + " " + word
    print(line)
#--------------------------------------------
# 6330238521 (18.01) 92 (2021-03-01 22:45)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` found in ``words_2``) / (# distinct items overall)."""
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(get_unique(words_1 + words_2))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, score]`` pairs (zero scores included).

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        scored.append([jaccard(tweet, norm_query), -tweet_id])
    # Descending on (score, -id): best score first, smallest id first among ties.
    scored.sort(reverse=True)
    ranked = [[-negated_id, score] for score, negated_id in scored]
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    words = tweet_content.split(' ')
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = ' '
    for word in words:
        if len(line + word) < print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    print(line)
#--------------------------------------------
# 6330239121 (17.87) 93 (2021-02-28 20:13)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of the shorter list that occur in the longer
    one (``words_2`` is scanned when the lengths are equal).
    """
    if len(words_1) >= len(words_2):
        probe, pool = words_2, words_1
    else:
        probe, pool = words_1, words_2
    shared = [word for word in probe if word in pool]
    return len(shared) / (len(words_1) + len(words_2) - len(shared))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([score, tweet_id])
    # A keyed sort gives the order the hand-written bubble passes produced.
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, score] for score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text.

    Text that fits in ``print_width - 2`` characters is printed unchanged on
    one line. Longer text is wrapped greedily on single spaces in one pass,
    with the same ``print_width - 2`` budget per line. The original rescanned
    every word on each outer pass and recovered the last line by slicing the
    raw text with running character offsets.
    """
    print("\n#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    budget = print_width - 2
    if len(tweet_content) <= budget:
        print(" " + tweet_content)
        return
    line = []
    for word in tweet_content.split(" "):
        if line and len(" ".join(line + [word])) > budget:
            print(" " + " ".join(line))
            line = []
        line.append(word)
    print(" " + " ".join(line))
#--------------------------------------------
# 6330240721 (19.19) 94 (2021-02-26 22:10)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# repeated occurrences) / (# distinct items) over ``words_1 + words_2``.

    For duplicate-free inputs this is |intersection| / |union|.
    """
    distinct = []
    repeats = []
    for word in words_1 + words_2:
        if word in distinct:
            repeats.append(word)
        else:
            distinct.append(word)
    return len(repeats) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Only tweets sharing at least one word with the query are considered.
    Pairs are ordered by descending score, ties broken by ascending tweet id.
    Scores are ranked and returned at full precision; the original rounded
    them to two decimals first, which merged near-equal scores and returned
    truncated values.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        if not any(word in tweet for word in norm_query):
            continue
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([score, tweet_id])
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, score] for score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = []
    for word in tweet_content.split():
        if len(line) == 0:
            line = [word]
        elif len(' ' + ' '.join(line + [word])) <= print_width:
            line += [word]
        else:
            print(' ' + ' '.join(line))
            line = [word]
    print(' ' + ' '.join(line))
#--------------------------------------------
# 6330241321 (20.00) 95 (2021-03-01 23:59)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    kept = []
    for word in words:
        if word not in kept:
            kept.append(word)
    return kept


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` that occur in ``words_2``.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / (len(words_1) + len(words_2) - shared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-score, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = ""
    for word in tweet_content.split(" "):
        # One column is held back for the extra leading space added when printing.
        if len(line) + 1 + len(word) <= print_width - 1:
            line = line + " " + word
        else:
            print(" " + line)
            line = " " + word
    if len(line) != 0:
        print(" " + line)
#--------------------------------------------
# 6330242021 (11.67) 96 (2021-03-01 17:00)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The original sorted a copy and then tried to drop adjacent duplicates,
    but its loop condition ``1 < i`` was false from the start, so duplicates
    were never removed.
    """
    return sorted(set(words))


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared), or 0 if the denominator is 0.

    ``shared`` counts the items of ``words_1`` that occur in ``words_2``.
    The zero case used to leave the result unassigned (UnboundLocalError).
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    A full keyed sort replaces the original's single bubble pass, which only
    ordered the list correctly when it was already nearly sorted.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([tweet_id, score])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    index = 0
    line = ' '
    while index < len(words):
        # A word that cannot fit even on an empty line is placed anyway;
        # otherwise the loop would never advance.
        if len(line) + len(words[index]) < print_width - 1 or line == ' ':
            line += ' ' + words[index]
            index += 1
        else:
            print(' ' + line)
            line = ' '
    print(' ' + line)
#--------------------------------------------
# 6330243621 (19.82) 97 (2021-03-01 23:36)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order.

    The original started scanning at index 1 and appended ``words[0]`` after
    ``words[1]``, so a one-item list came back empty and the first two items
    were swapped.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared), or 0 if the denominator is 0.

    ``shared`` counts the distinct items of ``words_2`` that occur in ``words_1``.
    """
    shared = []
    for word in words_2:
        # Store the word itself; the original stored ``[word]``, so this
        # membership test could never succeed.
        if word not in shared and word in words_1:
            shared.append(word)
    total = len(words_1) + len(words_2) - len(shared)
    if total == 0:
        return 0
    return len(shared) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([score, tweet_id])
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, score] for score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = []
    used = 0  # characters on the current line, one separator per word
    for word in tweet_content.split(' '):
        # Decide before appending. The original appended and then called
        # ``remove(word)``, which deletes the first equal word on the line
        # (or raises ValueError when the word was never appended).
        if line and used + len(word) + 1 >= print_width:
            print(' ' + ' '.join(line))
            line = []
            used = 0
        line.append(word)
        used += len(word) + 1
    print(' ' + ' '.join(line))
#--------------------------------------------
# 6330245921 (18.98) 98 (2021-03-01 16:08)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` found in ``words_2``) / (# distinct items overall)."""
    distinct = get_unique(words_1 + words_2)
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with a non-zero score.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = [[-jaccard(tweet, norm_query), tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    ranked.sort()
    matches = [[tweet_id, -negated] for negated, tweet_id in ranked if negated != 0]
    return matches[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text.

    Each line is stripped of surrounding whitespace before printing.
    """
    print()
    score_text = '(' + str(round(jc_coef, 2)) + ')'
    id_text = '#' + str(tweet_id)
    print(id_text, score_text)
    words = tweet_content.split(' ')
    line = ' ' + words[0]
    for word in words[1:]:
        addition = ' ' + word
        if len(line) + len(addition) <= print_width:
            line += addition
        else:
            print(line.strip())
            line = addition
    print(line.strip())
#--------------------------------------------
# 6330246521 (17.92) 99 (2021-02-26 22:11)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |distinct common words| / |distinct words overall| (0 for two empty lists).

    Neither argument is modified and the result is not rounded. The original
    sorted both lists in place and rounded to two decimals, which reordered
    the caller's data and blurred the ranking of close scores.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0
    common = [word for word in get_unique(words_1) if word in words_2]
    return len(common) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with a non-zero score.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    Scoring is delegated to ``jaccard`` instead of repeating its body under a
    local name that shadowed the function.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score != 0:
            scored.append([score, tweet_id])
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, score] for score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(' ')
    line = words[0]
    for word in words[1:]:
        if len(line) + len(word) + 1 <= print_width - 2:
            line += " " + word
        else:
            print(" " + line)
            line = word
    print(" " + line)
#--------------------------------------------
# 6330247121 (18.22) 100 (2021-03-01 22:12)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` found in ``words_2``) / (# distinct items overall).

    Returns 0 when both lists are empty; an explicit check replaces the bare
    ``except`` that used to hide every error.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    distinct = len(get_unique(list(words_1) + list(words_2)))
    if distinct == 0:
        return 0
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    Zero scores are always dropped (the original dropped them only when every
    tweet scored zero), and an empty tweet list returns ``[]`` instead of
    raising IndexError.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([tweet_id, score])
    # sorted() is stable, so equal scores keep ascending tweet-id order.
    scored = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return scored[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``# id (score)`` header, then the wrapped tweet text."""
    print()
    print("# {} ({})".format(tweet_id, round(jc_coef, 2)))
    used = 0
    line = []
    for word in tweet_content.split(" "):
        if used + len(word) + 1 <= print_width:
            line.append(word)
            used += len(word) + 1
        else:
            print(" " + " ".join(line))
            used = len(word) + 1
            line = [word]
    print(" ", end="")
    print(" ".join(line))
#--------------------------------------------
# 6330248821 (15.97) 101 (2021-03-01 23:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |distinct common words| / |distinct words overall| (0 for two empty lists)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    common = [word for word in first if word in second]
    total = len(first) + len(second) - len(common)
    if total == 0:
        return 0
    return len(common) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    The original picked the maximum ``n`` times and overwrote it with 0, so
    once the positive scores ran out it repeated earlier tweets with a score
    of 0 (and raised ValueError on an empty tweet list).
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([tweet_id, score])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header, then the wrapped tweet text.

    A word is appended while the line plus the word plus one separator does
    not exceed ``print_width``. The original tested ``<`` and ``>`` separately
    and silently dropped the word when the two were exactly equal.
    """
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        if line != ' ' and len(line) + len(word) + 1 > print_width:
            print(line.rstrip())
            line = ' '
        line += word + ' '
    print(line.rstrip())
#--------------------------------------------
# 6330249421 (20.00) 102 (2021-03-01 19:11)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_2`` found in ``words_1``) / (# distinct items overall)."""
    shared = sum(1 for word in words_2 if word in words_1)
    distinct = len(get_unique(words_1 + words_2))
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = [[-(jaccard(tweet, norm_query)), tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    ranked.sort()
    # abs() undoes the negation that was applied for sorting.
    pairs = [[abs(tweet_id), abs(negated)] for negated, tweet_id in ranked]
    matches = [pair for pair in pairs if pair[1] > 0]
    return matches[:n:]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``" "`` line, a ``#id (score)`` header, then the wrapped tweet text."""
    words = tweet_content.split(" ")
    print(" ")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = []
    for word in words:
        needed = 3 + len(" ".join(line)) + len(word)
        if needed <= print_width:
            line.append(word)
        else:
            print(" ", " ".join(line).strip())
            line = [word]
    print(" ", " ".join(line).strip())
#--------------------------------------------
# 6330250021 (18.33) 103 (2021-03-01 11:57)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` that occur in ``words_2``.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / (len(words_1) + len(words_2) - shared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return ``[tweet_id, score]`` pairs for the ``n`` best tweets, minus zero scores.

    Tweets are ranked by descending score, ties broken by ascending tweet id.
    """
    ranked = [[-jaccard(tweet, norm_query), tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n] if negated != 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text.

    The text is cut at the last space that keeps a line within
    ``print_width - 2`` characters. If the window has no space, the cut moves
    to the next space further on; if none remains, the rest is printed as is.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    start = 0
    while start + print_width - 2 < len(tweet_content):
        if ' ' in tweet_content[start:start + print_width - 1]:
            # Distance back from the window's right edge to the nearest space.
            back = tweet_content[start + print_width - 2::-1].index(' ')
            cut = start + print_width - 2 - back
        elif ' ' in tweet_content[start:]:
            # index() is relative to the slice; add ``start`` to make it absolute.
            cut = start + tweet_content[start:].index(' ')
        else:
            # No space left to break on; without this exit the loop reused a
            # stale (or undefined) cut position.
            break
        print(' ', tweet_content[start:cut])
        start = cut + 1
    print(' ', tweet_content[start:])
#--------------------------------------------
# 6330251621 (20.00) 104 (2021-02-28 19:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` found in ``words_2``) / (# distinct items overall).

    Returns 0 when both lists are empty. The original guard compared the
    list itself with ``0``, which is never true, so the division still failed.
    """
    distinct = get_unique(words_1 + words_2)
    if not distinct:
        return 0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    Scoring is delegated to ``jaccard`` rather than repeated inline.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-float(score), tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked][0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header, then the wrapped tweet text."""
    print(' ')
    print('#' + str(int(tweet_id)) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    limit = print_width - 2
    line = ''
    for word in tweet_content.split(' '):
        if len(line) == 0 and len(word) > limit:
            # An over-long word arriving on an empty line is printed on its own.
            print(' ' + word)
        elif len(word) <= limit and len(line) + len(word) <= limit:
            line += word + ' '
        else:
            print(' ' + line)
            line = word + ' '
    print(' ' + line)
#--------------------------------------------
# 6330252221 (19.48) 105 (2021-03-01 23:22)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy; the original called ``words.sort()`` and so
    reordered the caller's list as a side effect.
    """
    ordered = sorted(words)
    unique_words = []
    for index, word in enumerate(ordered):
        if index == 0 or word != ordered[index - 1]:
            unique_words += [word]
    return unique_words
#------------------
def jaccard(words_1, words_2):
    """Return shared / (shared + exclusive), or 0 when there is nothing shared.

    ``shared`` counts the items of ``words_1`` found in ``words_2``;
    ``exclusive`` counts the items found in only one of the lists. Two empty
    lists give 0 instead of raising ZeroDivisionError.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    exclusive = sum(1 for word in words_1 if word not in words_2)
    exclusive += sum(1 for word in words_2 if word not in words_1)
    if shared == 0:
        return 0
    return shared / (shared + exclusive)
#-------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with score > 0.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            ranked.append([-score, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``" "`` line, a ``#id (score)`` header, then the wrapped tweet text."""
    words = tweet_content.split(" ")
    print(" ")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = []
    letters = 0  # total characters of the words currently held in ``line``
    for index in range(len(words)):
        width = int(len(line)) + 1 + letters
        if width <= print_width:
            line += [words[index]]
            letters += len(words[index])
        else:
            # The newest held word overflowed: print the rest and carry it over.
            print(" " + " ".join(line[:-1]))
            line = [line[-1]]
            letters = len(words[index - 1])
            line += [words[index]]
            letters += len(words[index])
    if len(line) + 1 + letters <= print_width:
        print(" " + " ".join(line))
    else:
        print(" " + " ".join(line[:-1]))
        print(" " + line[-1])
#--------------------------------------------
# 6330253921 (18.90) 106 (2021-03-01 01:32)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for index, word in enumerate(words):
        if word not in words[:index]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` in ``words_2``) / (# distinct items overall), rounded to 6 places."""
    distinct = get_unique(words_1 + words_2)
    shared = sum(1 for word in words_1 if word in words_2)
    return round(shared / len(distinct), 6)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, score]`` pairs with a non-zero score.

    Pairs are ordered by descending score, ties broken by ascending tweet id.
    Scores are negated for sorting. The original stored ``1 - score`` and
    returned ``1 - (1 - score)``, which can differ from the real score by
    floating-point round-off.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score != 0:
            ranked.append([-score, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked][0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header, then the wrapped tweet text."""
    words = tweet_content.split(' ')
    line = ''
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    for index in range(len(words)):
        line += str(' ' + words[index])
        if index + 1 == len(words):
            print(' ' + line.strip())
            break
        elif len(line) + len(words[index + 1]) < print_width:
            # The next word still fits; keep accumulating.
            pass
        else:
            print(' ' + line.strip())
            line = ''
#--------------------------------------------
# 6330254521 (17.42) 107 (2021-03-01 06:08)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when the union is empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Every tweet scoring 0 is dropped (not only when the best score is 0),
    and equal scores are ordered by ascending tweet id. An empty
    ``norm_tweets`` yields an empty list.
    """
    scored = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            scored.append([coef, tweet_id])
    scored.sort(key=lambda item: (-item[0], item[1]))
    return [[tweet_id, coef] for coef, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the ``#id (coef)`` header followed by the wrapped tweet text.

    Body lines carry a two-space indent and hold as many words as fit in
    ``print_width`` columns.
    """
    header_id = str(tweet_id)
    header_coef = "(" + str(round(jc_coef, 2)) + ")"
    print("\n")
    print("#" + header_id + " " + header_coef)
    line_words = []
    for word in tweet_content.split(" "):
        fits = 2 + len(" ".join(line_words)) + 1 + len(word) <= print_width
        if line_words and not fits:
            print("  " + " ".join(line_words))
            line_words = []
        line_words.append(word)
    print("  " + " ".join(line_words))
#--------------------------------------------
# 6330255121 (20.00) 108 (2021-03-01 23:51)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Two empty lists give 0 rather than a ZeroDivisionError.
    """
    shared = 0
    for word in words_1:
        if word in words_2:
            shared += 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are omitted; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef != 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    # Undo the sign flip used for sorting and put the id first.
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in keyed[:n]]


#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines begin with two spaces. An over-long first word no longer
    causes a whitespace-only line to be printed before it.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    tokens = tweet_content.split(' ')
    line = '  ' + tokens[0]
    for token in tokens[1:]:
        if len(line) + 1 + len(token) <= print_width:
            line += ' ' + token
        else:
            print(line)
            line = '  ' + token
    print(line)
#--------------------------------------------
# 6330256821 (5.33) 109 (2021-03-01 23:15)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two lists of distinct words.

    The intersection is every word of ``words_1`` found in ``words_2``;
    the union is ``words_2`` plus the words only in ``words_1``.
    Returns 0 when both lists are empty.
    """
    inter = []
    only_first = []
    for word in words_1:
        # Membership test against the list, not equality with the list.
        if word in words_2:
            inter.append(word)
        else:
            only_first.append(word)
    union_size = len(only_first) + len(words_2)
    if union_size == 0:
        return 0
    return len(inter) / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    ``tweet_id`` is the index in ``norm_tweets``. Tweets scoring 0 are
    skipped; ties are ordered by ascending tweet id.
    """
    top_n = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The numeric arguments are converted with ``str`` before concatenation,
    and the text is split on single spaces. Body lines are indented by two
    spaces and wrapped to ``print_width`` columns.
    """
    print()
    print('#' + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    line = []
    for word in tweet_content.split(' '):
        if line and 2 + len(' '.join(line)) + 1 + len(word) > print_width:
            print('  ' + ' '.join(line))
            line = []
        line.append(word)
    print('  ' + ' '.join(line))
#--------------------------------------------
# 6330257421 (18.33) 110 (2021-02-28 23:49)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when both lists are empty.
    """
    shared = 0
    for word in words_1:
        if word in words_2:
            shared = shared + 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    scored = []
    for tweet_id in range(len(norm_tweets)):
        # Compute the coefficient once per tweet.
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            # Negated id: a descending sort then yields ascending ids on ties.
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The whole text is printed (no prefix is stripped), split on single
    spaces, with a two-space indent on every body line.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = '  ' + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) > print_width:
            print(line)
            line = '  ' + word
        else:
            line = line + ' ' + word
    print(line)
# 6330258021 (20.00) 111 (2021-02-25 23:56)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Both inputs are de-duplicated first. Returns 0 for two empty lists.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0
    # De-duplicate the second list once instead of on every iteration.
    unique_2 = get_unique(words_2)
    intersect = [word for word in get_unique(words_1) if word in unique_2]
    return len(intersect) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    top_n = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        jac = jaccard(tweet_words, norm_query)
        if jac > 0:
            top_n.append([tweet_id, jac])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines are indented by two spaces. A word that exactly fills an
    otherwise empty line is placed on it without a preceding blank line.
    """
    print('\n#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    current = []
    for word in tweet_content.split(' '):
        joined_len = len(' '.join(current))
        if current and joined_len + len(word) + 3 > print_width:
            print('  ' + ' '.join(current))
            current = [word]
        else:
            current.append(word)
    print('  ' + ' '.join(current))
#--------------------------------------------
# 6330259721 (20.00) 112 (2021-02-28 23:14)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    The input list is neither reordered nor extended; a sorted copy is
    scanned instead.
    """
    unique_words = []
    previous_seen = False
    previous = None
    for word in sorted(words):
        # Equal items are adjacent after sorting, so one look-back suffices.
        if not previous_seen or word != previous:
            unique_words.append(word)
        previous = word
        previous_seen = True
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when both lists are empty.
    """
    shared = 0
    for word in words_1:
        if word in words_2:
            shared += 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are removed; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        keyed.append([-jaccard(norm_tweets[tweet_id], norm_query), tweet_id])
    keyed.sort()
    result = []
    for neg_coef, tweet_id in keyed[:n]:
        if -neg_coef > 0:
            result.append([tweet_id, -neg_coef])
    return result


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped to ``print_width`` columns.

    Body lines start with two spaces; an over-long first word does not
    produce a whitespace-only line before it.
    """
    print(" ")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    line = "  " + words[0]
    for word in words[1:]:
        if len(line) + len(word) + 1 > print_width:
            print(line)
            line = "  " + word
        else:
            line += " " + word
    print(line)
#--------------------------------------------
# 6330260221 (16.44) 113 (2021-03-01 00:23)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    The exact quotient is returned and nothing is printed. Returns 0 when
    both lists are empty.
    """
    shared = 0
    for word in words_1:
        if word in words_2:
            shared += 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Unrounded coefficients are used so close scores are not merged into
    false ties. Zero-score tweets are skipped; real ties are ordered by
    ascending tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef != 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The coefficient is shown rounded to two decimals. Body lines are
    indented by two spaces.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    pending = []
    for word in tweet_content.split(' '):
        if pending and 2 + len(' '.join(pending)) + 1 + len(word) > print_width:
            print('  ' + ' '.join(pending))
            pending = []
        pending.append(word)
    print('  ' + ' '.join(pending))
#--------------------------------------------
# 6330261921 (20.00) 114 (2021-02-26 21:50)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    The caller's list is left in its original order.
    """
    unique_words = []
    for item in sorted(words):
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    The union size is ``len(words_2)`` plus the words found only in
    ``words_1``. Returns 0 when both lists are empty.
    """
    shared = 0
    only_first = 0
    for word in words_1:
        if word in words_2:
            shared += 1
        else:
            only_first += 1
    union_size = only_first + len(words_2)
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    scored = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    # list.sort is stable even with reverse=True, so equal scores keep the
    # ascending-id order in which they were appended.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The spacing between consecutive words on a line is kept as wide as it
    is in ``tweet_content``; each body line starts with two spaces. Empty
    content prints an empty indented line instead of raising IndexError.
    """
    import re  # local import keeps this submission self-contained

    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    # Each match is (whitespace before the word, the word itself).
    pieces = re.findall(r'(\s*)(\S+)', tweet_content)
    line = '  '
    if pieces:
        line += pieces[0][1]
    for gap, word in pieces[1:]:
        spacer = ' ' * len(gap)
        if len(line + spacer + word) > print_width:
            print(line)
            line = '  ' + word
        else:
            line += spacer + word
    print(line)
#--------------------------------------------
# 6330262521 (19.95) 115 (2021-03-01 21:39)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    A new list is always returned and the input is not reordered.
    """
    ordered = sorted(words)
    unique_words = []
    for index, word in enumerate(ordered):
        # Keep the last item of every run of equal neighbours.
        if index == len(ordered) - 1 or word != ordered[index + 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Both sets are de-duplicated with ``get_unique``. Returns 0 when both
    lists are empty.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0
    shared = get_unique([word for word in words_1 if word in words_2])
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef != 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped to ``print_width`` columns.

    Words are taken from ``tweet_content.split()``, so runs of whitespace
    count as one separator. Body lines start with two spaces and carry no
    trailing space.
    """
    print('\n')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = []
    used = 0  # characters taken by the words on the line plus one space each
    for word in tweet_content.split():
        if line and used + len(word) > print_width - 2:
            print('  ' + ' '.join(line))
            line = []
            used = 0
        line.append(word)
        used += len(word) + 1
    print('  ' + ' '.join(line))
#--------------------------------------------
# 6330263121 (13.33) 116 (2021-03-01 22:08)
def get_unique(words):
    """Return the distinct items of ``words``, ordered by last occurrence."""
    unique_words = []
    for index, word in enumerate(words):
        # A word is kept at its final position in the list.
        if word not in words[index + 1:]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    The union size is ``len(words_1)`` plus the words found only in
    ``words_2``. Returns 0 when both lists are empty.
    """
    union_size = len(words_1)
    shared = 0
    for word in words_2:
        if word not in words_1:
            union_size += 1
        else:
            shared += 1
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Each tweet is de-duplicated before scoring. Zero-score tweets are
    skipped and ties are ordered by ascending tweet id. Asking for more
    results than there are tweets returns what is available.
    """
    keyed = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(get_unique(tweet_words), norm_query)
        if coef != 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Wrapping is based on the length of the line being built. Body lines
    are indented by two spaces.
    """
    rounded = round(jc_coef, 2)
    print("")
    print(f'#{tweet_id} ({rounded})')
    line = []
    for word in tweet_content.split(' '):
        if line and 2 + len(" ".join(line)) + 1 + len(word) > print_width:
            print("  " + " ".join(line))
            line = []
        line.append(word)
    print("  " + " ".join(line))
#--------------------------------------------
# 6330264821 (18.50) 117 (2021-03-01 19:41)
def get_unique(words):
    """Return the distinct items of ``words`` ordered by length, then value.

    The input list is not modified, and an empty input gives ``[]``.
    """
    distinct = []
    for word in sorted(words):
        if not distinct or distinct[-1] != word:
            distinct.append(word)
    # Shorter words first; equal lengths stay in ascending order.
    return sorted(distinct, key=lambda word: (len(word), word))


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when both lists are empty.
    """
    shared = 0
    for first in words_1:
        for second in words_2:
            if first == second:
                shared += 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only zero-score tweets are dropped; a zero among the candidates no
    longer empties the whole result. Ties are ordered by ascending id.
    """
    scored = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The final flush happens after the loop, so a word that merely equals
    the last word is not mistaken for the end of the text. Body lines are
    indented by two spaces.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = '  ' + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) > print_width:
            print(line)
            line = '  ' + word
        else:
            line += ' ' + word
    print(line)
#--------------------------------------------
# 6330265421 (20.00) 118 (2021-03-01 12:16)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when both lists are empty.
    """
    shared = 0
    for word in words_1:
        if word in words_2:
            shared += 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    top_n = []
    for neg_coef, tweet_id in keyed[:n]:
        top_n.append([tweet_id, -neg_coef])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines start with two spaces and end right after their last word.
    """
    print('\n#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = '  ' + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    print(line)
#--------------------------------------------
# 6330266021 (0.00) 119 (2021-03-01 23:57)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when the union is empty.
    """
    words_total = []
    for word in words_1 + words_2:
        if word not in words_total:
            words_total.append(word)
    # Test the list itself for emptiness, not equality with the number 0.
    if not words_total:
        return 0
    shared = 0
    for word in words_1:
        if word in words_2:
            shared += 1
    return shared / len(words_total)


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The coefficient is rounded to two decimals. The text is split into
    words on single spaces and wrapped to ``print_width`` columns, each
    body line indented by two spaces.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    line = "  " + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#--------------------------------------------
# 6330267721 (12.68) 120 (2021-02-27 17:56)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when both lists are empty.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0
    count = 0
    for word in words_1:
        if word in words_2:
            count += 1
    return count / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped and ties are ordered by ascending tweet
    id (a plain reverse sort would order tied ids descending).
    """
    candidates = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            candidates.append([tweet_id, coef])
    candidates.sort(key=lambda pair: (-pair[1], pair[0]))
    return candidates[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Words come from ``tweet_content.split()``. Every word is consumed
    exactly once, so a word wider than the line cannot stall the loop.
    Body lines are indented by two spaces.
    """
    jaccard_round = round(jc_coef, 2)
    print()
    print('#' + str(tweet_id), '(' + str(jaccard_round) + ')')
    sentence = []
    for word in tweet_content.split():
        if sentence and 2 + len(' '.join(sentence)) + 1 + len(word) > print_width:
            print('  ' + ' '.join(sentence))
            sentence = []
        sentence.append(word)
    print('  ' + ' '.join(sentence))
#--------------------------------------------
# 6330268321 (20.00) 121 (2021-02-28 16:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


#--------------------------------------------------
def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    The union is ``words_1`` plus the words only in ``words_2``. Returns 0
    when both lists are empty.
    """
    union_size = len(words_1) + sum(1 for word in words_2 if word not in words_1)
    if union_size == 0:
        return 0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / union_size


#---------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    ranked = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            # Negated id so that reading the sorted list backwards gives
            # ascending ids among equal scores.
            ranked.append([coef, -tweet_id])
    ranked.sort()
    top_n = []
    for coef, neg_id in ranked[:-n - 1:-1]:
        top_n.append([-neg_id, coef])
    return top_n


#-------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines start with two spaces; an over-long first word is printed
    without a whitespace-only line before it.
    """
    words = tweet_content.split(" ")
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    line = "  " + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#health care policy 5
#COVID economic crisis 3
# american president 5
#--------------------------------------------
# 6330269021 (17.95) 122 (2021-02-26 20:59)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when both lists are empty.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0
    duplicate_words = [word for word in words_1 if word in words_2]
    return len(duplicate_words) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    The tweet id is the tweet's position, taken from ``enumerate`` so that
    identical tweets keep their own ids. Zero-score tweets are skipped and
    ties are ordered by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    # Stable sort: equal scores stay in ascending-id order.
    scored.sort(reverse=True, key=lambda pair: pair[1])
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Words come from ``tweet_content.split()``; body lines are indented by
    two spaces.
    """
    print()
    print(f"#{tweet_id} ({round(jc_coef, 2)})")
    pool = []
    for word in tweet_content.split():
        if pool and 2 + len(' '.join(pool)) + 1 + len(word) > print_width:
            print('  ' + ' '.join(pool))
            pool = []
        pool.append(word)
    print('  ' + ' '.join(pool))
#--------------------------------------------
# 6330270521 (20.00) 123 (2021-03-01 20:56)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when both lists are empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    union_size = len(words_1) + len(words_2) - shared
    # An explicit test is clearer than catching ZeroDivisionError.
    return shared / union_size if union_size else 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([coef, -tweet_id])
    ranked.sort()
    # Walk the ascending list backwards to get the n best, ids ascending.
    return [[-neg_id, coef] for coef, neg_id in ranked[:-n - 1:-1]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines start with two spaces; an over-long first word is printed
    without a whitespace-only line before it.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = []
    for word in tweet_content.split(' '):
        if line and 2 + len(' '.join(line)) + 1 + len(word) > print_width:
            print('  ' + ' '.join(line))
            line = []
        line.append(word)
    print('  ' + ' '.join(line))
#--------------------------------------------
# 6330271121 (20.00) 124 (2021-02-28 10:43)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    The union starts as a copy of ``words_1`` and gains every new word of
    ``words_2``. Returns 0 when the union is empty.
    """
    shared = 0
    union = list(words_1)
    for word in words_1:
        if word in words_2:
            shared += 1
    for word in words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    ranked = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)  # scored once
        if coef > 0:
            ranked.append([coef, -tweet_id])
    ranked.sort()
    best = ranked[-1:-n - 1:-1]
    return [[-neg_id, coef] for coef, neg_id in best]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Every body line, including continuation lines, starts with two spaces.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = '  ' + words[0]
    for word in words[1:]:
        if len(line) + len(word) + 1 <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    print(line)
#--------------------------------------------
# 6330272821 (11.83) 125 (2021-02-28 23:58)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_word = []
    for item in words:
        if item not in unique_word:
            unique_word.append(item)
    return unique_word


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when both lists are empty.
    """
    union_word = get_unique(words_1 + words_2)
    if not union_word:
        return 0
    repeat_word = [word for word in words_1 if word in words_2]
    return len(repeat_word) / len(union_word)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    The highest scores come first (an ascending sort would return the
    least similar tweets). Zero-score tweets are skipped and ties are
    ordered by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([coef, tweet_id])
    scored.sort(key=lambda item: (-item[0], item[1]))
    return [item[::-1] for item in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The line buffer persists across words and is flushed when the next
    word would not fit. Body lines are indented by two spaces.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    line = "  " + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line = line + " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#--------------------------------------------
# 6330273421 (15.50) 126 (2021-02-27 01:27)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    Works for lists of any length, including a single element, and leaves
    the caller's list in its original order.
    """
    ordered = sorted(words)
    unique_words = []
    for index, word in enumerate(ordered):
        # Keep the last item of every run of equal neighbours.
        if index == len(ordered) - 1 or word != ordered[index + 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    The shorter list is scanned against the longer one. Returns 0 when
    both lists are empty.
    """
    if len(words_1) >= len(words_2):
        scanned, other = words_2, words_1
    else:
        scanned, other = words_1, words_2
    shared = sum(1 for word in scanned if word in other)
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            # Leading negated score is only the sort key.
            keyed.append([-coef, tweet_id, coef])
    keyed.sort()
    return [entry[1:] for entry in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Only the header and the tweet text are printed. Body lines are
    indented by two spaces.
    """
    header_id = '#' + str(tweet_id)
    header_coef = '(' + str(round(jc_coef, 2)) + ')'
    print('')
    print(header_id, header_coef)
    words = tweet_content.split(' ')
    line = '  ' + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    print(line)
#--------------------------------------------
# 6330274021 (20.00) 127 (2021-02-27 16:31)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when both lists are empty.
    """
    union = []
    for word in words_1 + words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0
    same = [word for word in union if word in words_1 and word in words_2]
    return len(same) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in keyed[0:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The text is split on single spaces only, so words are never broken on
    any other character. Body lines are indented by two spaces.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = []
    used = 0  # width of the words on the line plus one separator each
    for word in tweet_content.split(' '):
        if line and used + len(word) > print_width - 2:
            print("  " + ' '.join(line))
            line = []
            used = 0
        line.append(word)
        used += len(word) + 1
    print("  " + ' '.join(line))
#--------------------------------------------
# 6330275721 (20.00) 128 (2021-02-28 17:50)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when both lists are empty.
    """
    lower = []
    for word in words_1 + words_2:
        if word not in lower:
            lower.append(word)
    if not lower:
        return 0
    upper = [word for word in words_1 if word in words_2]
    return len(upper) / len(lower)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines are indented by two spaces and filled greedily; the line
    buffer is a word list, so empty tokens (from doubled spaces) are kept.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    usable = print_width - 2  # columns left after the indent
    line = []
    for word in tweet_content.split(' '):
        if line and len(' '.join(line)) + 1 + len(word) > usable:
            print('  ' + ' '.join(line))
            line = []
        line.append(word)
    print('  ' + ' '.join(line))
#--------------------------------------------
# 6330276321 (18.01) 129 (2021-02-27 01:02)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when both lists are empty.
    """
    shared = 0
    for word in words_1:
        if word in words_2:
            shared += 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets scoring 0 are skipped; ties are ordered by ascending tweet id.
    """
    total = []  # [coefficient, -tweet_id] so a reverse sort ranks correctly
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            total.append([coef, -tweet_id])
    total.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in total[0:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines are indented by two spaces; an over-long first word is not
    preceded by an empty line.
    """
    print("")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    print_text = []
    used = 0  # characters consumed on the current line, separators included
    for word in tweet_content.split(' '):
        # Flush when appending the word would pass width - 2.
        if print_text and used + len(word) > print_width - 2:
            print("  " + " ".join(print_text))
            print_text = []
            used = 0
        print_text.append(word)
        used += len(word) + 1
    print("  " + " ".join(print_text))
#--------------------------------------------
# 6330277021 (20.00) 130 (2021-02-27 23:03)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when both lists are empty.
    """
    down = []
    for word in words_1 + words_2:
        if word not in down:
            down.append(word)
    if not down:
        return 0
    up = [word for word in words_1 if word in words_2]
    return len(up) / len(down)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    top_n = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        jc = jaccard(tweet_words, norm_query)
        if jc > 0:
            top_n.append([tweet_id, jc])
    # One composite key replaces the manual grouping of equal scores.
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines are indented by two spaces and filled greedily.
    """
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    line = " " * 2 + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) > print_width:
            print(line)
            line = " " * 2 + word
        else:
            line += " " + word
    print(line)
#--------------------------------------------
# 6330278621 (18.01) 131 (2021-02-28 00:13)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two lists of distinct words.

    Returns 0 when both lists are empty.
    """
    shared = 0
    for word in words_2:
        if word in words_1:
            shared += 1
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets scoring 0 are skipped; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[tweet_id], norm_query)
        if coef > 0:
            keyed.append([coef, -tweet_id])
    # Descending on (score, -id) means best score first, lowest id first.
    keyed.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in keyed[0:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The last line is flushed after the loop rather than by comparing each
    word with the final word. Body lines have a two-space indent and no
    trailing space.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    usable = print_width - 2
    message = []
    for word in tweet_content.split(' '):
        if message and len(' '.join(message)) + 1 + len(word) > usable:
            print('  ' + ' '.join(message))
            message = []
        message.append(word)
    print('  ' + ' '.join(message))
#--------------------------------------------
# 6330279221 (20.00) 132 (2021-03-01 14:36)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    A new list is always returned; the caller's list keeps its order.
    """
    ordered = sorted(words)
    unique_words = []
    for index, word in enumerate(ordered):
        # Keep the last item of each run of equal neighbours.
        if index == len(ordered) - 1 or word != ordered[index + 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Returns 0 when both lists are empty.
    """
    total = get_unique(words_1 + words_2)
    if not total:
        return 0
    both = [word for word in words_1 if word in words_2]
    return len(both) / len(total)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-score tweets are skipped; ties are ordered by ascending tweet id.
    """
    keyed = []
    for tweet_id in range(len(norm_tweets)):
        jac = jaccard(norm_tweets[tweet_id], norm_query)
        if jac > 0:
            keyed.append([-jac, tweet_id])
    keyed.sort()
    top_n = [[tweet_id, -neg_jac] for neg_jac, tweet_id in keyed]
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Body lines are indented by two spaces. A word that exactly fills an
    empty line is placed on it without a blank line before it.
    """
    print()
    twid = '#' + str(tweet_id)
    jc = '(' + str(round(jc_coef, 2)) + ')'
    print(twid, jc)
    current = []
    for word in tweet_content.split(' '):
        if current and len(' '.join(current)) + len(word) + 3 > print_width:
            print('  ' + ' '.join(current))
            current = [word]
        else:
            current.append(word)
    print('  ' + ' '.join(current))
#--------------------------------------------
# 6330280821 (16.80) 133 (2021-03-01 21:48)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The input list is not modified.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered) - 1):
        if ordered[i] != ordered[i + 1]:
            unique_words.append(ordered[i])
    # The last item always closes a run; this also covers inputs whose
    # items are all equal (e.g. a single word).
    if ordered:
        unique_words.append(ordered[-1])
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    c = get_unique([e for e in words_1 if e in words_2])
    b = get_unique(words_1 + words_2)
    if not b:
        return 0
    return len(c) / len(b)


def top_n_similarity(norm_tweet, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped; ties go to the smaller id.
    """
    x = []
    for i, tweet in enumerate(norm_tweet):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            x.append([-coef, i])
    x.sort()
    return [[i, -neg_coef] for neg_coef, i in x[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Each output line starts with two spaces and stays within
    ``print_width`` columns unless a single word is wider than that.
    """
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    s = None  # current line; None until the first word has been placed
    for word in tweet_content.split(' '):
        if s is None:
            s = "  " + word
        elif len(s) + 1 + len(word) <= print_width:
            s += " " + word
        else:
            print(s)
            s = "  " + word
    print(s)
#--------------------------------------------
# 6330281421 (18.01) 134 (2021-02-26 22:38)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if not (word in unique_words):
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    set_of_words = get_unique(words_1 + words_2)
    if not set_of_words:
        return 0
    set_of_sames = [
        word for word in set_of_words
        if word in words_1 and word in words_2
    ]
    return len(set_of_sames) / len(set_of_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped.  Sorting is by descending
    coefficient, then ascending tweet id.
    """
    all_list = []
    for k, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            all_list.append([k, coef])
    all_list.sort(key=lambda pair: (-pair[1], pair[0]))
    return all_list[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Words (split on single spaces) are grouped into lines that, with a
    two-space indent, fit in ``print_width`` columns.  A word that is
    too wide is placed alone on its line.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    lines = [[]]
    for word in tweet_content.split(' '):
        current = lines[-1]
        if current and len('  ' + ' '.join(current)) + 1 + len(word) > print_width:
            lines.append([word])
        else:
            current.append(word)
    for current in lines:
        print('  ' + ' '.join(current))
#--------------------------------------------
# 6330282021 (20.00) 135 (2021-02-25 23:20)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The caller's list is left as it was (a sorted copy is scanned).
    """
    ordered = sorted(words)
    unique_words = ordered[:1]
    for previous, current in zip(ordered, ordered[1:]):
        if current != previous:
            unique_words.append(current)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    words_in_both = sum(1 for word in first if word in second)
    total_words = len(first) + len(second) - words_in_both
    if total_words == 0:
        return 0
    return words_in_both / total_words


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; equal coefficients are listed
    by ascending tweet id.
    """
    top = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top.append([coef, tweet_id])
    top.sort(key=lambda item: (-item[0], item[1]))
    return [[tweet_id, coef] for coef, tweet_id in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Lines carry a two-space indent and are filled greedily up to
    ``print_width`` columns; a word wider than that gets its own line.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    show = []
    total_len = 2  # the indent is always present
    for word in tweet_content.split(' '):
        needed = len(word) + (1 if show else 0)
        if show and total_len + needed > print_width:
            print('  ' + ' '.join(show))
            show = []
            total_len = 2
            needed = len(word)  # first word of a line needs no separator
        show.append(word)
        total_len += needed
    print('  ' + ' '.join(show))
#--------------------------------------------
# 6330283721 (17.57) 136 (2021-02-27 09:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    a = [word for word in first if word in second]
    b = len(first) + len(second) - len(a)
    if b == 0:
        return 0
    return len(a) / b


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped; ties go to the smaller id.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # computed once per tweet
        if coef > 0:
            top_n.append([i, coef])
    # Reverse sort on (coef, -id): high coefficient first, then low id.
    top_n.sort(key=lambda l: (l[1], -l[0]), reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The wrapped text is assembled into one string, with ``"\\n  "``
    inserted wherever the next word would pass ``print_width`` columns.
    """
    words = tweet_content.split(" ")
    print("\n#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    a = "  " + words[0]
    count = len(a)  # columns used on the line being built
    for word in words[1:]:
        if count + 1 + len(word) > print_width:
            a += "\n  " + word
            count = 2 + len(word)
        else:
            a += " " + word
            count += 1 + len(word)
    print(a)
#--------------------------------------------
# 6330284321 (20.00) 137 (2021-03-01 20:23)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for x in words:
        if x not in unique_words:
            unique_words.append(x)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = 0
    for x in second:
        if x in first:
            shared += 1
    total = len(first) + len(second) - shared
    if total == 0:
        return 0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    The list stops at the first zero coefficient and is safe when ``n``
    is larger than the number of tweets.  Ties go to the smaller id.
    """
    ll = []
    for idx, val in enumerate(norm_tweets):
        ll.append([-jaccard(val, norm_query), idx])
    ll.sort()
    top_n = []
    for neg_coef, idx in ll[:n]:
        if neg_coef == 0:
            break  # everything after this point shares nothing with the query
        top_n.append([idx, -neg_coef])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Every line starts with two spaces; words are appended while the line
    stays within ``print_width`` columns.  Overlong words stand alone.
    """
    print("")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    line = "  "
    fresh = True  # True while the line holds only its indent
    for i in tweet_content.split(" "):
        if fresh:
            line += i
            fresh = False
        elif len(line) + 1 + len(i) <= print_width:
            line += " " + i
        else:
            print(line)
            line = "  " + i
    print(line)
#--------------------------------------------
# 6330286621 (20.00) 138 (2021-02-28 04:44)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    words1 = get_unique(words_1)
    words2 = get_unique(words_2)
    shared = 0
    only_first = 0
    for i in words1:
        if i in words2:
            shared += 1
        else:
            only_first += 1
    # Union = everything in words2 plus what appears only in words1.
    total = len(words2) + only_first
    if total == 0:
        return 0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; ties go to the smaller id.
    """
    top = []
    k = len(norm_tweets)
    for i in range(k):
        j = jaccard(norm_query, norm_tweets[i])
        if j > 0:
            # [jaccard, reverse_index, index]: sorting reverse_index from
            # high to low yields index from low to high.
            top.append([j, k - i, i])
    top.sort(reverse=True)
    return [[r[2], r[0]] for r in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Words are written as they are placed; a line break is emitted when
    the next word would pass ``print_width`` columns, and the final line
    is terminated with a newline.
    """
    print('\n#{} ({})'.format(tweet_id, round(jc_coef, 2)))
    n = 0  # columns used on the current line; 0 means nothing printed yet
    for i in tweet_content.split(' '):
        if n and n + 1 + len(i) > print_width:
            print()
            n = 0
        if n == 0:
            print('  {}'.format(i), end='')
            n = 2 + len(i)
        else:
            print(' {}'.format(i), end='')
            n += 1 + len(i)
    print()
#--------------------------------------------
# 6330288921 (18.50) 139 (2021-03-01 17:02)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word in unique_words:
            continue
        unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    a = get_unique(words_1)
    b = get_unique(words_2)
    # A word of b that is already in a is counted once as shared.
    w = sum(1 for word in b if word in a)
    n = len(a) + len(b) - w
    if n == 0:
        return 0
    return w / n


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped.  Candidates are collected in
    id order and sorted stably, so ties keep the smaller id first.
    """
    def two(p):
        return p[1]

    p = []
    for j, tweet in enumerate(norm_tweets):
        u = jaccard(tweet, norm_query)
        if u > 0:
            p.append([j, u])
    return sorted(p, key=two, reverse=True)[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Lines are indented by two spaces and filled up to ``print_width``
    columns; a word wider than the line is printed alone.
    """
    print('\n' + '#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    pending = []
    for e in tweet_content.split(' '):
        candidate = pending + [e]
        if pending and len('  ' + ' '.join(candidate)) > print_width:
            print('  ' + ' '.join(pending))
            pending = [e]
        else:
            pending = candidate
    print('  ' + ' '.join(pending))
#--------------------------------------------
# 6330289521 (20.00) 140 (2021-03-01 23:16)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words += [e]
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty (previously an IndexError).
    """
    x = get_unique([c for c in words_1 if c in words_2])
    y = sorted(words_1 + words_2)
    if not y:
        return 0
    # Walk the sorted merge and keep one representative per run.
    z = [y[0]]
    for i in range(len(y) - 1):
        if y[i] != y[i + 1]:
            z += [y[i + 1]]
    return len(x) / len(z)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped; equal coefficients are listed
    by ascending tweet id.
    """
    x = []
    for tweet_id, tweet in enumerate(norm_tweets):
        z = jaccard(tweet, norm_query)
        if z != 0:
            x += [[z, tweet_id]]
    x.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, z] for z, tweet_id in x[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Each line starts with two spaces and receives words while it stays
    within ``print_width`` columns; no trailing space is emitted.
    """
    t = tweet_content.split(' ')
    print("\n" + "#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    l = "  " + t[0]
    for e in t[1:]:
        if len(l) + 1 + len(e) <= print_width:
            l += " " + e
        else:
            print(l)
            l = "  " + e
    print(l)
#--------------------------------------------
# 6330290021 (19.68) 141 (2021-02-27 01:50)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    allwords = get_unique(words_1 + words_2)
    samewords = []
    for i in words_1:
        if i in words_2 and i not in samewords:
            samewords.append(i)
    if len(allwords) > 0:
        return len(samewords) / len(allwords)
    return 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped.  The stable sort keeps equal
    coefficients in ascending id order.
    """
    def sortsecond(jac):
        return jac[1]

    top_n = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # evaluated once per tweet
        if coef > 0:
            top_n.append([i, coef])
    top_n.sort(key=sortsecond, reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    After placing a word the function looks at the following one and
    flushes the line if that word would not fit in ``print_width``
    columns.  Words wider than the line are printed alone, never lost.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    show = []
    for i, word in enumerate(words):
        show.append(word)
        is_last = i == len(words) - 1
        if is_last or len('  ' + ' '.join(show)) + 1 + len(words[i + 1]) > print_width:
            print('  ' + ' '.join(show))
            show = []
#--------------------------------------------
# 6330291721 (15.78) 142 (2021-02-27 22:11)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    intersection = 0
    for i in first:
        if i in second:
            intersection += 1
    union = (len(first) + len(second)) - intersection
    if union == 0:
        return 0
    return intersection / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped; ties go to the smaller id.
    Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    sort_tweet = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            sort_tweet.append([tweet_id, coef])
    sort_tweet.sort(key=lambda pair: (-pair[1], pair[0]))
    return sort_tweet[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The word list is cut into consecutive slices; a slice is closed as
    soon as the next word would push the two-space-indented line past
    ``print_width`` columns.
    """
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    t = tweet_content.split(' ')
    start = 0
    width = 2 + len(t[0])  # indent plus first word
    for i in range(1, len(t)):
        if width + 1 + len(t[i]) > print_width:
            print('  ' + ' '.join(t[start:i]))
            start = i
            width = 2 + len(t[i])
        else:
            width += 1 + len(t[i])
    print('  ' + ' '.join(t[start:]))
#--------------------------------------------
# 6330292321 (19.37) 143 (2021-02-28 21:29)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    ans = []
    ans.extend(words_1)
    ans.extend(words_2)
    ans = get_unique(ans)
    m = len(ans)
    if m == 0:
        return 0
    n = 0
    for i in ans:
        if i in words_1 and i in words_2:
            n += 1
    return n / m


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped; ties go to the smaller id.
    """
    q = []
    for i, tweet in enumerate(norm_tweets):
        j = jaccard(tweet, norm_query)
        if j > 0:
            q.append([i, j])
    y = sorted(q, key=lambda s: (s[1], -s[0]), reverse=True)
    return y[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Wrapping works on whole words (split on single spaces), so a word
    wider than ``print_width`` is printed intact on its own line.
    """
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    limit = print_width - 2  # text columns available after the indent
    pieces = []
    length = 0
    for word in tweet_content.split(' '):
        extra = len(word) if not pieces else len(word) + 1
        if pieces and length + extra > limit:
            print("  " + " ".join(pieces))
            pieces = [word]
            length = len(word)
        else:
            pieces.append(word)
            length += extra
    print("  " + " ".join(pieces))
#--------------------------------------------
# 6330293021 (19.70) 144 (2021-03-01 14:36)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for x in words:
        if x not in unique_words:
            unique_words.append(x)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0
    shared = 0
    for x in get_unique(words_2):
        if x in words_1:
            shared += 1
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped.  Tweets are grouped by equal
    coefficient (best group first) and each group is ordered by
    ascending id; a lone match is returned as well.
    """
    scored = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([coef, i])
    scored.sort(reverse=True)

    ordered = []
    group = []
    for item in scored:
        if group and group[0][0] != item[0]:
            # Coefficient changed: emit the finished group in id order.
            ordered.extend(sorted(group))
            group = []
        group.append(item)
    ordered.extend(sorted(group))
    return [[tweet_id, coef] for coef, tweet_id in ordered[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The lines are collected first and printed afterwards, each with a
    two-space indent and at most ``print_width`` columns (except for a
    single word that is wider than that).
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    lines = []
    current = ''
    started = False
    for e in tweet_content.split(' '):
        if not started:
            current = e
            started = True
        elif len(current) + 1 + len(e) <= print_width - 2:
            current += ' ' + e
        else:
            lines.append(current)
            current = e
    lines.append(current)
    for e in lines:
        print('  ' + e)
#--------------------------------------------
# 6330294621 (12.91) 145 (2021-02-28 22:56)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The caller's list is not reordered.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered) - 1):
        if ordered[i] != ordered[i + 1]:
            unique_words += [ordered[i]]
    if len(ordered) != 0:
        unique_words += [ordered[-1]]
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    rpt_wd = [e for e in first if e in second]
    total = len(first) + len(second) - len(rpt_wd)
    if total == 0:
        return 0
    return len(rpt_wd) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped, ties go to the smaller id,
    and asking for more results than exist simply returns them all.
    """
    atww = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            atww.append([-coef, i])
    atww.sort()
    return [[i, -neg_coef] for neg_coef, i in atww[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Lines carry a two-space indent and are filled word by word up to
    ``print_width`` columns.
    """
    print()
    print('#' + str(tweet_id), '(' + (str(round(jc_coef, 2))) + ')')
    prt = []
    for word in tweet_content.split(' '):
        if prt and len(' '.join(prt + [word])) > print_width - 2:
            print('  ' + ' '.join(prt))
            prt = []
        prt.append(word)
    print('  ' + ' '.join(prt))
#--------------------------------------------
# 6330295221 (18.33) 146 (2021-02-27 23:43)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    shared = []
    union = []
    for e in words_1:
        if e in words_2 and e not in shared:
            shared.append(e)
        if e not in union:
            union.append(e)
    for e in words_2:
        if e not in union:
            union.append(e)
    if not union:
        return 0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are skipped; ties go to the smaller id.
    """
    b = []
    for i, tweet in enumerate(norm_tweets):
        nub = jaccard(tweet, norm_query)
        if nub > 0:
            b.append([-nub, i])
    b.sort()
    return [[e[1], -1 * e[0]] for e in b][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The loop always advances to the next word, so a word wider than
    ``print_width`` is printed on its own line instead of stalling.
    """
    print('')
    print('#' + str(tweet_id) + " " + '(' + str(round(jc_coef, 2)) + ')')
    t = tweet_content.split(' ')
    s = '  ' + t[0]
    i = 1
    while i < len(t):
        if len(s) + 1 + len(t[i]) <= int(print_width):
            s += ' ' + t[i]
        else:
            print(s)
            s = '  ' + t[i]
        i += 1
    print(s)
#--------------------------------------------
# 6330296921 (17.03) 147 (2021-03-01 23:48)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    mid = [i for i in first if i in second]
    nw1 = len(first)
    nw2 = len(second)
    nm = len(mid)
    if nw1 + nw2 - nm == 0:
        return 0
    return nm / (nw1 + nw2 - nm)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    The tweet id is the position in ``norm_tweets`` (so identical tweets
    keep distinct ids).  Zero coefficients are skipped and ties go to
    the smaller id.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The outer loop starts a line with the next word; the inner loop
    keeps adding words while the indented line fits ``print_width``.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    x = tweet_content.split(' ')
    i = 0
    while i < len(x):
        s = x[i]
        i += 1
        while i < len(x) and 2 + len(s) + 1 + len(x[i]) <= print_width:
            s += ' ' + x[i]
            i += 1
        print('  ' + s)
#--------------------------------------------
# 6330298121 (19.25) 148 (2021-02-27 23:41)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    n = 0
    for e in first:
        if e in second:
            n += 1
    tn = len(first) + len(second) - n
    if tn == 0:
        return 0
    return n / tn


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped.  Runs of equal coefficient are sorted
    by id and every run, including the final one, is emitted.
    """
    scored = []
    for i, tweet in enumerate(norm_tweets):
        b = jaccard(tweet, norm_query)
        if b > 0:
            scored.append([b, i])
    scored.sort(reverse=True)
    pairs = [item[::-1] for item in scored]  # -> [tweet_id, coefficient]

    c = []
    b = []
    for pair in pairs:
        if b and b[-1][1] != pair[1]:
            b.sort()
            c.extend(b)
            b = []
        b.append(pair)
    b.sort()
    c.extend(b)  # flush the last run
    return c[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Lines are indented by two spaces and filled up to ``print_width``
    columns; an overlong word is printed alone.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    a = tweet_content.split(" ")
    line = "  " + a[0]
    for word in a[1:]:
        if len(" " + word) + len(line) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#--------------------------------------------
# 6330299821 (20.00) 149 (2021-02-26 22:20)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when the inputs contain no words at all.
    """
    samewords = []
    allwords = []
    for word in words_1:
        if word not in allwords:
            allwords.append(word)
        if word in words_2 and word not in samewords:
            samewords.append(word)
    for word in words_2:
        if word not in allwords:
            allwords.append(word)
    if len(allwords) == 0:
        return 0
    return len(samewords) / len(allwords)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped; ties go to the smaller id.  One keyed
    sort replaces the earlier repeated sorting and list insertion.
    """
    b = []
    for i, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc != 0:
            b.append([i, jc])
    b.sort(key=lambda d: (-d[1], d[0]))
    return b[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Each word is appended to the current line, then the line is printed
    if it is the last word or if the following word would not fit in
    ``print_width`` columns.
    """
    print("")
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    content_word = tweet_content.split(' ')
    last = len(content_word) - 1
    line = " "  # one space; each word adds " " + word, giving a 2-space indent
    for i, word in enumerate(content_word):
        line += ' ' + word
        if i == last or len(line + ' ' + content_word[i + 1]) > print_width:
            print(line)
            line = " "
#--------------------------------------------
# 6330300721 (17.00) 150 (2021-02-28 02:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Always returns a number: 0 when both inputs are empty.
    """
    words_1 = get_unique(words_1)
    words_2 = get_unique(words_2)
    z = [e for e in words_1 if e in words_2]
    s = list(words_1)  # union, built on a copy
    for e in words_2:
        if e not in s:
            s.append(e)
    if len(s) == 0:
        return 0
    return len(z) / len(s)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped; the result is in descending
    coefficient order with ties going to the smaller id.
    """
    list_jaccard = []
    for j, tweet in enumerate(norm_tweets):
        each_jaccard = jaccard(tweet, norm_query)
        if each_jaccard > 0:
            list_jaccard.append([each_jaccard, j])
    list_jaccard.sort(key=lambda e: (-e[0], e[1]))
    return [[j, coef] for coef, j in list_jaccard[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The last (or only) line is always printed.  Lines are indented by
    two spaces and stay within ``print_width`` columns where possible.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    text = tweet_content.split(' ')
    show = text[0]
    for word in text[1:]:
        if len(show + ' ' + word) <= print_width - 2:
            show = show + ' ' + word
        else:
            print('  ' + show)
            show = word
    print('  ' + show)
#--------------------------------------------
# 6330301321 (17.72) 151 (2021-03-01 16:02)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The caller's list is not reordered.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered)):
        if i == 0 or ordered[i] != ordered[i - 1]:
            unique_words.append(ordered[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    The exact ratio is returned (rounding is left to the display code,
    so ranking is not distorted).  Returns 0 for two empty inputs.
    """
    # De-duplicate each side first: the merged length minus the union
    # size is then exactly the number of shared words.
    w12 = get_unique(words_1) + get_unique(words_2)
    aub = len(get_unique(w12))
    ainb = len(w12) - aub
    if aub == 0:
        return 0
    return ainb / aub


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped.  The stable sort keeps equal
    coefficients in ascending id order.
    """
    def sortkey(twt):
        return twt[1]

    result = []
    for i, tweet in enumerate(norm_tweets):
        jco = jaccard(tweet, norm_query)
        if jco > 0:
            result.append([i, jco])
    result.sort(reverse=True, key=sortkey)
    return result[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The coefficient is shown rounded to two places.  A word is added to
    the line first; if the line then exceeds ``print_width`` columns the
    earlier words are printed and the new word starts the next line.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = []
    for word in tweet_content.split(' '):
        line.append(word)
        if len(line) > 1 and len('  ' + ' '.join(line)) > print_width:
            print('  ' + ' '.join(line[:-1]))
            line = [word]
    print('  ' + ' '.join(line))
#--------------------------------------------
# 6330302021 (12.78) 152 (2021-02-28 20:14)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The caller's list is not reordered.
    """
    unique_words = []
    for word in sorted(words):
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Each shared word counts once.  Returns 0 for two empty inputs.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    count = 0
    for k in first:
        if k in second:
            count += 1
    total = len(first) + len(second) - count
    if total == 0:
        return 0
    return count / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Every tweet with coefficient 0 is removed before ranking; ties go to
    the smaller id.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([i, coef])
    top_n = sorted(top_n, key=lambda item: (item[1], -item[0]), reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    ``count`` tracks the text columns already used on the line (without
    the two-space indent); a new line starts when the next word and its
    separating space would exceed ``print_width - 2``.
    """
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    y = tweet_content.split(' ')
    x = y[0]
    count = len(y[0])
    for word in y[1:]:
        if count + 1 + len(word) <= int(print_width - 2):
            x += ' ' + word
            count += 1 + len(word)
        else:
            print('  ' + str(x))
            x = word
            count = len(word)
    print('  ' + str(x))
#--------------------------------------------
# 6330303621 (17.50) 153 (2021-02-28 11:34)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word in unique_words:
            continue
        unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    words_1 = get_unique(words_1)
    words_2 = get_unique(words_2)
    words_3 = words_1 + words_2
    seen = []
    n = 0
    for word in words_3:
        if word in seen:
            n += 1  # second sighting: the word is on both sides
        else:
            seen.append(word)
    a = len(seen)
    if a == 0:
        return 0
    return n / a


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped.  Tweets with the same coefficient are
    gathered into a group that is sorted by id; the last group is
    flushed too.
    """
    b = []
    for tweet_id, tweet in enumerate(norm_tweets):
        f = jaccard(tweet, norm_query)
        if f > 0:
            b.append([f, tweet_id])
    c = sorted(b, reverse=True)

    top_n = []
    top_temp = []
    for coef, tweet_id in c:
        if top_temp and top_temp[0][1] != coef:
            top_n = top_n + sorted(top_temp)
            top_temp = []
        top_temp.append([tweet_id, coef])
    top_n = top_n + sorted(top_temp)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    The wrapped lines are joined with newlines and printed in one call;
    each starts with two spaces and stays within ``print_width`` columns
    unless a single word is wider.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    rows = []
    row = ''
    n = 0  # columns used by the current row, indent included
    for i, word in enumerate(tweet_content.split(' ')):
        if i > 0 and n + 1 + len(word) <= print_width:
            row += ' ' + word
            n += 1 + len(word)
        else:
            if i > 0:
                rows.append(row)
            row = '  ' + word
            n = len(row)
    rows.append(row)
    print('\n'.join(rows))
#--------------------------------------------
# 6330304221 (18.01) 154 (2021-02-28 20:56)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    union_words = get_unique(words_1 + words_2)
    if not union_words:
        return 0
    intercept_words = get_unique(
        [word for word in words_1 if word in words_2]
    )
    return len(intercept_words) / len(union_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped and an empty tweet list gives ``[]``.
    The result has descending coefficients with ascending ids inside
    each tie.
    """
    # all_n holds [jc_coef, id] for every tweet that matches the query.
    all_n = []
    for idx, tweet in enumerate(norm_tweets):
        jc = jaccard(norm_query, tweet)
        if jc > 0:
            all_n.append([jc, idx])
    all_n.sort(key=lambda item: (-item[0], item[1]))
    return [[idx, jc] for jc, idx in all_n[0:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Words are written one at a time; ``check_limit`` is the number of
    columns already used on the current output line.
    """
    print('\n#{} ({})'.format(tweet_id, round(jc_coef, 2)))
    check_limit = 0
    for position, word in enumerate(tweet_content.split(' ')):
        if position == 0:
            word2show = '  ' + word
        elif check_limit + 1 + len(word) <= print_width:
            word2show = ' ' + word
        else:
            word2show = '\n  ' + word
            check_limit = -1  # the newline itself takes no column
        print(word2show, end='')
        check_limit += len(word2show)
    print()
#--------------------------------------------
# 6330305921 (18.01) 155 (2021-02-27 20:57)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The caller's list is not reordered.
    """
    ordered = sorted(words)
    unique_words = []
    if len(ordered) != 0:
        c = ordered[0]
        unique_words.append(c)
        for word in ordered[1:]:
            if word != c:
                c = word
                unique_words.append(c)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Returns 0 when both inputs are empty.
    """
    j1 = get_unique([e for e in words_1 if e in words_2])
    j2 = get_unique(words_1 + words_2)
    if not j2:
        return 0
    return len(j1) / len(j2)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped; ties go to the smaller id.
    """
    s = []
    for i, tweet in enumerate(norm_tweets):
        j = jaccard(tweet, norm_query)
        if j > 0:
            # Negated id lets one descending sort prefer smaller ids.
            s.append([j, -i])
    s.sort(reverse=True)
    return [[-neg_i, j] for j, neg_i in s[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    ``k`` holds the words of the current line; it is flushed when the
    next word (plus indent and separating space) would not fit in
    ``print_width`` columns.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    k = []
    for e in tweet_content.split(' '):
        if len(k) != 0 and len(' '.join(k)) + len(e) + 3 > print_width:
            print('  ' + ' '.join(k))
            k = []
        k.append(e)
    print('  ' + ' '.join(k))
#--------------------------------------------
# 6330306521 (14.37) 156 (2021-03-01 22:02)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if (word in unique_words) == False:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared distinct words divided by total distinct words.

    Neither argument is modified.  Returns 0 for two empty inputs.
    """
    a = get_unique(words_1 + words_2)  # new list: words_1 stays untouched
    if not a:
        return 0
    b = 0
    for word in get_unique(words_2):
        if word in words_1:
            b += 1
    return b / len(a)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped; ties of any length go to the smaller
    id; fewer than ``n`` matches are returned as they are.
    """
    a = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            a.append([coef, i])
    a.sort(key=lambda item: (-item[0], item[1]))
    return [[i, coef] for coef, i in a[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the word-wrapped content.

    Every word is printed: one that does not fit on the current line
    starts the next line.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    b = tweet_content.split(' ')
    a = ' '  # one space; " " + word below completes the two-space indent
    for i in range(len(b)):
        a += ' ' + b[i]
        is_last = i + 1 == len(b)
        if is_last or len(a) + 1 + len(b[i + 1]) > print_width:
            print(a)
            a = ' '
#--------------------------------------------
# 6330308821 (20.00) 157 (2021-02-27 19:23)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first.

    The input list is only read (the earlier version emptied it with pop(0)).
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts items of ``words_1`` found in ``words_2``.
    Returns 0.0 when the denominator is zero (both lists empty).
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id.
    """
    scored = [[jaccard(tweet, norm_query), -tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    scored.sort(reverse=True)
    return [[-neg_id, score] for score, neg_id in scored if score > 0][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header and the wrapped content.

    The last line is flushed once after the loop; the earlier version flushed
    whenever a word merely equalled the final word, duplicating partial lines.
    """
    lines = []
    current = " "
    for word in tweet_content.split(' '):
        current += ' ' + word
        if len(current) > print_width:
            # Close the line just before the overflowing word (its separator
            # space stays on the closed line) and restart with that word.
            lines.append(current[:len(current) - len(word)])
            # Continuation lines get the same two-space indent as the first.
            current = "  " + word
    lines.append(current)
    print(" ")
    print(f"#{tweet_id} ({round(jc_coef, 2)})")
    print(*lines, sep="\n")
#--------------------------------------------
# 6330309421 (17.60) 158 (2021-03-01 20:37)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    seen = set()
    unique_words = []
    for word in words:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return common / (len(words_1) + len(words_2) - common).

    ``common`` is the number of distinct items present in both lists.
    Returns 0.0 when the denominator is zero (both lists empty).
    """
    common = len(set(words_1).intersection(words_2))
    union = len(words_1) + len(words_2) - common
    if union == 0:
        return 0.0
    return float(common) / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, best match first.

    Ties are ordered by ascending tweet id; zero-score tweets are kept.
    Slicing replaces end-relative indexing, which wrapped around and then
    raised IndexError when ``n`` exceeded the number of tweets.
    """
    scored = [[tweet_id, jaccard(tweet, norm_query)]
              for tweet_id, tweet in enumerate(norm_tweets)]
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Lines start with two spaces; ``used`` tracks the columns consumed on the
    current line, including each word's trailing space.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    used = 2
    text = "  "
    for word in tweet_content.split(" "):
        used += len(word) + 1
        if used - 1 == print_width:
            # Exact fit once the trailing space is left off.
            text += word
            used -= 1
        elif used <= print_width:
            text += word + " "
        else:
            text += "\n  " + word + " "
            used = len(word) + 3  # indent (2) + word + trailing space
    print(text)
#--------------------------------------------
# 6330310021 (17.54) 159 (2021-03-01 16:47)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared), or 0 if empty.

    ``shared`` counts items of ``words_1`` found in ``words_2``.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id. Fewer than ``n``
    matches simply yields a shorter list (previously an IndexError).
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([tweet_id, score])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    The width check is done per line; the earlier cumulative bookkeeping
    drifted by one column for every wrapped line.
    """
    print("\n#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    text = " "
    line_len = 1  # length of the line currently being filled
    for word in tweet_content.split(' '):
        # line_len == 1 means nothing is on the first line yet.
        if line_len == 1 or line_len + 1 + len(word) <= print_width:
            text += " " + word
            line_len += 1 + len(word)
        else:
            text += "\n  " + word
            line_len = 2 + len(word)
    print(text)
#--------------------------------------------
# 6330311621 (20.00) 160 (2021-02-28 23:34)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared), or 0 if empty.

    ``shared`` counts items of ``words_1`` found in ``words_2``.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    return shared / total if total != 0 else 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            # Negated score: an ascending sort puts the best match first and
            # falls back to the smaller tweet id on ties.
            scored.append([-score, tweet_id])
    scored.sort()
    return [[tweet_id, -neg_score] for neg_score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Content lines are indented by two spaces. A first word wider than the
    line is printed as-is instead of being preceded by an indent-only line.
    """
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = " "
    for word in tweet_content.split(' '):
        # line == " " only before the very first word has been placed.
        if line == " " or len(line) + len(word) < print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#--------------------------------------------
# 6330312221 (20.00) 161 (2021-02-26 16:07)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct items in both lists) / (distinct items in either).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    common = [word for word in union if word in words_1 and word in words_2]
    return len(common) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id. A single keyed
    sort replaces the hand-maintained top-n table and bubble sort.
    """
    matches = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            matches.append([tweet_id, score])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Each content line is indented by two spaces, so the text itself gets
    ``print_width - 2`` columns.
    """
    budget = print_width - 2
    lines = []
    current = ''
    for word in tweet_content.split(' '):
        candidate = current + ' ' + word if current else word
        if current and len(candidate) > budget:
            lines.append(current)
            current = word
        else:
            current = candidate
    lines.append(current)
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    for line in lines:
        print(' ' * 2 + line)
#--------------------------------------------
# 6330313921 (20.00) 162 (2021-03-01 22:44)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts items of ``words_2`` found in ``words_1``.
    Returns 0.0 when the denominator is zero (both lists empty).
    """
    shared = sum(1 for word in words_2 if word in words_1)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a non-zero score.

    Ordered by descending score, ties by ascending tweet id. Iterating a
    slice avoids the IndexError raised when ``n`` exceeded the tweet count.
    """
    scored = sorted([-jaccard(tweet, norm_query), tweet_id]
                    for tweet_id, tweet in enumerate(norm_tweets))
    top_n = []
    for neg_score, tweet_id in scored[:n]:
        if neg_score == 0:
            break  # everything after this point scores zero as well
        top_n.append([tweet_id, -neg_score])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Lines start with two spaces; ``used`` tracks the columns consumed on the
    current line, including each word's trailing space.
    """
    print('\n' + '#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    used = 2
    text = '  '
    for word in tweet_content.split(" "):
        used += len(word) + 1
        if used - 1 == print_width:
            # Exact fit once the trailing space is left off.
            text += word
            used -= 1
        elif used <= print_width:
            text += word + ' '
        else:
            text += '\n  ' + word + ' '
            used = len(word) + 3  # indent (2) + word + trailing space
    print(text)
#--------------------------------------------
# 6330314521 (17.60) 163 (2021-02-27 15:14)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, best match first.

    Ties are ordered by ascending tweet id; zero-score tweets are kept.
    Slicing replaces end-relative indexing, which wrapped around and then
    raised IndexError when ``n`` exceeded the number of tweets.
    """
    scored = [[tweet_id, jaccard(tweet, norm_query)]
              for tweet_id, tweet in enumerate(norm_tweets)]
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Lines start with two spaces; ``used`` tracks the columns consumed on the
    current line, including each word's trailing space.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    used = 2
    text = "  "
    for word in tweet_content.split(" "):
        used += len(word) + 1
        if used - 1 == print_width:
            # Exact fit once the trailing space is left off.
            text += word
            used -= 1
        elif used <= print_width:
            text += word + " "
        else:
            text += "\n  " + word + " "
            used = len(word) + 3  # indent (2) + word + trailing space
    print(text)
#--------------------------------------------
# 6330315121 (20.00) 164 (2021-03-01 14:49)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    all_words = get_unique(words_1 + words_2)
    if not all_words:
        return 0.0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / len(all_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id. Each tweet is
    scored once (it used to be scored twice).
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            # Negated id: the descending sort then prefers the smaller id.
            scored.append([score, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, score] for score, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Every word is written followed by one space; a new line is started when
    the word itself would no longer fit within ``print_width``.
    """
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    # NOTE(review): two-space indent restored; the collapsed source lost
    # repeated spaces inside string literals - confirm against the original.
    line = '  '
    for word in tweet_content.split(' '):
        if len(line) + len(word) <= print_width:
            line += word + ' '
        else:
            print(line)
            line = '  ' + word + ' '
    print(line)
#--------------------------------------------
# 6330316821 (20.00) 165 (2021-03-01 22:01)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0 when both lists are empty.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            # Negated id: the descending sort then prefers the smaller id.
            scored.append([score, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, score] for score, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header and the wrapped content.

    Every completed line is padded with spaces up to ``print_width``; the
    last line is left unpadded. Lines are indented by two spaces.
    """
    print(" ")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    lines = [" "]
    for word in tweet_content.split(" "):
        if len(lines[-1]) + len(word) + 1 <= print_width:
            lines[-1] += " " + word
        else:
            lines[-1] = lines[-1].ljust(print_width)
            lines.append(" " * 2 + word)
    print("\n".join(lines))
#--------------------------------------------
# 6330317421 (20.00) 166 (2021-03-01 22:20)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |A & B| / |A | B| over the distinct items of both lists.

    The intersection size comes from inclusion-exclusion:
    ``|A| + |B| - |A | B|``. Returns 0.0 when both lists are empty.
    """
    set_1 = get_unique(words_1)
    set_2 = get_unique(words_2)
    union = get_unique(set_1 + set_2)
    if not union:
        return 0.0
    return (len(set_1) + len(set_2) - len(union)) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score. The sort is stable, so equal scores keep
    ascending tweet-id order. Each tweet is scored once.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            top_n.append([tweet_id, score])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    The text budget per line is ``print_width - 2``, the remaining two
    columns being the indent; each word is followed by one space.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    line = words[0] + ' '
    for word in words[1:]:
        if len(line) + len(word) <= print_width - 2:
            line += word + ' '
        else:
            print('  ' + line)
            line = word + ' '
    print('  ' + line)
#--------------------------------------------
# 6330318021 (20.00) 167 (2021-03-01 09:32)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first.

    The input list is only read (the earlier version removed every item).
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return similar / (len(only in words_1) + len(words_2)).

    ``similar`` counts items of ``words_1`` found in ``words_2``. Neither
    argument is modified (the earlier version deleted the shared items from
    ``words_1``). Returns 0.0 when both lists are empty.
    """
    similar = [word for word in words_1 if word in words_2]
    only_first = [word for word in words_1 if word not in words_2]
    total = len(only_first) + len(words_2)
    if total == 0:
        return 0.0
    return len(similar) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a non-zero score.

    Ordered by descending score, ties by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score != 0:
            scored.append([tweet_id, score])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Every word is written with a leading space after a one-space seed, so
    each content line starts with two spaces.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    pieces = [' ']
    used = 1  # columns consumed on the current line
    for word in tweet_content.split(' '):
        chunk = ' ' + word
        used += len(chunk)
        if used > print_width:
            pieces.append('\n ')
            used = len(chunk) + 1
        pieces.append(chunk)
    print(''.join(pieces))
#--------------------------------------------
# 6330319721 (18.33) 168 (2021-03-01 11:50)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return shared / (shared + unshared) for the two word lists.

    ``shared`` counts items of ``words_1`` found in ``words_2``; ``unshared``
    counts the items of either list missing from the other.
    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    unshared = (len(words_1) - shared
                + sum(1 for word in words_2 if word not in words_1))
    if shared + unshared == 0:
        return 0.0
    return shared / (shared + unshared)
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score. The sort is stable, so equal scores keep
    ascending tweet-id order.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            top_n.append([tweet_id, score])
    top_n.sort(reverse=True, key=lambda pair: pair[1])
    return top_n[:n]
#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Content lines are indented by two spaces. A word that cannot fit is
    placed alone on its line; the earlier loop never advanced past it.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        # Flush only when the line already holds something; an empty line
        # always accepts the next word, which guarantees progress.
        if line and len(line) + len(word) >= print_width - 1:
            print(' ' + line)
            line = ''
        line += ' ' + word
    print(' ' + line)
#--------------------------------------------
# 6330320221 (16.67) 169 (2021-02-27 16:48)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score. The sort is stable, so equal scores keep
    ascending tweet-id order. Each tweet is scored once.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            top_n.append([tweet_id, score])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Text gets ``print_width - 2`` columns per line after a two-space indent.
    A word wider than that is placed alone on its line; previously it made
    the wrapping loop spin forever.
    """
    budget = print_width - 2
    lines = []
    current = []
    for word in tweet_content.split(' '):
        if current and len(' '.join(current + [word])) > budget:
            lines.append(' '.join(current))
            current = [word]
        else:
            current.append(word)
    lines.append(' '.join(current))
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    print('  ' + '\n  '.join(lines))
#--------------------------------------------
# 6330321921 (16.94) 170 (2021-03-01 16:52)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    Works on a sorted copy; the earlier version sorted the caller's list
    and left an ``'@'`` sentinel appended to it.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts items of ``words_1`` found in ``words_2``.
    Returns 0.0 when the denominator is zero (both lists empty).
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, best match first.

    Ties are ordered by ascending tweet id; zero-score tweets are kept.
    Asking for more pairs than there are tweets no longer raises IndexError.
    """
    scored = [[jaccard(tweet, norm_query), -tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    scored.sort(reverse=True)
    return [[-neg_id, score] for score, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Words are split on any whitespace. Each line starts with two spaces (a
    one-space seed plus the word separator).
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split():
        # Flush only a line that already holds a word and cannot take this one.
        if line != ' ' and len(line) + 1 + len(word) > print_width:
            print(line)
            line = ' '
        line += ' ' + word
    print(line)
#--------------------------------------------
# 6330322521 (15.00) 171 (2021-03-01 16:15)
def get_unique(words):
    """Return the items of ``words`` without duplicates.

    A repeated item is moved to the end, so the result is ordered by each
    item's last occurrence.
    """
    unique_words = []
    for word in words:
        if word in unique_words:
            unique_words.remove(word)
        unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (shared + unshared) for the two word lists.

    ``shared`` counts items of ``words_2`` found in ``words_1``; ``unshared``
    counts the items of either list missing from the other.
    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    shared = sum(1 for word in words_2 if word in words_1)
    unshared = (len(words_2) - shared
                + sum(1 for word in words_1 if word not in words_2))
    if shared + unshared == 0:
        return 0.0
    return shared / (shared + unshared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a non-zero score.

    Ordered by descending score, ties by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score != 0:
            # Negated score: an ascending sort puts the best match first.
            scored.append([-score, tweet_id])
    scored.sort()
    return [[tweet_id, -neg_score] for neg_score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Content lines are indented by two spaces. The earlier body referred to
    undefined names (``o`` and ``s``) and raised NameError on the first word;
    it also could not advance past a word wider than the line.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        # An empty line always accepts the next word, guaranteeing progress.
        if line and len(line) + len(word) >= print_width - 1:
            print(' ' + line)
            line = ''
        line += ' ' + word
    print(' ' + line)
#--------------------------------------------
# 6330323121 (17.00) 172 (2021-03-01 23:24)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    Works on a sorted copy, so the caller's list is not reordered.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0.0 when both lists are empty (previously an IndexError).
    """
    shared = sum(1 for word in words_1 if word in words_2)
    distinct = get_unique(words_1 + words_2)
    if not distinct:
        return 0.0
    return shared / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, best match first.

    Ties are ordered by ascending tweet id; zero-score tweets are kept.
    Asking for more pairs than there are tweets no longer raises IndexError.
    """
    scored = [[jaccard(tweet, norm_query), -tweet_id]
              for tweet_id, tweet in enumerate(norm_tweets)]
    scored.sort(reverse=True)
    return [[-neg_id, score] for score, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jaccard_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header and the wrapped content.

    Every word is written followed by one space; a new line is started when
    the word itself would no longer fit within ``print_width``.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jaccard_coef, 2)) + ')')
    # NOTE(review): two-space indent restored; the collapsed source lost
    # repeated spaces inside string literals - confirm against the original.
    line = '  '
    for word in tweet_content.split(' '):
        if len(line) + len(word) <= print_width:
            line += word + ' '
        else:
            print(line)
            line = '  ' + word + ' '
    print(line)
#--------------------------------------------
# 6330324821 (15.00) 173 (2021-02-28 14:54)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order."""
    unique_words = []
    for word in sorted(words):
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return same / (len(words_1) + len(words_2) - same).

    ``same`` is the number of equal (item of ``words_1``, item of
    ``words_2``) pairs. Returns 0.0 when the denominator is zero.
    """
    same = sum(words_2.count(word) for word in words_1)
    total = len(words_1) + len(words_2) - same
    if total == 0:
        return 0.0
    return same / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, best match first.

    Ties are ordered by ascending tweet id; zero-score tweets are kept.
    Asking for more pairs than there are tweets no longer raises IndexError.
    """
    scored = sorted([-jaccard(tweet, norm_query), tweet_id]
                    for tweet_id, tweet in enumerate(norm_tweets))
    return [[tweet_id, -neg_score] for neg_score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``' '`` line, a ``#id (score)`` header and the wrapped content.

    Each word is written followed by one space; the two-space indent is
    counted in the width check. A word wider than the line is placed alone
    on its line; previously it made the outer loop spin forever.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        # An empty line always accepts the next word, guaranteeing progress.
        if line and len(word) + 2 + len(line) > print_width:
            print('  ' + line)
            line = ''
        line += word + ' '
    print('  ' + line)
#--------------------------------------------
# 6330325421 (20.00) 174 (2021-03-01 00:12)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    distinct = get_unique(words_1 + words_2)
    if not distinct:
        return 0.0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a non-zero score.

    Ordered by descending score, ties by ascending tweet id. Each tweet is
    scored once (it used to be scored twice).
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score != 0:
            # Negated score: an ascending sort puts the best match first.
            scored.append([-score, tweet_id])
    scored.sort()
    return [[tweet_id, -neg_score] for neg_score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``#id (score)`` header followed by the wrapped content.

    No blank line is printed before the header. The text budget per line is
    ``print_width - 2`` (the other two columns are the indent); the last
    line is printed with surrounding whitespace stripped.
    """
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    text = ''
    for word in tweet_content.split(' '):
        if len(text) + len(word) <= print_width - 2:
            text += word + ' '
        else:
            print('  ' + text)
            text = word + ' '
    print('  ' + text.strip())
#--------------------------------------------
# 6330326021 (20.00) 175 (2021-02-26 12:23)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    number_of_words = len(get_unique(words_1 + words_2))
    if number_of_words == 0:
        return 0.0
    number_of_intersect = sum(1 for word in words_1 if word in words_2)
    return number_of_intersect / number_of_words


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a non-zero score.

    Ordered by descending score, ties by ascending tweet id. One keyed sort
    replaces the two-pass search, which also raised IndexError when ``n``
    exceeded the number of tweets.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(norm_query, tweet)
        if score != 0:
            scored.append([tweet_id, score])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    The whole output is assembled first and printed with surrounding
    whitespace stripped; every word is followed by one space.
    """
    result = "#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")" + "\n"
    # NOTE(review): two-space indent restored; the collapsed source lost
    # repeated spaces inside string literals - confirm against the original.
    line = "  "
    for word in tweet_content.split(" "):
        if len(line) + len(word) > print_width:
            result += line + "\n"
            line = "  "
        line += word + " "
    result += line
    print("")
    print(result.strip())
#--------------------------------------------
# 6330327721 (14.75) 176 (2021-02-27 15:54)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    Works on a sorted copy: the caller's list is neither reordered nor given
    a ``'.'`` sentinel, and empty input yields ``[]`` rather than ``['.']``.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0 when both lists are empty.
    """
    shared = 0.0
    for word in words_1:
        if word in words_2:
            shared += 1
    distinct = len(get_unique(words_1 + words_2))
    if distinct > 0:
        return shared / distinct
    return 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id. The result is
    capped by the number of matches; the earlier version capped ``n`` by the
    number of NON-matching tweets.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            # Negated id: the descending sort then prefers the smaller id.
            scored.append([score, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, score] for score, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Lines start with two spaces (matching the initial column count of 2 in
    the earlier version) and every word is followed by one space.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    line = indent
    for word in tweet_content.split(' '):
        # Flush only a line that already holds a word and cannot take this one.
        if line != indent and len(line) + len(word) > print_width:
            print(line)
            line = indent
        line += word + ' '
    print(line)
#--------------------------------------------
# 6330328321 (16.94) 177 (2021-02-27 00:48)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = list()
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (repeated items) / (distinct items) of the two lists combined.

    The numerator is ``len(words_1 + words_2)`` minus the distinct count,
    which is the overlap size when each list has no internal duplicates.
    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    words = list(words_1) + list(words_2)
    unique_words = get_unique(words)
    if not unique_words:
        return 0.0
    return (len(words) - len(unique_words)) / len(unique_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id.
    NOTE(review): the earlier version returned the pairs in ascending score
    order and its tie handling compared a tweet id with a coefficient; the
    descending order used here should be confirmed against the caller.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([tweet_id, score])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Words are split on any whitespace and each is followed by one space.
    """
    print(f'\n#{tweet_id} ({round(jc_coef,2)})')
    # NOTE(review): two-space indent restored; the collapsed source lost
    # repeated spaces inside string literals - confirm against the original.
    line = '  '
    for word in tweet_content.split():
        if len(line) + len(word) <= print_width:
            line += word + ' '
        else:
            print(line)
            line = '  ' + word + ' '
    print(line)
#--------------------------------------------
# 6330329021 (20.00) 178 (2021-02-28 02:02)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    Works on a sorted copy, so the caller's list is not reordered.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return same / (len(words_1) + len(words_2) - same).

    ``same`` counts items of ``words_1`` found in ``words_2``.
    Returns 0.0 when the denominator is zero (both lists empty).
    """
    same_words = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - same_words
    if total == 0:
        return 0.0
    return same_words / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a non-zero score.

    Ordered by descending score, ties by ascending tweet id.
    """
    scored = sorted([-jaccard(tweet, norm_query), tweet_id]
                    for tweet_id, tweet in enumerate(norm_tweets))
    top_n = []
    for neg_score, tweet_id in scored[:n]:
        if neg_score == 0:
            break  # everything after this point scores zero as well
        top_n.append([tweet_id, -neg_score])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    The first line is seeded with one space and every word adds a leading
    space, so content starts in the third column; continuation lines use
    the same two-space indent.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    lines = [' ']
    for word in tweet_content.split(' '):
        if len(lines[-1]) + len(word) + 1 > print_width:
            lines.append('  ' + word)
        else:
            lines[-1] += ' ' + word
    for line in lines:
        print(line)
#--------------------------------------------
# 6330330521 (19.19) 179 (2021-02-28 21:07)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (number of equal cross-list pairs) / (distinct items).

    The pair count is symmetric, so the two length-dependent branches of the
    earlier version collapse into one. Returns 0.0 when both lists are empty.
    """
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    count = sum(1 for first in words_1 for second in words_2
                if first == second)
    return count / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id. Ties are now
    ordered before truncating; a single adjacent-swap pass after truncation
    could keep the wrong ids and leave runs of three or more unsorted.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            scored.append([tweet_id, score])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``#id (score)`` header followed by the wrapped content.

    No blank line is printed before the header. Words are split on any
    whitespace and each is followed by one space.
    """
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    # NOTE(review): two-space indent restored; the collapsed source lost
    # repeated spaces inside string literals - confirm against the original.
    content = "  "
    for word in tweet_content.split():
        if len(content) + len(word) <= print_width:
            content += word + " "
        else:
            print(content)
            content = "  " + word + " "
    print(content)
#--------------------------------------------
# 6330331121 (20.00) 180 (2021-03-01 15:21)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return same / (len(words_1) + len(words_2) - same).

    ``same`` counts items of ``words_1`` found in ``words_2``.
    Returns 0.0 when the denominator is zero (both lists empty).
    """
    same = sum(1 for word in words_1 if word in words_2)
    summ = len(words_1) + len(words_2) - same
    if summ == 0:
        return 0.0
    return same / summ


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score, ties by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            # Negated score: an ascending sort puts the best match first.
            scored.append([-score, tweet_id])
    scored.sort()
    return [[tweet_id, -neg_score] for neg_score, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Each word is stored with a leading space and each line is printed after
    one more space, giving a two-space indent within ``print_width``.
    """
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = ""
    used = 1  # length of the pending line; starts at 1 before any word
    for word in tweet_content.split(" "):
        if len(word) + used > print_width - 2:
            print(" " + line)
            line = ""
        line += " " + word
        used = len(line)
    if line:
        print(" " + line)
#--------------------------------------------
# 6330332821 (20.00) 181 (2021-02-28 23:28)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct items).

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the non-zero entries among the ``n`` best-scoring tweets.

    Entries are ``[tweet_id, jaccard]``, ordered by descending score with
    ties broken by ascending tweet id.
    """
    scored = sorted([-jaccard(tweet, norm_query), tweet_id]
                    for tweet_id, tweet in enumerate(norm_tweets))
    return [[tweet_id, -neg_score]
            for neg_score, tweet_id in scored[:n] if neg_score != 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Each word is stored with a leading space and each line is printed after
    one more space, giving a two-space indent within ``print_width``.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        if len(line) + len(word) + 1 < print_width:
            line += ' ' + word
        else:
            print(' ' + line)
            line = ' ' + word
    print(' ' + line)
#--------------------------------------------
# 6330333421 (20.00) 182 (2021-02-28 21:11)
def get_unique(words):
    """Return the items of ``words`` without duplicates, first occurrence first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (distinct items in both lists) / (distinct items in either).

    Returns 0 when both lists are empty.
    """
    union_words = get_unique(words_1 + words_2)
    if len(union_words) == 0:
        return 0
    intersect_words = [word for word in union_words
                       if word in words_1 and word in words_2]
    return len(intersect_words) / len(union_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with a positive score.

    Ordered by descending score. ``sorted`` is stable, so equal scores keep
    ascending tweet-id order.
    """
    jaccard_tweets = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            jaccard_tweets.append([tweet_id, coef])
    ranked = sorted(jaccard_tweets, key=lambda pair: pair[1], reverse=True)
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (score)`` header and the wrapped content.

    Text gets ``print_width - 2`` columns per line; joining a single space
    in front of the words with ``' '`` yields the two-space indent.
    """
    print('\n#{} ({})'.format(tweet_id, round(jc_coef, 2)))
    line_characters = print_width - 2
    display = []
    for word in tweet_content.split(' '):
        if len(' '.join(display + [word])) <= line_characters:
            display.append(word)
        else:
            print(' '.join([' '] + display))
            display = [word]
    if display:
        print(' '.join([' '] + display))
#--------------------------------------------
# 6330334021 (16.25) 183 (2021-02-28 15:04)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed first so the inclusion-exclusion count below is
    a true union size; two empty lists score 0.0 instead of dividing by zero.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = sum(1 for word in first if word in second)
    total = len(first) + len(second) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Every tweet is scored (the previous loop bound skipped the last one),
    tweets with coefficient 0 are dropped, ties go to the smaller tweet id,
    and asking for more results than there are matches no longer raises
    IndexError.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its text wrapped to ``print_width``.

    Every word is followed by one space.  The indent is two spaces, matching
    the other implementations of this function in the file.
    NOTE(review): the flattened source shows a one-space literal, presumably
    from whitespace collapsing - confirm.
    """
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ')')
    indent = '  '
    line = indent
    for word in tweet_content.split(' '):
        # Never flush an empty line: an over-long word simply stands alone.
        if line != indent and len(line) + len(word) > print_width:
            print(line)
            line = indent
        line += word + ' '
    print(line)
#--------------------------------------------
# 6330335721 (20.00) 184 (2021-02-27 14:56)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists (0 if both empty).

    The union is built directly from the two inputs rather than by appending
    all of ``words_2`` once per word of ``words_1``.
    """
    all_words = get_unique(list(words_1) + list(words_2))
    if not all_words:
        return 0
    same_words = [w for w in all_words if w in words_1 and w in words_2]
    return len(same_words) / len(all_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Sorted by descending coefficient; equal scores keep ascending id order.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and wrapped text, followed by a blank line.

    Every word is followed by one space.  An over-long first word no longer
    causes an empty line to be printed before it.  The indent is two spaces,
    matching the other implementations of this function in the file.
    NOTE(review): the flattened source shows a one-space literal, presumably
    from whitespace collapsing - confirm.
    """
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    indent = "  "
    text = indent
    for val in tweet_content.split(" "):
        if text != indent and len(text + val) > print_width:
            print(text)
            text = indent
        text += val + " "
    print(text, end="\n\n")
#--------------------------------------------
# 6330336321 (18.01) 185 (2021-02-28 21:46)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is no longer reordered.
    """
    unique_words = []
    for word in sorted(words):
        # Equal items are adjacent after sorting.
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed before counting and two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    same = sum(1 for word in first if word in second)
    union = len(first) + len(second) - same
    if union == 0:
        return 0.0
    return same / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Ties go to the smaller tweet id.  Tweets sharing no word with the query
    are left out, matching the other implementations in this file.
    """
    scored = [
        [tweet_id, jaccard(tweet, norm_query)]
        for tweet_id, tweet in enumerate(norm_tweets)
    ]
    matches = [pair for pair in scored if pair[1] > 0]
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces on every line (the continuation-line
    literal in the flattened source was one space while its counter assumed
    two), carry no trailing space, and an over-long word gets its own line.
    """
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = "  " + word if line is None else line + " " + word
    print(line)
#--------------------------------------------
# 6330337021 (20.00) 186 (2021-03-01 02:16)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The intersection size is derived by inclusion-exclusion, so both inputs
    are de-duplicated first; two empty lists score 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    down = len(get_unique(first + second))
    if down == 0:
        return 0.0
    top = len(first) + len(second) - down
    return top / down


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -neg] for neg, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Every line uses the same two-space indent as the first line, and an
    over-long word is printed alone without an empty line before it.
    """
    print("\n" + "#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    indent = "  "
    lines = []
    current = None
    for word in tweet_content.split(" "):
        if current is None:
            current = indent + word
        elif len(current) + 1 + len(word) <= print_width:
            current += " " + word
        else:
            lines.append(current)
            current = indent + word
    lines.append(current)
    print("\n".join(lines))
#--------------------------------------------
# 6330338621 (19.20) 187 (2021-03-01 22:51)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed before counting; two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    in_both = sum(1 for word in first if word in second)
    in_either = len(first) + len(second) - in_both
    if in_either == 0:
        return 0.0
    return in_both / in_either


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Sorted by descending coefficient; equal scores keep ascending id order.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces.  The width check uses the real length
    of the pending line, so continuation lines can no longer run one column
    over, and an over-long word is printed alone with no empty line first.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330339221 (18.01) 188 (2021-02-26 20:14)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed before counting; two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = 0
    union_size = len(second)
    for word in first:
        if word in second:
            shared += 1
        else:
            union_size += 1
    if union_size == 0:
        return 0.0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Ties go to the smaller tweet id.  Tweets sharing no word with the query
    are left out, matching the other implementations in this file.
    """
    ranked = sorted(
        [-jaccard(tweet, norm_query), tweet_id]
        for tweet_id, tweet in enumerate(norm_tweets)
    )
    return [[tweet_id, -neg] for neg, tweet_id in ranked if neg != 0][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces and hold at most ``print_width - 2``
    characters of text; an over-long word is printed on its own line.  The
    line is built incrementally (no repeated join/split) and no stray
    indent-only line is printed after an over-long final word.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330340821 (20.00) 189 (2021-02-28 00:41)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed before counting; two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = sum(1 for word in first if word in second)
    union_size = len(first) + sum(1 for word in second if word not in first)
    if union_size == 0:
        return 0.0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient != 0.

    Ordered by descending coefficient, then ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            ranked.append([coef, -tweet_id])
    ranked.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the text wrapped to ``print_width`` columns.

    Every word is followed by one space.  The indent is two spaces, which is
    what the original width counter assumed, and it is included in the width
    check for the first word of a line too.  An over-long word is printed
    alone without an empty line before it.
    """
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    indent = "  "
    line = indent
    for word in tweet_content.split(" "):
        if line != indent and len(line) + len(word) > print_width:
            print(line)
            line = indent
        line += word + " "
    print(line)
#--------------------------------------------
# 6330341421 (17.00) 190 (2021-02-28 16:07)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists as a float.

    Duplicates are collapsed before counting; two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = sum(1 for word in first if word in second)
    total = len(first) + len(second) - shared
    if total == 0:
        return 0.0
    return float(shared / total)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    A single keyed sort (descending coefficient, ascending tweet id) replaces
    the repeated bubble passes.  Asking for more results than there are
    tweets no longer raises IndexError, and tweets sharing no word with the
    query are left out, matching the other implementations in this file.
    """
    many = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            many.append([tweet_id, coef])
    many.sort(key=lambda pair: (-pair[1], pair[0]))
    return many[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces and carry no trailing space.  Text that
    ends exactly at the width no longer produces a trailing indent-only line,
    and an over-long word is printed alone without an empty line before it.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330342021 (20.00) 191 (2021-03-01 02:54)
def get_unique(words):
    """Return the distinct words ordered by (length, word).

    Works on a sorted copy; the caller's list is no longer overwritten.
    """
    unique_words = []
    for word in sorted(words, key=lambda w: (len(w), w)):
        # Equal words are adjacent after sorting.
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are counted once; two empty lists score 0.0 instead of
    raising IndexError.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = sum(1 for word in union if word in words_1 and word in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the non-zero entries among the ``n`` best ``[tweet_id, coefficient]``.

    Ordered by descending coefficient, then ascending tweet id.  One keyed
    sort orders every tie group, including the last one, which the previous
    group-by-group re-sort left in descending id order.
    """
    scored = [
        [tweet_id, jaccard(tweet, norm_query)]
        for tweet_id, tweet in enumerate(norm_tweets)
    ]
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return [pair for pair in scored[:n] if pair[1] != 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Every word is followed by one space.  The indent is two spaces, which is
    what the original width counter (starting at 2) assumed.  An over-long
    first word no longer leaves an indent-only first line.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    lines = []
    line = indent
    for word in tweet_content.split(' '):
        if line != indent and len(line) + len(word) > print_width:
            lines.append(line)
            line = indent
        line += str(word) + ' '
    lines.append(line)
    print('\n'.join(lines))
#--------------------------------------------
# 6330343721 (17.00) 192 (2021-03-01 23:43)
def get_unique(words):
    """Return the non-repeated items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The intersection size is derived by inclusion-exclusion, so both inputs
    are de-duplicated first; two empty lists score 0.0 instead of raising
    ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    union_size = len(get_unique(first + second))
    if union_size == 0:
        return 0.0
    return (len(first) + len(second) - union_size) / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    A single keyed sort (descending coefficient, ascending tweet id) replaces
    the swap / sort / swap / bubble sequence.  Asking for more results than
    there are tweets no longer raises IndexError, and tweets sharing no word
    with the query are left out, matching the other implementations in this
    file.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the text wrapped to ``print_width`` columns.

    Every line uses the same two-space indent as the first line, and an
    over-long word is printed alone without an empty line before it.
    """
    print(" ")  # separator line before the header
    print('#' + str(tweet_id) + ' ' + "(" + str(round(jc_coef, 2)) + ")")
    tweet = None
    for word in tweet_content.split(" "):
        if tweet is not None and len(tweet) + 1 + len(word) > int(print_width):
            print(tweet)
            tweet = None
        tweet = "  " + word if tweet is None else tweet + " " + word
    print(tweet)
#--------------------------------------------
# 6330344321 (20.00) 193 (2021-03-01 22:48)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Two empty lists score 0.0 instead of raising ZeroDivisionError.
    """
    check_words = get_unique(words_1 + words_2)
    if not check_words:
        return 0.0
    in_both = sum(1 for w in check_words if w in words_1 and w in words_2)
    return in_both / len(check_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.  Each
    coefficient is computed once.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -neg] for neg, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces; a word longer than the width is printed
    on a line of its own.
    """
    print("\n" + "#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330345021 (20.00) 194 (2021-02-27 20:46)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is no longer reordered.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists (0 if both empty)."""
    unique = get_unique(words_1 + words_2)
    if not unique:
        return 0
    same_word = [w for w in unique if w in words_1 and w in words_2]
    return len(same_word) / len(unique)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient != 0.

    Ordered by descending coefficient, then ascending tweet id.
    """
    prime = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            prime.append([-coef, tweet_id])
    prime.sort()
    return [[tweet_id, -neg] for neg, tweet_id in prime[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces (what the width counter, starting at 2,
    assumed).  Wrapping is decided by position only: a wrapped word that
    repeats an earlier word on the line is no longer dropped, and a word that
    merely equals the final word no longer triggers an early flush.
    """
    print("\n" "#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = "  " + word if line is None else line + " " + word
    print(line)
#--------------------------------------------
# 6330346621 (17.97) 195 (2021-03-01 23:10)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Each shared word is counted once (the pairwise loop counted every
    matching pair), and two empty lists score 0.0 instead of raising
    ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = sum(1 for word in first if word in second)
    total = len(first) + len(second) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            ranked.append([-jac, tweet_id])
    ranked.sort()
    return [[tweet_id, -neg] for neg, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines may now use the full width (the previous strict ``<`` test wasted
    one column).  The indent is two spaces, matching the other
    implementations of this function in the file.
    NOTE(review): the flattened source shows a one-space literal, presumably
    from whitespace collapsing - confirm.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330347221 (19.20) 196 (2021-03-01 14:38)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed before counting; two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = sum(1 for word in first if word in second)
    union_size = len(first) + sum(1 for word in second if word not in first)
    if union_size == 0:
        return 0.0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.  Each
    coefficient is computed once instead of up to three times.
    """
    all_top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            all_top_n.append([coef, -tweet_id])
    all_top_n.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in all_top_n[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the text wrapped to ``print_width`` columns.

    Lines are indented by two spaces and carry no trailing space.  The width
    check uses the real length of the pending line, so continuation lines can
    no longer run one column over, and an over-long first word is not
    preceded by an indent-only line.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(float(jc_coef), 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330348921 (18.37) 197 (2021-02-28 14:56)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Shared words are counted once each; two empty lists score 0.0 instead of
    raising ZeroDivisionError.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = sum(1 for word in union if word in words_1 and word in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.  The ordering
    uses the exact coefficient and id; the previous key was a number built by
    string-concatenating the coefficient truncated to two decimals with
    ``5000 - tweet_id``, which mis-ordered close scores and ids whose
    complement has a different number of digits.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces (what the width counter assumed).  The
    leading blank line matches the other implementations of this function in
    the file.  An over-long first word no longer raises UnboundLocalError.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round((jc_coef), 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330349521 (17.00) 198 (2021-03-01 19:56)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The input list is left untouched; the previous version removed every
    element from the caller's list while de-duplicating.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed before counting; two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    num = sum(1 for word in first if word in second)
    den = len(first) + len(second) - num
    if den == 0:
        return 0.0
    return num / den


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Ties go to the smaller tweet id.  Asking for more results than there are
    tweets no longer raises IndexError, and tweets sharing no word with the
    query are left out, matching the other implementations in this file.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([coef, -tweet_id])
    ranked.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the text wrapped to ``print_width`` columns.

    Lines are indented by two spaces and every word is followed by one
    space.  An over-long first word is not preceded by an indent-only line.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    line = indent
    for word in tweet_content.split(' '):
        fits = len(line) < print_width and len(line) + len(word) <= print_width
        if line != indent and not fits:
            print(line)
            line = indent
        line += word + ' '
    print(line)
#--------------------------------------------
# 6330350021 (18.44) 199 (2021-03-01 02:45)
def get_unique(words):
    """Return the distinct words of ``words``.

    The result keeps the order the original produced: words sorted by
    (length, word) with the smallest one moved to the end.
    """
    ordered = []
    for word in sorted(words, key=lambda w: (len(w), w)):
        if not ordered or ordered[-1] != word:
            ordered.append(word)
    return ordered[1:] + ordered[:1]


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists (0 if both empty).

    Computed directly from the de-duplicated union instead of three sorted
    helper lists; repeated words are counted once.
    """
    union = get_unique(list(words_1) + list(words_2))
    if not union:
        return 0
    shared = sum(1 for word in union if word in words_1 and word in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.  Requesting
    more results than there are matching tweets no longer raises IndexError,
    and each coefficient is computed once.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -neg] for neg, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Words are split on any whitespace and each is followed by one space.  The
    indent is two spaces (what the width counter, starting at 2, assumed).
    An over-long first word is not preceded by an indent-only line.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    line = indent
    for word in tweet_content.split():
        if line != indent and len(line) + len(word) > print_width:
            print(line)
            line = indent
        line += word + ' '
    print(line)
#--------------------------------------------
# 6330351721 (20.00) 200 (2021-02-26 23:01)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is no longer reordered.
    """
    unique_words = []
    for word in sorted(words):
        # Equal items are adjacent after sorting.
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Neither argument is sorted in place any more, the three copies of the
    run-length de-duplication are replaced by ``get_unique``, and two empty
    lists score 0.0 instead of raising ZeroDivisionError.
    """
    unique_1 = get_unique(words_1)
    unique_2 = get_unique(words_2)
    union_size = len(get_unique(unique_1 + unique_2))
    if union_size == 0:
        return 0.0
    shared = sum(1 for word in unique_2 if word in unique_1)
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient != 0.

    Ordered by descending coefficient, then ascending tweet id.  Each
    coefficient is computed once.
    """
    new = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            new.append([coef, -tweet_id])
    new.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in new[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the text wrapped to ``print_width`` columns.

    Every line uses the same two-space indent as the first line, and an
    over-long first word is not preceded by an indent-only line.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330352321 (17.00) 201 (2021-03-01 23:45)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is no longer reordered.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Neither argument is sorted in place any more, repeated words are counted
    once, and two empty lists score 0.0 instead of raising IndexError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    union_size = len(get_unique(first + second))
    if union_size == 0:
        return 0.0
    rpt_word = sum(1 for word in first if word in second)
    return rpt_word / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Ties go to the smaller tweet id.  Asking for more results than there are
    tweets no longer raises IndexError, and tweets sharing no word with the
    query are left out, matching the other implementations in this file.
    """
    mtn = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            mtn.append([jac, -tweet_id])
    mtn.sort(reverse=True)
    return [[-neg_id, jac] for jac, neg_id in mtn[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the text wrapped to ``print_width`` columns.

    Every word is followed by one space, and an over-long first word is not
    preceded by an indent-only line.  The indent is two spaces, matching the
    other implementations of this function in the file.
    NOTE(review): the flattened source shows a one-space literal, presumably
    from whitespace collapsing - confirm.
    """
    print(" ")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    indent = "  "
    line = indent
    for word in tweet_content.split(" "):
        if line != indent and len(line) + len(word) > print_width:
            print(line)
            line = indent
        line += word + " "
    print(line)
#--------------------------------------------
# 6330353021 (15.40) 202 (2021-03-01 22:45)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order.

    A new list is returned and the input is left untouched.  The previous
    version overwrote duplicates in the caller's list with a sentinel string
    and then removed every occurrence of that string, so a genuine word equal
    to the sentinel was lost.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Duplicates are collapsed before counting; two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    top = sum(1 for word in first if word in second)
    down = len(first) + len(second) - top
    if down == 0:
        return 0.0
    return top / down


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    One keyed sort (descending coefficient, ascending tweet id) replaces the
    two quadratic exchange passes.  Tweets sharing no word with the query
    are left out, matching the other implementations in this file.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the text wrapped to ``print_width`` columns.

    Wrapping is done word by word instead of by slicing the raw string, so a
    word longer than the line is printed on its own line rather than raising
    IndexError.  Lines are indented by two spaces, which is what the original
    ``print_width - 2`` text budget assumed.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    width = int(print_width)
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330354621 (18.33) 203 (2021-03-01 00:49)
def get_unique(words):
    """Return the distinct items of ``words`` ordered by last occurrence."""
    unique_words = []
    # Walking backwards and keeping first sightings yields last-occurrence
    # order once the result is reversed again.
    for word in reversed(words):
        if word not in unique_words:
            unique_words.append(word)
    unique_words.reverse()
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Shared words are counted once each; two empty lists score 0.0 instead of
    raising ZeroDivisionError.
    """
    union = get_unique(list(words_1) + list(words_2))
    if not union:
        return 0.0
    shared = sum(1 for word in union if word in words_1 and word in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Sorted by descending coefficient; equal scores keep ascending id order.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(reverse=True, key=lambda pair: pair[1])
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces.  A word too long for an empty line is
    printed on a line of its own; the previous loop never advanced past such
    a word and printed empty lines forever.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is not None and len(line) + 1 + len(word) > print_width:
            print(line)
            line = None
        line = '  ' + word if line is None else line + ' ' + word
    print(line)
#--------------------------------------------
# 6330355221 (18.01) 204 (2021-03-01 01:53)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#--------------------------------------------


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Shared words are counted once each; two empty lists score 0.0 instead of
    raising ZeroDivisionError.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = sum(int(word in words_1 and word in words_2) for word in union)
    return shared / len(union)
#--------------------------------------------


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Ties go to the smaller tweet id.  Tweets sharing no word with the query
    are left out, matching the other implementations in this file.
    """
    ranked = sorted(
        [-jaccard(tweet, norm_query), tweet_id]
        for tweet_id, tweet in enumerate(norm_tweets)
    )
    return [[tweet_id, -neg] for neg, tweet_id in ranked if neg != 0][:n]
#--------------------------------------------


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    A word longer than the width gets a line of its own.  The indent is two
    spaces, matching the other implementations of this function in the file.
    NOTE(review): the flattened source shows a one-space literal, presumably
    from whitespace collapsing - confirm.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    words = tweet_content.split(' ')
    line = indent + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = indent + word
    print(line)
#--------------------------------------------
# 6330356921 (18.01) 205 (2021-02-28 23:16)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    A word counts as shared only if it occurs in both lists (the previous
    adjacent-duplicate scan also counted a word repeated inside one list).
    Two empty lists score 0.0 instead of raising ZeroDivisionError.
    """
    all_words = get_unique(words_1 + words_2)
    if not all_words:
        return 0.0
    same_words = [w for w in all_words if w in words_1 and w in words_2]
    return len(same_words) / len(all_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Ties go to the smaller tweet id.  Tweets sharing no word with the query
    are left out, matching the other implementations in this file.
    """
    ranked = sorted(
        [-jaccard(tweet, norm_query), tweet_id]
        for tweet_id, tweet in enumerate(norm_tweets)
    )
    return [[tweet_id, -neg] for neg, tweet_id in ranked if neg != 0][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    Lines are indented by two spaces (what the width counter, starting at 2,
    assumed) and every word is followed by one space.  Empty tokens from
    consecutive spaces only add a space and never cause a wrap.  An over-long
    first word is now included in the width count of its line.
    """
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    indent = "  "
    line = indent
    for word in tweet_content.split(" "):
        if word and line != indent and len(line) + len(word) > print_width:
            print(line)
            line = indent
        line += word + " "
    print(line)
#--------------------------------------------
# 6330357521 (18.50) 206 (2021-03-01 22:45)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is no longer reordered.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Shared words are counted once each; two empty lists score 0.0 instead of
    raising ZeroDivisionError.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = sum(1 for word in union if word in words_1 and word in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs with coefficient > 0.

    Ordered by descending coefficient, then ascending tweet id.  All tweets
    are scored and sorted once, so ``n`` may exceed the number of tweets (or
    be 0) without raising IndexError.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the text wrapped to ``print_width``.

    A word longer than the width gets a line of its own.  The indent is two
    spaces, matching the other implementations of this function in the file.
    NOTE(review): the flattened source shows a one-space literal, presumably
    from whitespace collapsing - confirm.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    words = tweet_content.split(' ')
    ans = indent + words[0]
    for word in words[1:]:
        if len(ans) + len(' ' + word) <= print_width:
            ans += ' ' + word
        else:
            print(ans)
            ans = indent + word
    print(ans)
#--------------------------------------------
# 6330358121 (18.50) 207 (2021-02-28 23:29)
def get_unique(words):
    """Return the distinct items of ``words``, keeping first occurrences."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Each distinct word counts once, so the result always lies in [0, 1];
    two empty lists give 0.0 instead of dividing by zero.
    """
    total = len(get_unique(words_1 + words_2))
    if total == 0:
        return 0.0
    matches = sum(int(e in words_2) for e in get_unique(words_1))
    return matches / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Zero-similarity tweets are dropped.  Pairs are ordered by descending
    coefficient; equal coefficients keep ascending tweet ids.
    """
    scored = []
    for i, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            scored.append([i, jac])
    # list.sort is stable, so ids stay ascending inside each score group.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its content wrapped to ``print_width``.

    Output: an empty line, ``#id (coef rounded to 2 places)``, then the words
    (split on single spaces) packed greedily onto two-space-indented lines.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    pending = []
    used = 2  # columns consumed on the current line, indent included
    for word in tweet_content.split(' '):
        cost = len(word) + (1 if pending else 0)
        if pending and used + cost > print_width:
            print('  ' + ' '.join(pending))
            pending = []
            used = 2
            cost = len(word)
        pending.append(word)
        used += cost
    print('  ' + ' '.join(pending))
#--------------------------------------------
# 6330360321 (15.50) 208 (2021-03-01 06:10)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for n in words:
        if n not in unique_words:
            unique_words.append(n)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient |A & B| / |A | B| of two word lists.

    Inputs are treated as sets; two empty inputs yield 0.0.
    """
    first = set(words_1)
    second = set(words_2)
    common = len(first & second)
    total = len(first) + len(second) - common
    if total == 0:
        return 0.0
    return common / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return at most ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    The returned coefficient is the exact Jaccard value; the tie-break on the
    lower tweet id is done in the sort key rather than by perturbing it.
    Tweets scoring zero are omitted.
    """
    candidates = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            candidates.append([tweet_id, coef])
    candidates.sort(key=lambda item: (-item[1], item[0]))
    return candidates[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the content wrapped at
    ``print_width`` columns with a two-space indent."""
    pieces = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    show = '  ' + pieces[0]
    for piece in pieces[1:]:
        if len(show) + 1 + len(piece) > print_width:
            print(show)
            show = '  ' + piece
        else:
            show += ' ' + piece
    print(show)
#--------------------------------------------
# 6330361021 (18.33) 209 (2021-03-01 02:38)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return (#words in both lists) / (#words in either list).

    Repeated words count once; returns 0.0 when both lists are empty.
    """
    left = get_unique(words_1)
    same = [word for word in left if word in words_2]
    # Union = every distinct left word plus the right words not already seen.
    either = left + [word for word in get_unique(words_2) if word not in left]
    if not either:
        return 0.0
    return len(same) / len(either)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs.

    Ordered by descending coefficient, then ascending tweet id; tweets with
    a coefficient of zero are left out.
    """
    negated = []
    for i, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            # Negating the score lets a plain ascending sort do both orders.
            negated.append([-score, i])
    negated.sort()
    return [[i, -neg] for neg, i in negated[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped content.

    Words are separated on single spaces.  Each line starts with two spaces
    and is filled greedily; an overlong word is printed alone on its line.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ")")
    tokens = tweet_content.split(' ')
    a = '  ' + tokens[0]
    for token in tokens[1:]:
        if len(a) + 1 + len(token) <= print_width:
            a += ' ' + token
        else:
            print(a)
            a = '  ' + token
    print(a)
#--------------------------------------------
# 6330362621 (20.00) 210 (2021-02-28 19:32)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list keeps its original order.
    """
    ordered = sorted(words)
    unique_words = []
    for i, word in enumerate(ordered):
        if i == 0 or word != ordered[i - 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists (0.0 if empty)."""
    everything = get_unique(words_1 + words_2)
    if not everything:
        return 0.0
    s = 0
    for w in get_unique(words_1):  # de-duplicated so repeats count once
        if w in words_2:
            s += 1
    return s / len(everything)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero-coefficient tweets are skipped; equal coefficients are ordered by
    ascending tweet id.
    """
    positives = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            positives.append([i, coef])
    # Two stable passes: ids ascending first, then coefficient descending.
    positives.sort(key=lambda e: e[0])
    positives.sort(key=lambda e: -e[1])
    return positives[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    print()
    a1 = '#' + str(tweet_id)
    a2 = '(' + str(round(jc_coef, 2)) + ')'
    print(a1, a2)
    words = tweet_content.split(' ')
    ans = '  ' + words[0]
    for w in words[1:]:
        if len(ans) + len(' ' + w) <= print_width:
            ans += ' ' + w
        else:
            print(ans)
            ans = '  ' + w
    print(ans)
# --------------------------------------------
# 6330365521 (14.75) 211 (2021-02-26 23:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for a in words:
        if a not in unique_words:
            unique_words.append(a)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Repeated words count once; two empty lists give 0.0.
    """
    one = get_unique(words_1)
    two = get_unique(words_2)
    al = sum(1 for word in one if word in two)
    size = len(one) + len(two) - al
    if size == 0:
        return 0.0
    return al / size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets that share no word with the query are skipped, and fewer than
    ``n`` pairs are returned when not enough tweets match.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        m = jaccard(tweet, norm_query)
        if m > 0:
            # -i makes the descending sort prefer the smaller tweet id.
            x.append([m, -i])
    x.sort(reverse=True)
    return [[-k, m] for m, k in x[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the wrapped tweet content.

    Every content line starts with two spaces and may use the full
    ``print_width`` columns.
    """
    print('\n' + '#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    o = []
    ch = 2  # width of the line being built, indent included
    for word in tweet_content.split(' '):
        extra = len(word) + (1 if o else 0)
        if o and ch + extra > print_width:
            print('  ' + ' '.join(o))
            o = []
            ch = 2
            extra = len(word)
        o.append(word)
        ch += extra
    print('  ' + ' '.join(o))
#--------------------------------------------
# 6330366121 (20.00) 212 (2021-02-28 18:01)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for b in words:
        if b not in unique_words:
            unique_words.append(b)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The union is the words found only in ``words_1`` plus all distinct words
    of ``words_2``.  Returns 0.0 when the union is empty.
    """
    second = get_unique(words_2)
    only_first = []
    in_both = []
    for s in get_unique(words_1):
        if s in second:
            in_both.append(s)
        else:
            only_first.append(s)
    union_size = len(only_first) + len(second)
    if union_size == 0:
        return 0.0
    return len(in_both) / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are broken by the lower id.
    The coefficient is computed once per tweet.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            top_n.append([score, -i])
    top_n.sort(reverse=True)
    return [[-neg_id, score] for score, neg_id in top_n[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns (two-space indent).

    Words are never split: a word wider than the line is printed whole.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    sub = '  ' + words[0]
    for word in words[1:]:
        candidate = sub + ' ' + word
        if len(candidate) <= print_width:
            sub = candidate
        else:
            print(sub)
            sub = '  ' + word
    print(sub)
#--------------------------------------------
# 6330367821 (20.00) 213 (2021-03-01 23:16)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for k in words:
        if k not in unique_words:
            unique_words.append(k)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Words repeated inside a list count once.  Two empty lists return 0.0
    rather than failing.
    """
    k = get_unique(words_1 + words_2)
    if not k:
        return 0.0
    c = 0
    for a in get_unique(words_1):
        if a in words_2:
            c += 1
    return c / len(k)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero-coefficient tweets are ignored; ties are ordered by ascending id.
    """
    top = []
    for i, tweet in enumerate(norm_tweets):
        a = jaccard(tweet, norm_query)
        if a > 0:
            top.append([a, -i])
    top = sorted(top, reverse=True)[:n]
    return [[-d, c] for c, d in top]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the content wrapped to
    ``print_width`` columns with a two-space indent."""
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    d = []
    b = 0  # characters used after the indent, including joining spaces
    for i in tweet_content.split(' '):
        need = len(i) + (1 if d else 0)
        if d and b + need > print_width - 2:
            print('  ' + ' '.join(d))
            d = []
            b = 0
            need = len(i)
        d.append(i)
        b += need
    # The last line is flushed once, after the loop, regardless of its words.
    print('  ' + ' '.join(d))
#--------------------------------------------
# 6330370621 (17.00) 214 (2021-02-25 17:47)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return len(intersection) / len(union) for the two word lists.

    Duplicated words are counted once; returns 0.0 for two empty lists.
    """
    words_i = []
    words_u = []
    for e in get_unique(words_1):
        words_u.append(e)
        if e in words_2:
            words_i.append(e)
    for e in get_unique(words_2):
        if e not in words_i:
            words_u.append(e)
    if not words_u:
        return 0.0
    return len(words_i) / len(words_u)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets without any shared word are excluded.  If fewer than ``n`` tweets
    match, only the matching ones are returned.
    """
    sim = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            sim.append((coef, -i))
    sim.sort()
    sim.reverse()  # highest coefficient first, then lowest id
    return [[-neg_id, coef] for coef, neg_id in sim[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    tc = str(tweet_content).split(' ')
    s = [tc[0]]
    twl = 2 + len(tc[0])
    for word in tc[1:]:
        if twl + 1 + len(word) <= print_width:
            s.append(word)
            twl += 1 + len(word)
        else:
            print('  ' + ' '.join(s))
            s = [word]
            twl = 2 + len(word)
    print('  ' + ' '.join(s))
#--------------------------------------------
# 6330371221 (18.77) 215 (2021-03-01 16:49)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for a in words:
        if a not in unique_words:
            unique_words.append(a)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient: intersection size over union size.

    Both lists are reduced to their distinct words first, so repeated words
    cannot inflate the result.  Two empty lists give 0.0.
    """
    distinct_1 = get_unique(words_1)
    distinct_2 = get_unique(words_2)
    intersec = len([a for a in distinct_1 if a in distinct_2])
    union = len(distinct_1) + len(distinct_2) - intersec
    if union == 0:
        return 0.0
    return intersec / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are listed; ties go to the smaller id.
    """
    scored = []
    for i, tweet in enumerate(norm_tweets):
        j = jaccard(tweet, norm_query)
        if j > 0:
            scored.append([j, -i])
    scored.sort()
    scored = scored[::-1]
    return [[-y, x] for x, y in scored][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns.

    Words are split on any whitespace.  A line may be exactly
    ``print_width`` characters long, two-space indent included.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    b = []
    k = 2
    for word in tweet_content.split():
        step = len(word) + (1 if b else 0)
        if b and k + step > print_width:
            print('  ' + ' '.join(b))
            b = []
            k = 2
            step = len(word)
        b.append(word)
        k += step
    print('  ' + ' '.join(b))
#--------------------------------------------
# 6330372921 (20.00) 216 (2021-02-27 21:54)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    either = get_unique(list(words_1) + list(words_2))
    if not either:
        return 0.0
    both = [i for i in either if i in words_1 and i in words_2]
    return len(both) / len(either)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs.

    Sorted by coefficient (high to low) and, for equal coefficients, by
    tweet id (low to high).  Zero coefficients are omitted.
    """
    top_n = []
    for index, tweet in enumerate(norm_tweets):
        jaccard_value = jaccard(tweet, norm_query)
        if jaccard_value > 0:
            top_n.append([index, jaccard_value])
    top_n.sort(key=lambda k: (k[1], -k[0]), reverse=True)
    return top_n[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the wrapped tweet content.

    Lines are indented two spaces, hold as many words as fit in
    ``print_width`` columns and carry no trailing space.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line_words = []
    line_character_count = 2
    for word in tweet_content.split(' '):
        added = len(word) + (1 if line_words else 0)
        if line_words and line_character_count + added > print_width:
            print('  ' + ' '.join(line_words))
            line_words = []
            line_character_count = 2
            added = len(word)
        line_words.append(word)
        line_character_count += added
    print('  ' + ' '.join(line_words))
#--------------------------------------------
# 6330374121 (20.00) 217 (2021-02-28 19:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient via inclusion-exclusion.

    ``|A & B| / (|A| + |B| - |A & B|)`` on the distinct words of each list;
    returns 0.0 when both lists are empty.
    """
    group_1 = set(words_1)
    group_2 = set(words_2)
    top = len(group_1 & group_2)
    bottom = len(group_1) + len(group_2) - top
    if bottom == 0:
        return 0.0
    return top / bottom


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero-coefficient tweets are dropped; ties are ordered by ascending id.
    """
    scored = []
    for i, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc > 0:
            scored.append([jc, -i])
    best = sorted(scored, reverse=True)[:n]
    return [[-y, x] for x, y in best]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    width = print_width - 2
    li = []
    trace = 0  # characters used on the current line, indent excluded
    for word in tweet_content.split(' '):
        grow = len(word) + (1 if li else 0)
        if li and trace + grow > width:
            print("  " + " ".join(li))
            li = []
            trace = 0
            grow = len(word)
        li.append(word)
        trace += grow
    print("  " + " ".join(li))
#--------------------------------------------
# 6330375821 (18.01) 218 (2021-03-01 21:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    union_words = get_unique(words_1 + words_2)
    if not union_words:
        return 0.0
    ins = [word for word in union_words if word in words_1 and word in words_2]
    return len(ins) / len(union_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets whose coefficient is zero are not reported.  The stable sort
    keeps equal coefficients in ascending tweet-id order.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        j_coef = jaccard(tweet, norm_query)
        if j_coef > 0:
            top_n.append([i, j_coef])
    top_n.sort(key=lambda x: x[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    header = '\n#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')'
    print(header)
    parts = tweet_content.split(' ')
    show = '  ' + parts[0]
    for part in parts[1:]:
        if len(show) + 1 + len(part) <= print_width:
            show = show + ' ' + part
        else:
            print(show)
            show = '  ' + part
    print(show)
#--------------------------------------------
# 6330376421 (20.00) 219 (2021-02-28 18:11)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    b = get_unique(words_1 + words_2)
    if not b:
        return 0.0
    a = [i for i in get_unique(words_1) if i in words_2]
    return len(a) / len(b)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Every tweet, including the last one in sorted order, is considered.
    Within a run of equal coefficients, ids are emitted in ascending order.
    Zero coefficients are skipped.
    """
    x = [[jaccard(tweet, norm_query), i] for i, tweet in enumerate(norm_tweets)]
    x.sort(reverse=True)  # descending score, but ids descending inside ties
    top_n = []
    run = []
    for score, tweet_id in x:
        if score <= 0:
            break
        if run and run[-1][1] != score:
            run.reverse()  # restore ascending ids for the finished tie group
            top_n += run
            run = []
        run.append([tweet_id, score])
    run.reverse()
    top_n += run
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns (two-space indent, no trailing spaces)."""
    print("\n#" + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    pieces = tweet_content.split(' ')
    e = '  ' + pieces[0]
    for piece in pieces[1:]:
        if len(e) + 1 + len(piece) > print_width:
            print(e)
            e = '  ' + piece
        else:
            e += ' ' + piece
    print(e)
#--------------------------------------------
# 6330377021 (15.52) 220 (2021-02-28 23:30)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for c in words:
        if c not in unique_words:
            unique_words.append(c)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The shorter distinct list is scanned against the longer one.  Two empty
    lists give 0.0.
    """
    shorter, longer = sorted((get_unique(words_1), get_unique(words_2)), key=len)
    k = sum(1 for word in shorter if word in longer)
    denominator = len(shorter) + len(longer) - k
    if denominator == 0:
        return 0.0
    return k / denominator


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero-coefficient tweets are excluded and the result may be shorter than
    ``n``.  Ties are ordered by ascending tweet id.
    """
    a = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            a.append([coef, -i])
    a.sort()
    a = a[::-1]
    return [[-neg_id, coef] for coef, neg_id in a[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns.

    Words are split on any whitespace; a line may use all ``print_width``
    columns, two-space indent included.
    """
    print('\n#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    x = []
    ga = 2
    for word in tweet_content.split():
        more = len(word) + (1 if x else 0)
        if x and ga + more > print_width:
            print("  " + " ".join(x))
            x = []
            ga = 2
            more = len(word)
        x.append(word)
        ga += more
    print("  " + " ".join(x))
#--------------------------------------------
# 6330378721 (19.45) 221 (2021-02-28 18:05)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    intersect = []
    union = 0
    for word in get_unique(words_1 + words_2):
        union += 1
        if word in words_1 and word in words_2:
            intersect.append(word)
    if union == 0:
        return 0.0
    return len(intersect) / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    The zero test is applied to the coefficient of every entry, never to
    the tweet id, so a best match at id 0 is reported.  Ties go to the
    lower id.  An empty ``norm_tweets`` yields ``[]``.
    """
    ranked = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([coef, -i])
    ranked.sort()
    ranked.reverse()
    return [[-neg_id, coef] for coef, neg_id in ranked[0:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent.

    Words are split on any whitespace.  The final line is flushed once,
    after all words have been placed.
    """
    print('\n#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    a = []
    k = 0  # characters used after the indent
    for word in tweet_content.split():
        need = len(word) + (1 if a else 0)
        if a and need > (print_width - 2) - k:
            print('  ' + ' '.join(a))
            a = []
            k = 0
            need = len(word)
        a.append(word)
        k += need
    print('  ' + ' '.join(a))
#--------------------------------------------
# 6330379321 (19.38) 222 (2021-02-28 08:27)
def get_unique(words):
    """Return the distinct items of ``words``, keeping each item's LAST
    occurrence (earlier repeats are dropped)."""
    unique_words = []
    for position, word in enumerate(words):
        if word not in words[position + 1:]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Repeated words are handled safely; returns 0 when both lists are empty.
    """
    word_1 = get_unique(words_1)
    word_2 = get_unique(words_2)
    c = 0
    for word in list(word_1):
        if word in word_2:
            c += 1
            word_1.remove(word)
            word_2.remove(word)
    # What is left on each side is exclusive to that side.
    P = len(word_1) + len(word_2) + c
    if P != 0:
        return c / P
    return 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    All groups of tied coefficients, including the last one, are emitted
    with ascending ids.  Non-positive coefficients are filtered out.
    An empty ``norm_tweets`` yields ``[]``.
    """
    list_of_top = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jaccards = jaccard(tweet, norm_query)
        if jaccards > 0:
            list_of_top.append([jaccards, tweet_id])
    list_of_top.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, score] for score, tweet_id in list_of_top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent.

    The last pending line is always printed, including for one-word tweets.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    pieces = tweet_content.split(' ')
    show = '  ' + pieces[0]
    for piece in pieces[1:]:
        # "<" on the space-less length equals "<=" once the space is added.
        if len(show + piece) < print_width:
            show += ' ' + piece
        else:
            print(show)
            show = '  ' + piece
    print(show)
#--------------------------------------------
# 6330380921 (15.87) 223 (2021-03-01 22:37)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order.

    An empty input gives an empty list (no placeholder entry).
    """
    unique_words = []
    k = 0
    while k < len(words):
        if words[k] not in unique_words:
            unique_words.append(words[k])
        k += 1
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists as a float.

    Repeated words count once; two empty lists give 0.0.
    """
    union = get_unique(words_1 + words_2)
    if len(union) == 0:
        return 0.0
    shared = [word for word in union if word in words_1 and word in words_2]
    return float(len(shared) / len(union))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Every tweet, including the last one, is scored.  Entries with a
    coefficient of zero are dropped; ties go to the lower tweet id.
    """
    c = []
    for tweet_id in range(len(norm_tweets)):
        jacc = jaccard(norm_tweets[tweet_id], norm_query)
        if jacc > 0:
            c.append([-jacc, tweet_id])  # negated so ascending sort = best first
    c.sort()
    return [[tweet_id, -neg] for neg, tweet_id in c[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    x = round(jc_coef, 2)
    print()
    print('#' + str(tweet_id) + ' (' + str(x) + ')')
    b = []
    k = 2
    for e in tweet_content.split(' '):
        grow = len(e) + (1 if b else 0)
        if b and k + grow > print_width:
            print(' ' * 2 + ' '.join(b))
            b = []
            k = 2
            grow = len(e)
        b.append(e)
        k += grow
    print(' ' * 2 + ' '.join(b))
#--------------------------------------------
# 6330381521 (18.01) 224 (2021-03-01 02:34)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    The input list itself is not reordered.
    """
    in_order = sorted(words)
    unique_words = []
    for position in range(len(in_order)):
        if position == 0 or in_order[position] != in_order[position - 1]:
            unique_words += [in_order[position]]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    After de-duplicating each list, the surplus of the concatenation over
    its distinct words equals the number of shared words.  Two empty lists
    give 0.0.
    """
    combined = get_unique(words_1) + get_unique(words_2)
    distinct = get_unique(combined)
    if not distinct:
        return 0.0
    return (len(combined) - len(distinct)) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero-coefficient tweets are excluded; ties are ordered by ascending id.
    """
    coefficients = [jaccard(tweet, norm_query) for tweet in norm_tweets]
    keyed = sorted(
        [-value, index] for index, value in enumerate(coefficients) if value > 0
    )
    return [[index, -negated] for negated, index in keyed][0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    print()
    coefficient_text = "(" + str(round(jc_coef, 2)) + ")"
    id_text = "#" + str(tweet_id)
    print(id_text, coefficient_text)
    words = tweet_content.split(" ")
    current_line = "  " + words[0]
    for word in words[1:]:
        if len(current_line) + len(" " + word) <= print_width:
            current_line += " " + word
        else:
            print(current_line)
            current_line = "  " + word
    print(current_line)
#--------------------------------------------
# 6330382121 (15.41) 225 (2021-03-01 23:14)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the exact (unrounded) Jaccard coefficient of two word lists.

    Rounding is left to the display code so ranking keeps full precision.
    Two empty lists give 0.0.
    """
    left = get_unique(words_1)
    right = get_unique(words_2)
    a = 0
    for i in left:
        if i in right:
            a = a + 1
    c = len(left) + len(right) - a
    if c == 0:
        return 0.0
    return a / c


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with no word in common with the query are skipped; ties go to
    the lower tweet id.  Fewer than ``n`` pairs may be returned.
    """
    x = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            x.append([coef, -i])
    x.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in x[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent.

    A word too long for the line is printed on a line of its own.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    a = tweet_content.split(' ')
    rows = [[a[0]]]
    e = 2 + len(a[0])  # width of the row being filled
    for word in a[1:]:
        if e + 1 + len(word) <= print_width:
            rows[-1].append(word)
            e = e + 1 + len(word)
        else:
            rows.append([word])
            e = 2 + len(word)
    for row in rows:
        print('  ' + ' '.join(row))
#--------------------------------------------
# 6330384421 (20.00) 226 (2021-03-01 23:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Computed on distinct words, so repeats do not skew the result; two
    empty lists give 0.0.
    """
    vocab_1 = get_unique(words_1)
    vocab_2 = get_unique(words_2)
    sameword = 0
    for i in vocab_1:
        if i in vocab_2:
            sameword += 1
    allword = len(vocab_1) + len(vocab_2) - sameword
    if allword == 0:
        return 0.0
    return sameword / allword


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    The best ``n`` candidates are taken first and those with a zero
    coefficient are then discarded.  Ties go to the lower tweet id.
    """
    scored = [[jaccard(tweet, norm_query), -i] for i, tweet in enumerate(norm_tweets)]
    scored.sort()
    best = scored[::-1][:n]
    return [[-neg_id, coef] for coef, neg_id in best if coef > 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    tweet_content_list = tweet_content.split(' ')
    out = '  ' + tweet_content_list[0]
    for i in tweet_content_list[1:]:
        if len(out) + 1 + len(i) > print_width:
            print(out)
            out = '  ' + i
        else:
            out += ' ' + i
    print(out)
#--------------------------------------------
# 6330386721 (15.00) 227 (2021-03-01 23:26)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    side_1 = set(words_1)
    side_2 = set(words_2)
    st = len(side_1.intersection(side_2))
    snt = len(side_1) + len(side_2) - st
    if snt == 0:
        return 0.0
    return st / snt


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are listed.  When fewer than ``n`` tweets
    match, the shorter list is returned instead of raising IndexError.
    """
    L = []
    for i, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc > 0:
            L.append([-jc, i])
    L.sort()
    return [[tweet_id, -neg] for neg, tweet_id in L[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent.

    Each word is consumed exactly once, so a word wider than the line is
    printed on its own line and the function always terminates.
    """
    x = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    s = '  ' + x[0]
    for word in x[1:]:
        if len(s) + 1 + len(word) <= print_width:
            s += ' ' + word
        else:
            print(s)
            s = '  ' + word
    print(s)
#--------------------------------------------
# 6330387321 (18.33) 228 (2021-03-01 23:12)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Matching is done on distinct words, so a repeated word is counted once
    rather than once per matching pair.  Two empty lists give 0.0.
    """
    distinct_1 = get_unique(words_1)
    distinct_2 = get_unique(words_2)
    a = 0
    for word in distinct_1:
        if word in distinct_2:
            a += 1
    c = len(distinct_1) + len(distinct_2) - a
    if c == 0:
        return 0.0
    return a / c


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero coefficients are skipped; ties are ordered by ascending tweet id.
    """
    top_n0 = []
    for i, tweet in enumerate(norm_tweets):
        a = jaccard(tweet, norm_query)
        if a > 0:
            top_n0.append([a, -i])
    top_n0 = sorted(top_n0, reverse=True)[0:n]
    return [[-y, x] for x, y in top_n0]


def _take_line(words, print_width):
    """Return how many leading ``words`` fit on one two-space-indented line
    (always at least one, so callers make progress)."""
    used = 2 + len(words[0])
    count = 1
    while count < len(words) and used + 1 + len(words[count]) <= print_width:
        used += 1 + len(words[count])
        count += 1
    return count


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    tweet = tweet_content.split(' ')
    a = round(float(jc_coef), 2)
    print("")
    print("#" + str(tweet_id), "(" + str(a) + ")")
    width = int(print_width)
    while tweet:
        d = _take_line(tweet, width)
        print("  " + " ".join(tweet[:d]))
        tweet = tweet[d:]
#--------------------------------------------
# 6330388021 (16.30) 229 (2021-03-01 10:01)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words += [i]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    a = get_unique(words_1)
    b = get_unique(words_2)
    d = len(get_unique(a + b))
    if d == 0:
        return 0.0
    c = 0
    for i in a:
        if i in b:
            c += 1
    return c / d


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    The best ``n`` candidates are selected and every zero-coefficient entry
    among them is filtered out by building a new list (never by removing
    items from the list being iterated).
    """
    top = []
    for i, tweet in enumerate(norm_tweets):
        top += [[jaccard(tweet, norm_query), -i]]
    top.sort(reverse=True)
    chosen = [[-neg_id, coef] for coef, neg_id in top[:n]]
    return [e for e in chosen if e[1] > 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    a = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    limit = int(print_width)
    s = '  ' + a[0]
    for i in a[1:]:
        if len(s) + 1 + len(i) <= limit:
            s += ' ' + i
        else:
            print(s)
            s = '  ' + i
    print(s)
#--------------------------------------------
# 6330389621 (18.33) 230 (2021-03-01 23:21)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words += [word]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty)."""
    x = get_unique(words_1 + words_2)
    if not x:
        return 0.0
    count = 0
    for word in get_unique(words_1):
        if word in words_2:
            count += 1
    return count / len(x)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero coefficients are skipped; ties are ordered by ascending tweet id.
    """
    jacc = []
    for i, tweet in enumerate(norm_tweets):
        found_jaccard = jaccard(tweet, norm_query)
        if found_jaccard > 0:
            jacc.append([-found_jaccard, i])
    jacc.sort()
    return [[a2, -a1] for a1, a2 in jacc][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent.

    Every pass of the outer loop consumes at least one word, so an
    overlong word is printed alone and the loop always terminates.
    """
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    s = tweet_content.split(' ')
    start = 0
    while start < len(s):
        a = s[start]
        stop = start + 1
        while stop < len(s) and 2 + len(a) + 1 + len(s[stop]) <= print_width:
            a += " " + s[stop]
            stop += 1
        print("  " + a)
        start = stop
#--------------------------------------------
# 6330391821 (15.00) 231 (2021-03-01 16:15)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The union contains every distinct word of BOTH lists; the intersection
    counts each shared word once.  Two empty lists give 0.0.
    """
    z = get_unique(list(words_2) + list(words_1))
    if not z:
        return 0.0
    c = 0
    for word in get_unique(words_1):
        if word in words_2:
            c += 1
    return c / len(z)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Ids are carried with their coefficient through the sort rather than
    being looked up afterwards.  Zero coefficients are skipped and the
    result is shorter than ``n`` when few tweets match.
    """
    jaccard_list = []
    for number, tweet in enumerate(norm_tweets):
        jack = jaccard(tweet, norm_query)
        if jack > 0:
            jaccard_list.append([number, jack])
    # Stable sort: equal coefficients stay in ascending tweet-id order.
    jaccard_sort = sorted(jaccard_list, key=lambda pair: pair[1], reverse=True)
    return jaccard_sort[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    x = tweet_content.split(" ")
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    y = "  " + x[0]
    c = 1
    while c != len(x):
        if len(y) + 1 + len(x[c]) <= print_width:
            y += " " + x[c]
        else:
            print(y)
            y = "  " + x[c]
        c += 1
    print(y)
#--------------------------------------------
# 6330392421 (20.00) 232 (2021-02-26 11:15)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words
#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    A word counts as shared only when it occurs in both lists; a word
    repeated inside ``words_2`` alone is not shared.  Two empty lists
    give 0.0.
    """
    first_words = get_unique(words_1)
    all_words = list(first_words)
    c = 0
    for e in get_unique(words_2):
        if e in first_words:
            c += 1
        else:
            all_words.append(e)
    if not all_words:
        return 0.0
    return c / len(all_words)
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero coefficients are dropped; ties are ordered by ascending tweet id.
    """
    pre = []
    for i, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc > 0:
            pre.append([jc, i * -1])  # negated id: reverse sort keeps low ids first
    pre.sort()
    top_n = [[neg_id * -1, jc] for jc, neg_id in pre[::-1]]
    return top_n[:n]
#--------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, ``#id (coef)`` and the tweet wrapped to
    ``print_width`` columns with a two-space indent."""
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    result = tweet_content.split(' ')
    c = '  ' + str(result[0])
    for piece in result[1:]:
        if len(c) + 1 + len(piece) > print_width:
            print(c)
            c = '  ' + str(piece)
        else:
            c = c + ' ' + str(piece)
    print(c)
#--------------------------------------------
# 6330393021 (16.94) 233 (2021-03-01 20:54)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` that also occur in
    ``words_2``.  Two empty lists yield 0.0 instead of dividing by zero.
    """
    shared = [word for word in words_1 if word in words_2]
    total = len(words_1) + len(words_2) - len(shared)
    if total == 0:
        return 0.0
    return len(shared) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only tweets with a positive coefficient are considered; ties go to the
    smaller id.  Fewer than ``n`` matches is not an error: whatever matched
    is returned.
    """
    all_data = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # computed once per tweet
        if coef > 0:
            all_data.append([tweet_id, coef])
    all_data.sort(key=lambda pair: (-pair[1], pair[0]))
    return all_data[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Lines are printed as they are completed, so two identical lines are
    both shown.
    """
    print("\n#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    line = " "  # one space + the joining space forms the two-space indent
    for word in tweet_content.split():
        candidate = line + " " + word
        if len(candidate) > print_width and line != " ":
            print(line)
            candidate = "  " + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330394721 (13.63) 234 (2021-03-01 05:36)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order.

    The input list is left untouched (it is neither sorted nor modified).
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared), or 0 if empty."""
    same_words = [word for word in words_1 if word in words_2]
    total = len(words_1) + len(words_2) - len(same_words)
    if total == 0:
        return 0
    return len(same_words) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with a zero coefficient are skipped, equal coefficients are
    ordered by ascending tweet id, and ``n`` may exceed the match count.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the ``#id (coef)`` header followed by the wrapped tweet text.

    Every word, including the last one, is measured before it is placed, so
    no body line grows past ``print_width`` unless a single word is too long.
    """
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split():
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330395321 (20.00) 235 (2021-03-01 22:13)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    seen = []
    for item in words:
        if item in seen:
            continue
        seen.append(item)
    return seen


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct words)."""
    same = sum(1 for word in words_1 if word in words_2)
    pool = []
    for source in (words_1, words_2):
        for word in source:
            if word not in pool:
                pool.append(word)
    return same / len(pool)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the best ``n`` ``[tweet_id, coefficient]`` pairs.

    Only positive coefficients are kept; ordering is by descending
    coefficient, then ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped text.

    The text is consumed from the front as a string: each pass cuts off the
    part that fits, prints it behind a two-space indent and keeps the rest.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    remaining = tweet_content
    while len(remaining) != 0:
        cursor = 0
        if ' ' not in remaining[:print_width - 2]:
            # No break point inside the window: emit up to the next space.
            chunk = remaining[:remaining.find(' ')]
            remaining = remaining[remaining.find(' '):].strip() + ' '
            print('  ' + chunk)
        else:
            while True:
                if len(remaining) < print_width:
                    # Everything left fits on one line.
                    chunk = remaining
                    remaining = ''
                    print('  ' + chunk)
                    break
                space_at = remaining.find(' ', cursor)
                if space_at > print_width - 2:
                    # The next word would overflow: cut before it.
                    chunk = remaining[:cursor]
                    remaining = remaining[cursor:].strip(' ') + ' '
                    print('  ' + chunk)
                    break
                else:
                    cursor = space_at + 1
#--------------------------------------------
# 6330396021 (17.95) 236 (2021-03-01 17:43)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order.

    The input list is not sorted or otherwise modified.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` found in ``words_2``; two
    empty lists yield 0.0 instead of dividing by zero.
    """
    n_same = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - n_same
    if total == 0:
        return 0.0
    return n_same / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-similarity tweets are skipped; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    "Is this the first word on the line" is decided from the line state, not
    by comparing the word's text with the tweet's first word, so repeated
    words wrap like any other word.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = " "  # one space + the joining space forms the two-space indent
    for word in tweet_content.split():
        candidate = line + " " + str(word)
        if print_width < len(candidate) and line != " ":
            print(line)
            candidate = "  " + str(word)
        line = candidate
    print(line)
#--------------------------------------------
# 6330398221 (18.50) 237 (2021-03-01 16:14)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |distinct shared words| / |distinct words overall|.

    A word repeated inside only one of the lists is not counted as shared.
    Two empty lists yield 0.0 instead of dividing by zero.
    """
    distinct_2 = get_unique(words_2)
    shared = [word for word in get_unique(words_1) if word in distinct_2]
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Uses ``jaccard`` for scoring, skips zero coefficients, breaks ties by
    ascending tweet id, and returns every match when fewer than ``n`` exist.
    """
    top = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            top.append([tweet_id, coef])
    top.sort(key=lambda pair: (-pair[1], pair[0]))
    return top[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Each word is followed by one space, so printed lines end with a space.
    """
    print('\n#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    text = '  '
    for word in tweet_content.split(' '):
        if len(text) + len(word) <= print_width:
            text += word + ' '
        else:
            print(text)
            text = '  ' + word + ' '
    print(text)
#--------------------------------------------
# 6330399921 (19.95) 238 (2021-02-28 21:45)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` found in ``words_2``; two
    empty lists yield 0.0 instead of dividing by zero.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; equal coefficients keep ascending
    tweet-id order.
    """
    total = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            total.append([tweet_id, jac])
    total.sort(key=lambda pair: (-pair[1], pair[0]))
    return total[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The final line is flushed after the loop, so a word that merely has the
    same text as the last word is treated like any other word.  A tweet
    without words prints only the header.
    """
    print('\n#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split()
    line = ' '  # one space + the joining space forms the two-space indent
    for word in words:
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    if words:
        print(line)
#--------------------------------------------
# 6330400821 (20.00) 239 (2021-02-28 16:05)
def get_unique(words):
    """Deduplicate ``words`` in place and return that same list.

    The list is sorted, then its head is repeatedly taken off and put back
    at the tail only when no other copy of it remains.
    """
    words.sort()
    for _ in range(len(words) - 1):
        head = words.pop(0)
        if head not in words:
            words.append(head)
    return words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` is counted by scanning the shorter list (``words_1`` when
    both have the same length) for membership in the other one.
    """
    if len(words_1) > len(words_2):
        probe, pool = words_2, words_1
    else:
        probe, pool = words_1, words_2
    hits = 0
    for word in probe:
        if word in pool:
            hits += 1
    return hits / (len(words_2) + len(words_1) - hits)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the best ``n`` ``[tweet_id, coefficient]`` pairs.

    Zero coefficients are dropped; ordering is by descending coefficient,
    then ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Every word is followed by one space, so printed lines end with a space.
    """
    print("\n#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = "  "
    for word in tweet_content.split(" "):
        if len(line) + len(word) > print_width:
            print(line)
            line = "  "
        line += word + " "
    print(line)
#--------------------------------------------
# 6330401421 (17.47) 240 (2021-02-28 22:32)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + items of ``words_2`` not in ``words_1``).

    Two empty lists yield 0.0 instead of dividing by zero.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + sum(1 for word in words_2 if word not in words_1)
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Each tweet is scored once; zero scores are dropped and ties go to the
    smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The width test measures the candidate line itself (indent included
    once), so lines are filled up to ``print_width`` before wrapping.
    """
    print("\n#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = " "  # one space + the joining space forms the two-space indent
    for word in tweet_content.split():
        candidate = line + " " + str(word)
        if len(candidate) > print_width and line != " ":
            print(line)
            candidate = "  " + str(word)
        line = candidate
    print(line)
#--------------------------------------------
# 6330402021 (20.00) 241 (2021-02-27 20:17)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    seen = []
    for item in words:
        if item in seen:
            continue
        seen.append(item)
    return seen


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct words)."""
    pool = []
    for word in words_1 + words_2:
        if word not in pool:
            pool.append(word)
    overlap = [word for word in words_1 if word in words_2]
    return len(overlap) / len(pool)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the best ``n`` ``[tweet_id, coefficient]`` pairs.

    Only positive coefficients qualify; ordering is by descending
    coefficient and, among equals, ascending tweet id.
    """
    ranked = [[tweet_id, jaccard(tweet, norm_query)]
              for tweet_id, tweet in enumerate(norm_tweets)]
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    top_n = []
    for pair in ranked[:max(n, 0)]:
        if pair[1] > 0:
            top_n.append(pair)
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Each line takes the next word unconditionally and then as many further
    words as fit into ``print_width - 2`` columns, behind a two-space indent.
    """
    print()
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    limit = print_width - 2
    pos = 0
    while pos < len(words):
        chunk = [words[pos]]
        used = len(words[pos])
        pos += 1
        while pos < len(words) and used + len(words[pos]) + 1 <= limit:
            used += len(words[pos]) + 1
            chunk.append(words[pos])
            pos += 1
        print("  " + " ".join(chunk))
#--------------------------------------------
# 6330403721 (18.98) 242 (2021-02-28 18:17)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order.

    Works for single-element and all-equal lists, and leaves the input
    list unmodified.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared distinct words| / |distinct words overall| (0 if empty)."""
    unique_words1 = get_unique(words_1)
    unique_words2 = get_unique(words_2)
    shared = sum(1 for word in unique_words1 if word in unique_words2)
    total = len(unique_words1) + len(unique_words2) - shared
    if total == 0:
        return 0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are dropped; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc != 0:
            ranked.append([tweet_id, jc])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Body lines carry the two-space indent that the width budget reserves,
    and no extra blank line is emitted after the text.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330404321 (18.01) 243 (2021-03-01 20:25)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order.

    Builds a new list instead of deleting from a list that is being
    indexed, so no index can run past the end.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct words).

    Two empty lists yield 0.0 instead of dividing by zero.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-similarity tweets are skipped; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(norm_query, tweet)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet."""
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        # Never flush a line that holds no word yet.
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330405021 (18.50) 244 (2021-02-26 00:33)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (``words_1`` plus the new items of ``words_2``).

    An empty denominator is replaced by 1, so two empty lists give 0.0.
    """
    shared = [word for word in words_1 if word in words_2]
    combined = words_1 + [word for word in words_2 if word not in words_1]
    return len(shared) / (len(combined) or 1)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are dropped and ties go to the smaller tweet id.
    Slicing from the front keeps every match when ``n`` is larger than the
    number of candidates.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet."""
    print(f"\n#{tweet_id} ({round(jc_coef,2)})")
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330406621 (16.25) 245 (2021-03-01 23:28)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |distinct shared words| / |distinct words overall|.

    Two empty lists yield 0.0, so the function always returns a number.
    """
    shared = get_unique([word for word in words_1 if word in words_2])
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Every tweet with a non-zero coefficient takes part in the ranking
    (including the lowest-scoring one and the last group of ties); equal
    coefficients are ordered by ascending tweet id.
    """
    top = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0.0:
            top.append([tweet_id, coef])
    top.sort(key=lambda pair: (-pair[1], pair[0]))
    return top[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet."""
    print(" ")
    print('#' + str(tweet_id) + ' ' + "(" + str(round(jc_coef, 2)) + ")")
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(" "):
        candidate = line + " " + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = "  " + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330407221 (17.00) 246 (2021-02-27 17:41)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct words).

    Two empty lists yield 0.0 instead of dividing by zero.
    """
    same = sum(1 for word in words_1 if word in words_2)
    different = get_unique(list(words_1) + list(words_2))
    if not different:
        return 0.0
    return same / len(different)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-similarity tweets are skipped, ties go to the smaller tweet id and
    ``n`` may be larger than the number of matches.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Words are grouped into lines of at most ``print_width - 2`` characters
    and each group is printed behind a two-space indent.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    limit = print_width - 2
    width = 0        # columns used on the current line, joining space included
    current = []     # words of the line being built
    finished = []    # completed lines
    for word in tweet_content.split(' '):
        if width + len(word) > limit:
            finished.append(current)
            current = []
            width = 0
        current.append(word)
        width += len(word) + 1
    for words in finished:
        print('  ' + ' '.join(words))
    print("  " + " ".join(current))
#--------------------------------------------
# 6330408921 (18.35) 247 (2021-02-26 00:29)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order.

    The input list is not sorted or otherwise modified.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` counts the items of ``words_1`` found in ``words_2``; two
    empty lists yield 0.0 instead of dividing by zero.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Each tweet is scored once; zero scores are dropped and ties go to the
    smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The length of each candidate line is measured directly, so wrapped
    lines obey the same ``print_width`` limit as the first line.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330409521 (10.52) 248 (2021-02-28 15:22)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order.

    No sentinel is used, so an empty-string item is kept like any other.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (``words_1`` plus the new items of ``words_2``).

    The exact ratio is returned (rounding is left to the display code) and
    two empty lists yield 0.0.
    """
    shared = [word for word in words_1 if word in words_2]
    combined = list(words_1)
    for word in words_2:
        if word not in combined:
            combined.append(word)
    if not combined:
        return 0.0
    return len(shared) / len(combined)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are skipped, ties go to the smaller tweet id, and
    asking for more results than exist returns all matches.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Words are grouped into lines of at most ``print_width - 2`` characters
    and printed behind a two-space indent.  No ruler line is printed.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    limit = print_width - 2
    length = 0       # columns used on the current line, joining space included
    current = []     # words of the line being built
    finished = []    # completed lines
    for word in tweet_content.split(' '):
        if length + len(word) > limit:
            finished.append(current)
            current = []
            length = 0
        current.append(word)
        length += len(word) + 1
    for words in finished:
        print('  ' + ' '.join(words))
    print('  ' + ' '.join(current))
#--------------------------------------------
# 6330410021 (15.38) 249 (2021-02-27 12:46)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (distinct words overall).

    ``shared`` is counted by scanning the shorter list (``words_1`` when
    the lengths are equal) for membership in the other.  Two empty lists
    yield 0.0 instead of dividing by zero.
    """
    if len(words_1) > len(words_2):
        probe, pool = words_2, words_1
    else:
        probe, pool = words_1, words_2
    shared = [word for word in probe if word in pool]
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-similarity tweets are skipped.  Ties of any length are ordered by
    ascending tweet id through the sort key.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Lines are produced one at a time, so the text may span any number of
    lines without words being dropped.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330411721 (17.43) 250 (2021-02-28 15:48)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` is counted by scanning the longer list (``words_2`` when the
    lengths are equal) for membership in the other.  Two empty lists yield
    0.0 instead of dividing by zero.
    """
    if len(words_1) > len(words_2):
        probe, pool = words_1, words_2
    else:
        probe, pool = words_2, words_1
    shared = sum(1 for word in probe if word in pool)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-similarity tweets are skipped; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Words and the single spaces between them are treated as separate
    tokens; a separator that lands on a line break is discarded.
    """
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    tokens = tweet_content.split(' ')
    for i in range(len(tokens) - 1):
        tokens.insert(2 * i + 1, ' ')  # put a separator after every word but the last
    line = '  '
    for token in tokens:
        if len(line) + len(token) <= print_width:
            line += token
        else:
            print(line)
            line = '  ' if token == ' ' else '  ' + token
    print(line)
#--------------------------------------------
# 6330412321 (18.01) 251 (2021-03-01 17:06)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |distinct shared words| / |distinct words overall|.

    Two empty lists yield 0.0 instead of dividing by zero.
    """
    distinct_1 = get_unique(words_1)
    distinct_2 = get_unique(words_2)
    union = get_unique(distinct_1 + distinct_2)
    if not union:
        return 0.0
    shared = [word for word in distinct_1 if word in distinct_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-similarity tweets are skipped; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    A line is flushed only once it holds at least one word, and flushed
    lines carry no trailing separator.
    """
    print('')
    label = '(' + str(round(jc_coef, 2)) + ')'
    print('#' + str(tweet_id), label)
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330413021 (20.00) 252 (2021-02-28 16:30)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    unique_words = []
    for item in words:
        if item in unique_words:
            continue
        unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared).

    ``shared`` is the number of distinct items of ``words_1`` that also
    occur among the distinct items of ``words_2``.
    """
    distinct_2 = get_unique(words_2)
    coef = sum(1 for word in get_unique(words_1) if word in distinct_2)
    return coef / (len(words_1) + len(words_2) - coef)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the best ``n`` ``[tweet_id, coefficient]`` pairs.

    Zero coefficients are dropped; the stable sort keeps ascending tweet-id
    order among equal coefficients.
    """
    matches = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: pair[1], reverse=True)
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    A running count of word lengths plus separators decides when to wrap;
    every word is followed by a space, so printed lines end with a space.
    """
    print(f'\n#{tweet_id} ({round(jc_coef,2)})')
    line = '  '
    used = 0
    for word in tweet_content.split(' '):
        used += len(word) + 1
        if used >= print_width:
            print(line)
            used = len(word) + 1
            line = '  ' + word + ' '
        else:
            line += word + ' '
    print(line)
#--------------------------------------------
# 6330415221 (14.92) 253 (2021-03-01 23:14)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct words).

    Only ``words_1`` is scanned for shared items, so nothing is counted
    twice when ``words_2`` is the longer list.  Two empty lists yield 0.0.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; ties go to the smaller tweet id.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    Every word is handled by the same loop, so a one-word tweet is printed
    as well.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '  # one space + the joining space forms the two-space indent
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width and line != ' ':
            print(line)
            candidate = '  ' + word
        line = candidate
    print(line)
#--------------------------------------------
# 6330416921 (16.94) 254 (2021-02-28 02:20)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |distinct shared words| / |distinct words overall|.

    Two empty lists yield 0.0 instead of dividing by zero.
    """
    distinct_1 = get_unique(words_1)
    distinct_2 = get_unique(words_2)
    shared = sum(1 for word in distinct_1 if word in distinct_2)
    union = distinct_1 + [word for word in distinct_2 if word not in distinct_1]
    if not union:
        return 0.0
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-similarity tweets are skipped, ties go to the smaller tweet id and
    ``n`` may be larger than the number of matches.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    After placing a word the next one is peeked at; if it would not fit the
    current line is flushed.  Words are followed by a space, so printed
    lines end with a space.
    """
    words = tweet_content.split()
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = '  '
    for i, word in enumerate(words):
        line = line + word + ' '
        if i < len(words) - 1 and len(line) + len(words[i + 1]) >= print_width + 1:
            print(line)
            line = '  '
    print(line)
#--------------------------------------------
# 6330417521 (17.92) 255 (2021-03-01 02:12)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return shared / (len(words_1) + len(words_2) - shared), unrounded.

    Rounding is left to the display code so that ranking works on exact
    values and small matches are not flattened to zero.  Two empty lists
    yield 0.0 instead of dividing by zero.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = (len(words_1) + len(words_2)) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero coefficients are dropped; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The whole body is assembled into one string with embedded newlines and
    printed in a single call.
    """
    words = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    text = ' '
    used = 1  # columns consumed on the current line
    for word in words:
        used += len(word) + 1
        if used > print_width:
            text += '\n'
            text += '  ' + word
            used = 2 + len(word)
        else:
            text += ' ' + word
    print(text)
#--------------------------------------------
# 6330418121 (19.15) 256 (2021-02-27 17:27)
def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return (total items - distinct items) / distinct items.

    The two lists are combined into a new list, so neither argument is
    modified.  Two empty lists yield 0.0 instead of dividing by zero.
    """
    combined = list(words_1) + list(words_2)
    distinct = get_unique(combined)
    if not distinct:
        return 0.0
    return (len(combined) - len(distinct)) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (coef)`` header and the wrapped tweet.

    Every word is followed by a space, including the first word of a
    wrapped line, so consecutive words never run together.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = '  '
    for word in tweet_content.split(' '):
        if len(line) + len(word) <= print_width:
            line += word + ' '
        else:
            print(line)
            line = '  ' + word + ' '
    print(line)
#--------------------------------------------
# 6330420321 (20.00) 257 (2021-02-26 21:54)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    seen = []
    for item in words:
        if item in seen:
            continue
        seen.append(item)
    return seen


def jaccard(words_1, words_2):
    """Return (items of ``words_1`` found in ``words_2``) / (distinct words)."""
    hits = sum(1 for word in words_1 if word in words_2)
    pool = [word for word in words_1]
    pool.extend(word for word in words_2)
    return hits / len(get_unique(pool))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the best ``n`` ``[tweet_id, coefficient]`` pairs.

    Only positive coefficients are kept; ordering is by descending
    coefficient, then ascending tweet id.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped tweet.

    The loop looks at consecutive word pairs: the earlier word is added to
    the pending line, and the line is flushed when adding the later word
    would exceed ``print_width``.
    """
    width = print_width
    words = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    pending = ' '
    preview = ' '
    last = len(words) - 1
    for i in range(1, len(words)):
        prev_word = words[i - 1]
        this_word = words[i]
        pending += ' ' + prev_word
        preview = pending + ' ' + this_word
        if len(preview) > width:
            if not any(ch.isalpha() for ch in pending.lstrip()):
                # Nothing alphabetic collected: discard and move on.
                pending = ' '
                preview = ' '
                if i != last:
                    continue
            if any(ch.isalpha() for ch in pending.lstrip()):
                print('  ' + pending.lstrip())
                pending = ' '
                preview = ' '
        if i == last:
            # Flush whatever is pending together with the final word.
            preview = pending + ' ' + this_word
            print('  ' + preview.lstrip())
#--------------------------------------------
# 6330422621 (17.95) 258 (2021-02-28 23:14)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The number of distinct words is the union size; the surplus of the
    combined length over it is the intersection size.  Two empty lists
    yield 0.0.
    """
    merged = words_1 + words_2
    n_union = len(get_unique(merged))
    if n_union == 0:
        return 0.0
    n_shared = len(merged) - n_union
    return n_shared / n_union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:  # tweets sharing no word with the query are not results
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Words are split on any whitespace; each text line is indented by two
    spaces.  Returns nothing.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split()
    paragraph = ''
    for i, word in enumerate(words):
        paragraph += ' ' + word
        is_last = i + 1 == len(words)
        # paragraph already starts with one space and printing adds a second,
        # so its rendered width is len(paragraph) + 1; the next word would
        # add a separator plus its own length.
        if is_last or len(paragraph) + 2 + len(words[i + 1]) > print_width:
            print(' ' + paragraph)
            paragraph = ''
#--------------------------------------------
# 6330423221 (6.58) 259 (2021-03-01 23:59)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient: shared words / distinct words.

    Two empty lists yield 0.0.
    """
    distinct = get_unique(words_1 + words_2)
    if not distinct:
        return 0.0
    shared = [e for e in words_1 if e in words_2]
    return len(shared) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    bank = []
    # enumerate gives the true position; list.index would report the first
    # equal tweet for every duplicate.
    for tweet_id, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            bank.append([jac, tweet_id])
    # Larger coefficient first; smaller id first among equals.
    bank.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, jac] for jac, tweet_id in bank[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Each text line is indented by two spaces.  Returns nothing.
    """
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = ""
    for word in tweet_content.split(" "):
        if not line:
            line = "  " + word
        elif len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    if line:
        print(line)
#--------------------------------------------
# 6330424921 (18.01) 260 (2021-02-26 16:02)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Intersection size = combined length - number of distinct words.
    The arguments are left untouched; two empty lists yield 0.0.
    """
    # list(...) copies: extending words_1 directly would leak the query words
    # into the caller's tweet.
    word = list(words_1)
    word.extend(words_2)
    unique = get_unique(word)
    if not unique:
        return 0.0
    return (len(word) - len(unique)) / len(unique)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    # Negated index: a descending sort then prefers the smaller id on ties.
    top = [[jaccard(tweet, norm_query), -i] for i, tweet in enumerate(norm_tweets)]
    top = [pair for pair in top if pair[0] > 0]
    top.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet text wrapped to ``print_width`` columns.

    Text lines are indented by two spaces.  Returns nothing.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    # Starts as one space; every appended word brings its own leading space,
    # which produces the two-space indent for the first word.
    line = ' '
    for word in tweet_content.split(' '):
        room = print_width - len(line)
        if line == ' ' or room >= len(word) + 1:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    # Flush once after the loop; comparing each word with the last word's
    # value would print early whenever that word also occurs earlier.
    print(line)
#--------------------------------------------
# 6330425521 (20.00) 261 (2021-03-01 18:16)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order.

    Example: ['x', 'y', 'z', 'y', 'xyz', 'z'] -> ['x', 'y', 'z', 'xyz'].
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Both inputs are de-duplicated first, so repeated words do not distort
    the result.  Example: ['x', 'y', 'z', 'xyz'] vs ['y', 'x', 'w'] -> 0.4.
    Two empty lists yield 0.0.
    """
    set_1 = get_unique(words_1)
    set_2 = get_unique(words_2)
    up = sum(1 for word in set_1 if word in set_2)
    # Union = all of set_2 plus the words that only occur in set_1.
    down = len(set_2) + sum(1 for word in set_1 if word not in set_2)
    if down == 0:
        return 0.0
    return up / down
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    pre_topn = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            pre_topn.append([-coef, tweet_id])  # negated for ascending sort
    pre_topn.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in pre_topn[:n]]
#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The whole text block is built as one string in which wrapped lines are
    separated by a newline followed by the two-space indent.
    """
    indent = '  '
    text = indent
    used = len(indent)  # columns consumed on the current line
    for word in tweet_content.split(' '):
        if used == len(indent):
            # First word of a line: no separator needed.
            text += word
            used += len(word)
        elif used + 1 + len(word) <= print_width:
            text += ' ' + word
            used += 1 + len(word)
        else:
            text += '\n' + indent + word
            used = len(indent) + len(word)
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    print(text)
#--------------------------------------------
# 6330426121 (20.00) 262 (2021-03-01 03:14)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Union size is computed as len(A) + len(B) - shared.  Two empty lists
    yield 0.0.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    union_size = len(words_1) + len(words_2) - shared
    if union_size == 0:
        return 0.0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The coefficient is shown with exactly two decimals.  Text lines are
    indented by two spaces; a separating space follows each word while room
    remains on the line.
    """
    print('\n#{:d} ({:.2f})'.format(tweet_id, round(jc_coef, 2)))
    indent = '  '
    full_room = print_width - len(indent)
    room = full_room
    line = indent
    lines = []
    for word in tweet_content.split(' '):
        if len(word) > room and line != indent:
            # Does not fit behind what is already there: start a new line.
            lines.append(line)
            line = indent
            room = full_room
        if len(word) <= room:
            line += word
            room -= len(word)
            if room > 0:
                line += ' '
                room -= 1
        else:
            # Wider than an empty line: emit it on a line of its own.
            lines.append(line + word)
            line = indent
            room = full_room
    # Skip the final buffer when it holds nothing but the indent, unless
    # nothing at all was emitted.
    if line != indent or not lines:
        lines.append(line)
    for text in lines:
        print(text)
#--------------------------------------------
# 6330427821 (13.75) 263 (2021-02-27 19:29)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order.

    The input list is not modified.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jack_card(tweet, norm_query)
        if coef != 0:
            top_n.append([tweet_id, coef])
    # A reversed sort on [coef, id] would list equal coefficients with the
    # larger id first; the key keeps the smaller id ahead.
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]
#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines start with a two-space indent and end with one trailing
    space after the last word.
    """
    indent = '  '
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = indent
    for word in tweet_content.split(' '):
        # len(line) counts the trailing space, i.e. the separator the next
        # word needs, so only the word itself has to fit.
        if line == indent or len(line) + len(word) <= print_width:
            line += word + ' '
        else:
            print(line)
            line = indent + word + ' '
    print(line)
#--------------------------------------------------------
def jack_card(words_1, words_2):
    """Return the Jaccard coefficient: shared words / distinct words.

    Neither argument is modified; two empty lists yield 0.0.
    """
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    shared = sum(1 for word in get_unique(words_1) if word in words_2)
    return shared / distinct


# The other three functions use their assignment names; expose the
# coefficient under the matching name as well.
jaccard = jack_card
#--------------------------------------------------------
# 6330428421 (18.01) 264 (2021-02-28 23:24)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union is counted as all of ``words_2`` plus the words found only in
    ``words_1``.  Two empty lists yield 0.0.
    """
    overlap = 0
    union = len(words_2)
    for word in words_1:
        if word in words_2:
            overlap += 1
        else:
            union += 1
    if union == 0:
        return 0.0
    return overlap / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    ranked = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:  # unrelated tweets must not pad the result
            # Negated index so the descending sort prefers the smaller id.
            ranked.append([coef, -i])
    ranked.sort(reverse=True)
    return [[-neg_i, coef] for coef, neg_i in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.  Returns nothing.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        if line == '':
            # First word on a line is always placed, even if it is too wide.
            line = '  ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    if line != '':
        print(line)
#--------------------------------------------
# 6330429021 (18.40) 265 (2021-02-27 21:08)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Starts from the combined length and removes one per shared word to get
    the union size.  Two empty lists yield 0.0.
    """
    i_words = 0
    u_words = len(words_1) + len(words_2)
    for word in words_1:
        if word in words_2:
            i_words += 1
            u_words -= 1  # a shared word was counted twice above
    if u_words == 0:
        return 0.0
    return i_words / u_words


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    pairs = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # computed once per tweet
        if coef > 0:
            pairs.append([coef, -i])
    pairs.sort(reverse=True)
    return [[-neg_i, coef] for coef, neg_i in pairs[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces; each word appears exactly once.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(" "):
        if not line:
            line = '  ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    if line:
        print(line)
#--------------------------------------------
# 6330430621 (18.33) 266 (2021-03-01 18:11)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    ``common`` counts shared words, ``only_one`` the words found in exactly
    one list; the union size is their sum.  Two empty lists yield 0.0.
    """
    common = 0
    only_one = 0
    for e in words_1:
        if e in words_2:
            common += 1
        else:
            only_one += 1
    for e in words_2:
        if e not in words_1:
            only_one += 1
    if common + only_one == 0:
        return 0.0
    return common / (common + only_one)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            ranked.append([-coef, tweet_id])  # negated for ascending sort
    ranked.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.  A word wider than the line is
    printed on a line of its own.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    # ``line`` holds the words each preceded by one space; printing adds one
    # more space, which gives the two-space indent.
    line = ''
    for word in tweet_content.split(' '):
        # Flush only a non-empty line: retrying an oversized word on an empty
        # line could never succeed.
        if line and len(line) + len(word) >= print_width - 1:
            print(' ' + line)
            line = ''
        line += ' ' + word
    if line:
        print(' ' + line)
#--------------------------------------------
# 6330431221 (20.00) 267 (2021-03-01 04:11)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    shared = sum(1 for item in words_1 if item in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        jc_cof = jaccard(tweet, norm_query)
        if jc_cof > 0:
            top_n.append([i, jc_cof])
    # Stable sort: equal coefficients stay in ascending-id insertion order.
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print one blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.
    """
    print()  # exactly one blank line; print('\n') would emit two
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    # Joining [' ', w1, w2] with ' ' renders as two spaces, then 'w1 w2'.
    parts = [' ']
    for word in tweet_content.split(' '):
        candidate = ' '.join(parts + [word])
        if len(candidate) > print_width and len(parts) > 1:
            print(' '.join(parts))
            parts = [' ', word]
        else:
            parts.append(word)
    print(' '.join(parts))
#--------------------------------------------
# 6330432921 (17.50) 268 (2021-03-01 14:47)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Union size = shared words + words only in the first list + words only
    in the second list.  Two empty lists yield 0.0.
    """
    shared = 0
    only_1 = 0
    for word in words_1:
        if word in words_2:
            shared += 1
        else:
            only_1 += 1
    only_2 = sum(1 for word in words_2 if word not in words_1)
    total = shared + only_1 + only_2
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top_m = [[-jaccard(tweet, norm_query), i] for i, tweet in enumerate(norm_tweets)]
    top_m.sort()
    top_n = []
    for neg_coef, tweet_id in top_m[:n]:
        # Stop at the first zero COEFFICIENT (not tweet id 0); everything
        # after it in the sorted list is zero as well.
        if neg_coef == 0:
            break
        top_n.append([tweet_id, -neg_coef])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.  Wrapping is word based, so a
    word wider than the line is printed on its own line.
    """
    print("")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    line = ""
    for word in tweet_content.split(" "):
        if not line:
            line = "  " + word
        elif len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    if line:
        print(line)
#--------------------------------------------
# 6330433521 (18.33) 269 (2021-03-01 23:25)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for w in words:
        if w not in unique_words:
            unique_words.append(w)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    count = len([word for word in words_1 if word in words_2])
    denominator = len(words_1) + len(words_2) - count
    if denominator == 0:
        return 0.0
    return count / denominator


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top = []
    for i, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac > 0:
            top.append([-jac, i])
    top.sort()
    return [[tweet_id, -neg_jac] for neg_jac, tweet_id in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces; a separating space follows each
    word while room remains.  A word wider than the line is printed on a
    line of its own.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    width = print_width - len(indent)
    finished = []
    current = indent
    space_left = width
    for word in tweet_content.split(' '):
        fits = len(word) <= space_left
        if not fits and current != indent:
            finished.append(current)
            current = indent
            space_left = width
            fits = len(word) <= space_left
        if fits:
            current += word
            space_left -= len(word)
            if space_left > 0:
                current += ' '
                space_left -= 1
        else:
            # Oversized word on an empty line: emit it as is.
            finished.append(indent + word)
    if current != indent or not finished:
        finished.append(current)
    for text in finished:
        print(text)
#--------------------------------------------
# 6330434121 (20.00) 270 (2021-03-01 18:30)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    jc = sum(1 for word in words_1 if word in words_2)
    union_size = len(words_1) + len(words_2) - jc
    if union_size == 0:
        return 0.0
    return jc / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    candidates = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            candidates.append([i, coef])
    # sorted() is stable, so ascending ids survive among equal coefficients.
    return sorted(candidates, key=lambda x: x[1], reverse=True)[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet text wrapped to ``print_width`` columns.

    Text lines are indented by two spaces.
    """
    print(" ")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    # One space to start with; the first appended word adds the second one.
    line = " "
    for word in tweet_content.split(' '):
        if line == " " or len(line) + len(word) + 1 <= print_width:
            line = line + " " + word
        else:
            print(line)
            line = "  " + word
    print(line)
#--------------------------------------------
# 6330435821 (20.00) 271 (2021-02-26 01:22)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    intersec = sum(1 for item in words_1 if item in words_2)
    union = len(words_2) + len(words_1) - intersec
    if union == 0:
        return 0.0
    return intersec / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    # Negated id: in the descending sort the smaller id comes first on ties.
    top = [[jaccard(norm_query, tweet), -tweet_id]
           for tweet_id, tweet in enumerate(norm_tweets)]
    top.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in top[:n] if coef != 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Every text line, wrapped ones included, is indented by two spaces.
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    display = ''
    for word in tweet_content.split(' '):
        if display == '':
            display = '  ' + word
        elif len(display) + len(word) + 1 <= print_width:
            display += ' ' + word
        else:
            print(display)
            display = '  ' + word
    print(display)
#--------------------------------------------
# 6330436421 (17.00) 272 (2021-02-27 23:07)
def get_unique(words):
    """Return the strings of ``words`` without duplicates.

    A value that occurs several times in ``words`` appears exactly once in
    the result, at the position of its first occurrence.

    Doctest :
    >>> words = ['x', 'y', 'z', 'y', 'xyz', 'z']
    >>> get_unique(words)
    ['x', 'y', 'z', 'xyz']
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard similarity coefficient of two word lists.

    ``words_1`` and ``words_2`` are lists of words, each without internal
    duplicates.  The result is the number of shared words divided by the
    number of distinct words; two empty lists yield 0.0.

    Doctest :
    >>> words_1 = ['x', 'y', 'z', 'xyz']
    >>> words_2 = ['y', 'x', 'w']
    >>> jaccard(words_1,words_2)
    0.4
    """
    # Words of words_1 that also occur in words_2.
    in_other = sum(1 for word in words_1 if word in words_2)
    # Every distinct word of both lists.
    both_list = get_unique(words_1 + words_2)
    if not both_list:
        return 0.0
    return in_other / len(both_list)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` tweets most similar to the query.

    ``norm_tweets`` is a list of word lists ``[[w00, w01, ...], ...]``,
    ``norm_query`` a list of words and ``n`` an integer.  The result holds
    at most ``n`` sub-lists ``[tweet_id, jaccard]`` where ``tweet_id`` is
    the index of the tweet in ``norm_tweets`` and ``jaccard`` its
    coefficient against ``norm_query``.  Only coefficients greater than 0
    are kept; larger coefficients come first and equal coefficients are
    ordered by the smaller ``tweet_id``.
    """
    result_list = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            result_list.append([tweet_id, coef])
    # Stable sort keeps ascending ids among equal coefficients.
    result_list.sort(key=lambda pair: pair[1], reverse=True)
    # Slicing copes with fewer than n matches; indexing n times would not.
    return result_list[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Display a tweet on screen; returns nothing.

    ``tweet_id`` is the integer index of the tweet, ``tweet_content`` its
    text, ``jc_coef`` its Jaccard coefficient (a float) and ``print_width``
    the number of characters that fit on one line.

    Doctest :
    >>> t = 'I promise you that as president, I will always appeal to the best in us.'
    >>> show_tweet(1076, t, 0.222222, 40)
    <BLANKLINE>
    #1076 (0.22)
      I promise you that as president, I
      will always appeal to the best in us.
    """
    print()
    print(f"#{tweet_id} ({round(jc_coef, 2)})")
    print_list = []
    word_to_print = ""
    for word in tweet_content.split(" "):
        # word_to_print ends with a space, so the rendered width after
        # adding ``word`` is indent (2) + len(word_to_print) + len(word).
        if word_to_print and len(word_to_print + word) + 2 > print_width:
            print_list.append(word_to_print)
            word_to_print = ""
        word_to_print += f"{word} "
    if word_to_print:
        print_list.append(word_to_print)
    for line in print_list:
        print(f"  {line[:-1]}")  # drop the trailing separator
#--------------------------------------------
# 6330437021 (20.00) 273 (2021-03-01 16:40)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union is the sum of three disjoint groups: words only in the first
    list, words only in the second, and words in both.  Two empty lists
    yield 0.0.
    """
    only_first = sum(1 for word in words_1 if word not in words_2)
    only_second = sum(1 for word in words_2 if word not in words_1)
    both = sum(1 for word in words_1 if word in words_2)
    total = only_first + only_second + both
    if total == 0:
        return 0.0
    return both / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    ranked = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([coef, -i])  # negated id wins ties when reversed
    ranked.sort(reverse=True)
    return [[-neg_i, coef] for coef, neg_i in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.
    """
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    # ``line`` stores each word with one leading space; printing prepends a
    # second space, so the rendered width is len(line) + 1.
    line = ""
    for word in tweet_content.split(" "):
        # Never flush an empty buffer: an oversized first word would
        # otherwise produce a stray indent-only line.
        if line and len(line) + 1 + len(word) > print_width - 1:
            print(" " + line)
            line = ""
        line += " " + word
    if line:
        print(" " + line)
#--------------------------------------------
# 6330438721 (19.48) 274 (2021-03-01 22:46)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Both lists are de-duplicated first.  Two empty lists yield 0.
    """
    w1 = get_unique(words_1)
    w2 = get_unique(words_2)
    union_size = len(get_unique(w1 + w2))
    if union_size == 0:
        return 0
    shared = sum(1 for e in w1 if e in w2)
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    keyed = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # evaluated once per tweet
        if coef > 0:
            keyed.append([-coef, i])
    keyed.sort()
    return [[i, -neg_coef] for neg_coef, i in keyed[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet text wrapped to ``print_width`` columns.

    Text lines are indented by two spaces; words on a line are joined with
    single spaces.
    """
    print(" ")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    indent = "  "
    current = []  # words collected for the line being built
    for word in tweet_content.split(" "):
        candidate = current + [word]
        if current and len(indent) + len(" ".join(candidate)) > print_width:
            print(indent + " ".join(current))
            current = [word]
        else:
            current = candidate
    print(indent + " ".join(current))
#--------------------------------------------
# 6330439321 (18.44) 275 (2021-02-25 23:42)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique = []
    for w in words:
        if w not in unique:
            unique.append(w)
    return unique


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient: shared words / distinct words.

    Two empty lists yield 0.
    """
    all_words = get_unique(words_1 + words_2)
    if not all_words:
        return 0
    shared = [e for e in all_words if e in words_1 and e in words_2]
    return len(shared) / len(all_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.  A
    non-positive ``n`` gives an empty list.
    """
    if n <= 0:
        return []
    scored = []  # [[jcd, idx], ...]
    for idx, tweet in enumerate(norm_tweets):
        jcd = jaccard(tweet, norm_query)
        if jcd != 0:
            scored.append([jcd, idx])
    # One full sort guarantees the returned list itself is ordered, also
    # when fewer than n tweets match.
    scored.sort(key=lambda item: (-item[0], item[1]))
    return [item[::-1] for item in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Words are split on any whitespace.  Text lines are indented by two
    spaces and carry no trailing space.
    """
    output_lines = []
    sentence = ''
    for word in tweet_content.split():
        # sentence ends with a separator space; the indent takes 2 columns.
        if sentence and len(sentence) + len(word) > print_width - 2:
            output_lines.append(sentence[:-1])
            sentence = ''
        sentence += word + ' '
    if sentence != '':
        output_lines.append(sentence[:-1])
    print('\n#{} ({})'.format(tweet_id, round(jc_coef, 2)))
    for line in output_lines:
        print(' ' * 2 + line)
# --------------------------------------------
# 6330440921 (17.78) 276 (2021-02-27 00:37)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for i, word in enumerate(words):
        # Keep a word only the FIRST time it appears; checking the tail
        # instead would keep the last occurrence and reorder the result.
        if word not in words[:i]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    same = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - same
    if total == 0:
        return 0.0
    return same / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    ranked = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.  Each line takes at least one
    word, so a word wider than the line still gets printed.
    """
    words = tweet_content.split(" ")
    print(end='\n')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    i = 0
    while i < len(words):
        # Taking the first word unconditionally guarantees progress.
        line = [words[i]]
        used = 2 + len(words[i])
        i += 1
        while i < len(words) and used + 1 + len(words[i]) <= print_width:
            line.append(words[i])
            used += 1 + len(words[i])
            i += 1
        print('  ' + " ".join(line))
#--------------------------------------------
# 6330441521 (16.36) 277 (2021-03-01 21:02)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for d in words:
        if d not in unique_words:
            unique_words.append(d)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient: shared words / distinct words.

    Two empty lists yield 0.0.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = [d for d in union if d in words_1 and d in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top_n = []
    for d, tweet in enumerate(norm_tweets):
        x = jaccard(tweet, norm_query)
        if x > 0:
            top_n.append([d, x])
    # A compound key orders ties correctly for any number of equal values,
    # which a fixed number of neighbour-swap passes cannot guarantee.
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]  # slicing tolerates fewer than n matches


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet text wrapped to ``print_width`` columns.

    Words are split on any whitespace.  Text lines start with a two-space
    indent and end with one trailing space.
    """
    indent = '  '
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = indent
    for word in tweet_content.split():
        if line != indent and len(line) + len(word) > print_width:
            print(line)
            line = indent
        line += word + ' '
    if line != indent:  # nothing pending -> no stray indent-only line
        print(line)
#--------------------------------------------
# 6330443821 (18.01) 278 (2021-03-01 10:14)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order.

    An empty input gives an empty list.
    """
    unique_words_lst = []
    for word in words:
        if word not in unique_words_lst:
            unique_words_lst.append(word)
    return unique_words_lst


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    same_words = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - same_words
    if total == 0:
        return 0.0
    return same_words / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        percent = jaccard(norm_query, tweet)
        if percent > 0:
            top_n.append([i, percent])
    # Stable sort: ascending ids are preserved among equal coefficients.
    top_n.sort(key=lambda x: x[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.
    """
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(' ')  # split once instead of per access
    line = ''
    for word in words:
        if not line:
            line = '  ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    if line:
        print(line)
#--------------------------------------------
# 6330444421 (19.95) 279 (2021-03-01 18:02)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    dup = sum(1 for word in words_1 if word in words_2)
    total_length = len(words_1) + len(words_2) - dup
    if total_length == 0:
        return 0.0
    return dup / total_length


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        jac = jaccard(tweet, norm_query)
        if jac == 0:
            continue
        top_n.append([i, jac])
    top_n.sort(key=lambda x: (-x[1], x[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Words are split on any whitespace.  Text lines are indented by two
    spaces and each one, the last included, ends with a newline.
    """
    print("\n#{} ({})".format(tweet_id, round(jc_coef, 2)))
    line = ""
    for word in tweet_content.split():
        # Complete lines are buffered and printed whole, so an oversized
        # word can never be appended to a line that is already in progress.
        if line and len(line) + 1 + len(word) > print_width:
            print(line)
            line = ""
        line = line + " " + word if line else "  " + word
    if line:
        print(line)
#--------------------------------------------
# 6330445021 (19.19) 280 (2021-02-28 21:57)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists yield 0.0.
    """
    all_words = words_1 + words_2
    n_distinct = len(get_unique(all_words))
    if n_distinct == 0:
        return 0.0
    # Subtract on the integers before dividing: len/len - 1 introduces
    # floating-point error (7/5 - 1 is 0.3999..., not 0.4).
    return (len(all_words) - n_distinct) / n_distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jc = jaccard(tweet, norm_query)
        if jc > 0:
            # Negation is exact and reversible, unlike 1 - jc.
            top.append([-jc, tweet_id])
    top.sort()
    return [[tweet_id, -neg_jc] for neg_jc, tweet_id in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Words are split on any whitespace; text lines are indented by two
    spaces.
    """
    indent = '  '
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = indent
    for word in tweet_content.split():
        if line == indent:
            line += word  # first word of a line needs no separator
        elif len(line) + len(word) < print_width:
            line += ' ' + word
        else:
            print(line)
            line = indent + word
    print(line)
#--------------------------------------------
# 6330446721 (20.00) 281 (2021-02-27 17:06)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Shared words = combined length - number of distinct words.  Two empty
    lists yield 0.0.
    """
    merged = words_1 + words_2
    distinct = get_unique(merged)
    if not distinct:
        return 0.0
    return (len(merged) - len(distinct)) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    top_m = []
    # enumerate yields each tweet's own position; list.index would give
    # every duplicate tweet the index of its first occurrence.
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_m.append([-coef, tweet_id])
    top_m.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in top_m[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet text wrapped to ``print_width`` columns.

    Text lines start with a two-space indent and end with one trailing
    space.
    """
    indent = "  "
    print(" ")
    print("#" + str(tweet_id) + " " + "(" + (str(round(jc_coef, 2)) + ")"))
    line = indent
    for word in tweet_content.split(" "):
        if line == indent or len(line) + len(word) <= print_width:
            line += word + " "
        else:
            print(line)
            line = indent + word + " "
    print(line)
#--------------------------------------------
# 6330447321 (14.45) 282 (2021-02-28 17:20)
def get_unique(words):
    """Return the items of ``words`` without duplicates, in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient: shared words / distinct words.

    Neither argument is modified; two empty lists yield 0.0.
    """
    union = list(words_1)  # copy, then add what only words_2 has
    for e in words_2:
        if e not in union:
            union.append(e)
    if not union:
        return 0.0
    intersect = [e for e in words_1 if e in words_2]
    return len(intersect) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with coefficient > 0.

    Ordered by descending coefficient, ties by ascending tweet index.
    """
    ranked = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:  # zero-similarity tweets are not results
            ranked.append([-coef, i])
    ranked.sort()
    return [[i, -neg_coef] for neg_coef, i in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces.  A word wider than the line is
    printed on a line of its own.
    """
    print("\n#" + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    room = print_width - 2  # columns left after the indent
    pri = ""
    for word in tweet_content.split(' '):
        if not pri:
            # First word on a line: no separator, and it is placed even if
            # it is too wide (otherwise it could never be printed).
            pri = word
        elif len(pri) + 1 + len(word) <= room:
            pri += ' ' + word
        else:
            print('  ' + pri)
            pri = word
    if pri:
        print('  ' + pri)
#--------------------------------------------
# 6330448021 (20.00) 283 (2021-03-01 23:46)
# 6330449621 (18.50) 284 (2021-03-01 21:43)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty)."""
    shared = [word for word in words_1 if word in words_2]
    union = words_1 + [word for word in words_2 if word not in words_1]
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id. Asking for more results than exist returns all matches.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    # A plain prefix slice cannot go negative the way top[len(top)-n:] did.
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print('\n#{} ({})'.format(tweet_id, round(jc_coef, 2)))
    line = ' '
    for position, word in enumerate(tweet_content.split(' ')):
        if len(line) + len(word) + 1 <= print_width:
            # The very first word is glued directly to the indent.
            line += (' ' if position != 0 else '') + word
        else:
            print(line)
            line = ' ' + word
    print(line)

#--------------------------------------------
# 6330450121 (20.00) 285 (2021-03-01 09:37)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    seen = list()
    for item in words:
        if item not in seen:
            seen.append(item)
    return seen


def jaccard(words_1, words_2):
    """Return shared / total word count of the two lists, or 0 with no overlap."""
    shared = sum(1 for item in words_1 if item in words_2)
    only_second = sum(1 for item in words_2 if item not in words_1)
    if shared == 0:
        return 0
    return shared / (only_second + len(words_1))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Tweets scoring zero are left out. Scores are negated before sorting so the
    highest comes first and equal scores fall back to ascending tweet id.
    """
    scored = []
    for idx, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef != 0:
            scored.append([-coef, idx])
    scored.sort()
    return [[idx, -negated] for negated, idx in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped text."""
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        if len(line) + 1 + len(word) > print_width - 1:
            print(' ' + line)
            line = ''
        line = line + ' ' + word
    if len(line) != 0:
        print(' ' + line)

#--------------------------------------------
# 6330452421 (18.50) 286 (2021-02-26 16:01)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is left untouched.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id. An empty list is returned when nothing matches.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    # One keyed sort replaces the manual grouping of equal scores, which
    # indexed top[0] and so failed when there was no match at all.
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``.

    Each text line is one space of indent plus at most ``print_width - 2``
    characters of content, unless a single word is longer than that.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    budget = print_width - 2
    row = []
    for word in tweet_content.split(" "):
        if row and len(" ".join(row)) + 1 + len(word) > budget:
            print(" " + " ".join(row).strip())
            row = []
        row.append(word)
    print(" " + " ".join(row).strip())

#--------------------------------------------
# 6330453021 (19.51) 287 (2021-02-27 16:22)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty).

    The inputs are only read; they are no longer sorted in place.
    """
    # Count the members of the longer list that also occur in the other one.
    if len(words_1) > len(words_2):
        longer, other = words_1, words_2
    else:
        longer, other = words_2, words_1
    intersec = sum(1 for word in longer if word in other)
    total = list(words_1)
    for word in words_2:
        if word not in total:
            total.append(word)
    if not total:
        return 0.0
    return intersec / len(total)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    matches = []
    # enumerate() keeps each tweet's own position; list.index() would report
    # the first of several identical tweets.
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print('')
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    # Split on spaces only, so words containing '/' (e.g. URLs) stay intact.
    used = 0
    row = []
    for word in tweet_content.split(' '):
        used += len(word)
        if used <= print_width - 2:
            used += 1
            row.append(word)
        else:
            print(" " + " ".join(row))
            row = [word]
            used = len(word) + 1
    print(" " + " ".join(row))

#--------------------------------------------
# 6330454721 (18.33) 288 (2021-02-28 11:05)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0.0 if both empty).

    The overlap counts every equal pair across the two lists, so the inputs
    are expected to be duplicate-free.
    """
    union = []
    for word in words_1 + words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    shared = sum(1 for left in words_1 for right in words_2 if left == right)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print("")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    budget = print_width - 2
    words = tweet_content.split(' ')
    line = ""
    i = 0
    while i < len(words):
        # Only wrap when the line already holds something: a word wider than
        # the budget must still be placed, or the loop would never advance.
        if line and len(line) + len(words[i]) > budget:
            print(" " + line.strip())
            line = ""
        else:
            line += " " + words[i]
            i += 1
    print(" " + line.strip())

#--------------------------------------------
# 6330455321 (20.00) 289 (2021-02-27 00:40)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty)."""
    shared = [word for word in words_1 if word in words_2]
    union = []
    for word in list(words_1) + list(words_2):
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    matches = []
    # enumerate() keeps each tweet's own position; list.index() would report
    # the first of several identical tweets and list that id more than once.
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    # A single keyed sort replaces the hand-written grouping of equal scores.
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print('')
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    used = 0
    row = []
    for word in tweet_content.split(' '):
        used += len(word)
        if used <= print_width - 2:
            used += 1
            row.append(word)
        else:
            print(" " + " ".join(row))
            row = [word]
            used = len(word) + 1
    print(" " + " ".join(row))

#--------------------------------------------
# 6330458221 (20.00) 290 (2021-02-26 15:27)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_word = []
    for word in words:
        if word not in unique_word:
            unique_word.append(word)
    return unique_word


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = (len(words_1) + len(words_2)) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id. An empty tweet list simply yields an empty result.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    # One sort replaces the repeated max()/index() scans.
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``.

    The width of each gap between words is kept as found in the text.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split()
    rest = tweet_content
    line = ''
    for i, word in enumerate(words):
        if i == len(words) - 1:
            print(' ' + line + word)
            break
        end = rest.index(word) + len(word)
        # Look for the next word only after the current one ends, so a match
        # inside the current word cannot yield a negative gap.
        gap = ' ' * (rest.index(words[i + 1], end) - end)
        line += word + gap
        rest = rest[end:]
        if (print_width - 2) - len(line) < len(words[i + 1]):
            # Drop the trailing gap before emitting the finished line.
            print(' ' + line[:len(line) - len(gap)])
            line = ''

#--------------------------------------------
# 6330459921 (20.00) 291 (2021-02-28 23:16)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    kept = []
    for item in words:
        if item in kept:
            continue
        kept.append(item)
    return kept


def jaccard(words_1, words_2):
    """Return the share of words the two duplicate-free lists have in common."""
    shared = sum(1 for item in words_2 if item in words_1)
    return shared / (len(words_1 + words_2) - shared)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Only positive scores are kept. Scores are negated before sorting so the
    highest comes first and equal scores fall back to ascending tweet id.
    """
    scored = []
    for idx, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([-coef, idx])
    scored.sort()
    return [[idx, -negated] for negated, idx in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped text."""
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ""
    for word in tweet_content.split(" "):
        if len(line) + 1 + len(word) > print_width - 1:
            print(" " + line)
            line = ""
        line = line + " " + word
    if line:
        print(" " + line)

#--------------------------------------------
# 6330460421 (20.00) 292 (2021-02-27 03:14)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty).

    Both inputs are de-duplicated first, so repeated words do not skew the
    result.
    """
    words_1 = get_unique(words_1)
    words_2 = get_unique(words_2)
    union = words_2 + [word for word in words_1 if word not in words_2]
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    ranked = []
    # enumerate() keeps each tweet's own position; list.index() would report
    # the first of several identical tweets.
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print(" ")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = " "
    for word in tweet_content.split(" "):
        line += " " + word
        if len(line) > print_width:
            # Explicit end index: line[:-len(word)] would erase the whole
            # line when the overflowing word is empty.
            print(line[:len(line) - len(word)])
            line = " " + word
    print(line)

#--------------------------------------------
# 6330461021 (20.00) 293 (2021-03-01 03:24)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0.
    """
    intersect = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - intersect
    if total == 0:
        return 0.0
    return intersect / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped. Equal coefficients are ordered
    by ascending tweet id, for any number of tied tweets.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    # A keyed sort orders ties fully; a single adjacent-swap pass does not
    # when three or more tweets share a score.
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    # A prefix slice also handles n == 0, where p[-n:] returned everything.
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    row = []
    used = 2
    for word in tweet_content.split(' '):
        used += len(word)
        if used > print_width:
            print(" " + " ".join(row))
            used = len(word) + 3
            row = [word]
        else:
            row.append(word)
            used += 1
    print(" " + " ".join(row))

#--------------------------------------------
# 6330462721 (20.00) 294 (2021-03-01 23:16)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    unique_words = []
    for item in words:
        if item in unique_words:
            continue
        unique_words += [item]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The overlap is the number of words lost when the concatenation of both
    lists is de-duplicated.
    """
    union_size = len(get_unique(words_1 + words_2))
    overlap = len(words_1) + len(words_2) - union_size
    return overlap / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Zero scores are left out. Scores are negated before sorting so the
    highest comes first and equal scores fall back to ascending tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(norm_query, tweet_words)
        if coef != 0:
            scored.append([-coef, tweet_id])
    scored.sort()
    ordered = [[tweet_id, -negated] for negated, tweet_id in scored]
    return ordered[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped text."""
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(float(jc_coef), 2)) + ")")
    # The extra empty entry at the end forces the final line to be emitted.
    words = tweet_content.split(" ") + [""]
    line = words[0]
    final = len(words) - 2
    for idx in range(len(words) - 1):
        upcoming = words[idx + 1]
        if len(line + " " + upcoming) > print_width - 2:
            print(" " + line)
            line = upcoming
            continue
        line = line + " " + upcoming
        if idx == final:
            print(" " + line)

#--------------------------------------------
# 6330463321 (20.00) 295 (2021-03-01 23:41)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty).

    Repeated words inside either list are counted once.
    """
    shared = []
    union = []
    for word in list(words_1) + list(words_2):
        if word not in union:
            union.append(word)
        if word in words_1 and word in words_2 and word not in shared:
            shared.append(word)
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    budget = print_width - 2
    row = []
    for i, word in enumerate(words):
        fits = len(' '.join(row)) + len(word) + 1 <= budget
        is_last = i == len(words) - 1
        if fits:
            row.append(word)
        if not fits or is_last:
            print(' ' + ' '.join(row))
            # Decide from the overflow itself, not from "word not in row":
            # the same word may already be on the line that was just printed.
            if is_last and not fits:
                print(' ' + word)
            row = [word]

#--------------------------------------------
# 6330464021 (17.86) 296 (2021-02-28 23:16)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty).

    The exact ratio is returned. Rounding is left to the display code, since
    rounding here would merge distinct scores and turn small ones into 0.
    """
    union = []
    for word in list(words_1) + list(words_2):
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    line = ' '
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    for word in tweet_content.split():
        # A line holding only the indent always accepts the next word.
        if len(line) == 1 or len(line) + len(word) < print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    print(line)

#--------------------------------------------
# 6330465621 (18.44) 297 (2021-03-01 16:12)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id. Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    # Tweet ids are already distinct, so a slice is all that is needed; the
    # neighbour comparison read past the end of the list.
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``.

    Each text line holds as many whole words as fit into
    ``print_width - 2`` characters; a longer single word gets its own line.
    """
    print(' ')
    print("#" + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    budget = print_width - 2
    row = []
    for word in tweet_content.split():
        if row and len(' '.join(row + [word])) > budget:
            print(' ' + ' '.join(row))
            row = []
        row.append(word)
    if row:
        print(' ' + ' '.join(row))

#--------------------------------------------
# 6330466221 (19.10) 298 (2021-03-01 23:50)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty).

    Repeated words inside either list are counted once.
    """
    union = []
    for word in words_1 + words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    shared = []
    for word in words_1:
        if word in words_2 and word not in shared:
            shared.append(word)
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id. Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)  # computed once per tweet
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the wrapped tweet text.

    Nothing is printed when the coefficient rounds to 0 at two decimals.
    """
    if not round(jc_coef, 2) > 0:
        return
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        if len(line) + len(word) > print_width:
            print(line)
            line = ' '
        line += word + ' '
    print(line)

#--------------------------------------------
# 6330467921 (20.00) 299 (2021-02-26 21:26)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    A new list is built; the caller's list is neither sorted nor shortened.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty)."""
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = [word for word in union if word in words_1 and word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    row = []
    pos = 0
    while pos < len(words):
        # Length of the word after the one at ``pos`` (0 when there is none).
        following = len(words[pos + 1]) if pos + 1 < len(words) else 0
        # An empty line always takes a word, so the loop advances even for
        # very small widths.
        if not row or len(' '.join(row)) + 2 < print_width:
            row.append(words[pos])
            pos += 1
        joined = ' '.join(row)
        if len(joined) + 3 + following > print_width:
            print(' ' + joined)
            row = []
    # Skip the final print when the last line was already flushed.
    if row:
        print(' ' + ' '.join(row))

#--------------------------------------------
# 6330468521 (17.00) 300 (2021-02-28 02:08)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is left untouched.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The overlap is the number of entries that disappear when both lists are
    merged and de-duplicated. Two empty lists give 0.0.
    """
    merged = list(words_1) + list(words_2)
    distinct = get_unique(merged)
    if not distinct:
        return 0.0
    return (len(merged) - len(distinct)) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id. Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    # Slicing tolerates n larger than the number of matches.
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print(" " * print_width)
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = " "
    for word in tweet_content.split(" "):
        if len(line) + len(word) > print_width:
            print(line)
            line = " "
        line += word + " "
    print(line)

#--------------------------------------------
# 6330469121 (19.38) 301 (2021-03-01 03:09)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    kept = []
    for item in words:
        if item in kept:
            continue
        kept.append(item)
    return kept


def jaccard(words_1, words_2):
    """Return the share of words the two duplicate-free lists have in common."""
    common = [item for item in words_1 if item in words_2]
    return len(common) / (len(words_1) + len(words_2) - len(common))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Only positive scores are kept. Scores are negated before sorting so the
    highest comes first and equal scores fall back to ascending tweet id.
    """
    scored = []
    for idx, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([-coef, idx])
    scored.sort()
    ordered = [[idx, -negated] for negated, idx in scored]
    return ordered[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped text."""
    words = tweet_content.split(" ")
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in words:
        if len(line) + len(word) >= print_width - 1:
            print(' ' + line)
            line = ''
        line = line + ' ' + word
    print(' ' + line)

#--------------------------------------------
# 6330470721 (19.75) 302 (2021-03-01 11:49)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty)."""
    union = []
    for word in words_1 + words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    same = sum(1 for word in words_1 if word in words_2)
    return float(same / len(union))


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    matches = []
    # enumerate() keeps each tweet's own position; list.index() would report
    # the first of several identical tweets.
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print('')
    # The coefficient goes in parentheses, like the other submissions' header.
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split():
        if len(' ') + len(word) + len(line) > print_width:
            print(line)
            line = ' ' + word
        else:
            line = line + ' ' + word
    print(line)

#--------------------------------------------
# 6330471321 (20.00) 303 (2021-02-28 19:47)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    kept = []
    for item in words:
        if item in kept:
            continue
        kept.append(item)
    return kept


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Every equal pair across the lists counts towards the overlap. When the
    overlap equals the combined length (e.g. two empty lists) 1 is returned.
    """
    equal = sum(1 for left in words_1 for right in words_2 if left == right)
    combined = len(words_1) + len(words_2)
    if combined == equal:
        return 1
    return equal / (combined - equal)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Only positive scores are kept; the highest score comes first and equal
    scores stay in ascending tweet-id order.
    """
    matches = []
    for idx, tweet_words in enumerate(norm_tweets):
        val = jaccard(norm_query, tweet_words)
        if val > 0:
            matches.append([idx, val])
    # The sort is stable, so tweets with equal scores keep ascending ids.
    matches.sort(key=lambda pair: -pair[1])
    return matches[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the ``#id (coef)`` header and the wrapped tweet text."""
    print("\n #" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ')')
    line = " "
    for word in tweet_content.split(' '):
        if len(' ') + len(word) + len(line) <= print_width:
            line = line + ' ' + word
            continue
        print(line)
        line = " " + word
    print(line)

#--------------------------------------------
# 6330472021 (18.78) 304 (2021-03-01 23:11)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1 + words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``.

    Text lines carry a two-space indent and whole words only. No line is
    truncated; a word wider than the budget is printed on a line of its own.
    """
    print('')
    # A space separates the id from the coefficient, as in the other
    # submissions' header line.
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    budget = print_width - 2
    row = []
    for word in tweet_content.split():
        if row and len(' '.join(row)) + 1 + len(word) > budget:
            print('  ' + ' '.join(row))
            row = []
        row.append(word)
    if row:
        print('  ' + ' '.join(row))

#--------------------------------------------
# 6330473621 (15.67) 305 (2021-02-27 23:03)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty).

    The exact ratio is returned; rounding is left to the display code so that
    ranking is not distorted.
    """
    union = []
    for word in words_1 + words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id. Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    # One keyed sort replaces the repeated adjacent-swap passes; the slice
    # tolerates an empty list and n larger than the number of matches.
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    used = 0
    line = ' '
    # split(' ') always yields at least one entry, so a line is always shown.
    for word in tweet_content.split(' '):
        used += len(word)
        if used > print_width - 2:
            print(line)
            line = ' '
            used = len(word)
        used += 1
        line += word + ' '
    print(line)

#--------------------------------------------
# 6330474221 (17.60) 306 (2021-03-01 23:58)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty).

    Repeated words inside either list are counted once.
    """
    union = []
    for word in words_1 + words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    shared = []
    for word in words_1:
        if word in words_2 and word not in shared:
            shared.append(word)
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped and no tweet is listed twice;
    ties are ordered by ascending tweet id.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the wrapped tweet text.

    Nothing is printed when the coefficient rounds to 0 at two decimals.
    """
    if not round(jc_coef, 2) > 0:
        return
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        if len(word) + len(line) > print_width:
            print(line)
            line = ' '
        line += word + ' '
    print(line)

#--------------------------------------------
# 6330475921 (18.01) 307 (2021-03-01 18:14)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words += [word]
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both empty)."""
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        # A tweet sharing no word with the query is not a match.
        if coef > 0:
            ranked.append([-coef, tweet_id])
    ranked.sort()
    return [[tweet_id, -negated] for negated, tweet_id in ranked[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        line += ' ' + word
        if print_width < len(line):
            # Take the overflowing word (and its separator) back off.
            print(line[:len(line) - len(word) - 1])
            line = ' ' + word
    print(line)

#--------------------------------------------
# 6330476521 (18.01) 308 (2021-03-01 17:30)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with a zero coefficient are skipped; ties are ordered by ascending
    tweet id.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        # A tweet sharing no word with the query is not a match.
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a header line and the tweet text wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        if len(word) + len(line) + 2 > print_width:
            print(' ' + line)
            line = ''
        line += word + ' '
    if len(line) > 0:
        print(' ' + line)

#--------------------------------------------
# 6330477121 (20.00) 309 (2021-03-01 01:07)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    kept = []
    for item in words:
        if item not in kept:
            kept.append(item)
    return kept


def jaccard(words_1, words_2):
    """Return the share of words the two duplicate-free lists have in common."""
    shared = sum(1 for item in words_1 if item in words_2)
    total = (len(words_1) + len(words_2)) - shared
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Only positive scores are kept. Scores are negated before sorting so the
    highest comes first and equal scores fall back to ascending tweet id.
    """
    scored = []
    for idx, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([-coef, idx])
    scored.sort()
    return [[idx, -negated] for negated, idx in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the ``#id (coef)`` header and the wrapped tweet text."""
    print(" ")
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        if len(line) + 1 + len(word) > print_width - 1:
            print(' ' + line)
            line = ''
        line = line + ' ' + word
    if line:
        print(' ' + line)

#--------------------------------------------
# 6330478821 (20.00) 310 (2021-02-28 23:07)
def get_unique(words):
    """Collect each item of ``words`` once, keeping first-seen order."""
    kept = []
    for item in words:
        if item in kept:
            continue
        kept.append(item)
    return kept


def jaccard(words_1, words_2):
    """Return shared words of ``words_1`` over the distinct words of both."""
    union = get_unique(words_1 + words_2)
    common = [item for item in words_1 if item in words_2]
    return len(common) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return the ``n`` best ``[tweet_id, coefficient]`` pairs.

    Only positive scores are kept. Scores are negated before sorting so the
    highest comes first and equal scores fall back to ascending tweet id.
    """
    scored = []
    for idx, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([-coef, idx])
    scored.sort()
    return [[idx, -negated] for negated, idx in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (coef)`` header and the wrapped text."""
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        line += ' ' + word
        if len(line) <= print_width:
            continue
        # Print everything before the word that caused the overflow.
        print(line[0:len(line) - len(word)])
        line = ' ' + word
    print(line)

#--------------------------------------------
# 6330481621 (17.95) 311 (2021-03-01 23:40)
def get_unique(words):
    """Return a new list holding each word of ``words`` once, in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The intersection size is derived from the sizes,
    len(words_1) + len(words_2) - len(union), which is only exact when
    neither input repeats a word. Returns 0.0 when both lists are empty.
    """
    union_size = len(get_unique(words_1 + words_2))
    if union_size == 0:
        return 0.0
    shared = len(words_1) + len(words_2) - union_size
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets that share nothing with the query (coefficient 0) are dropped,
    matching the other submissions in this file. Ties are ordered by
    ascending tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([-coef, tweet_id])
    scored.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header and its text wrapped to ``print_width``.

    Every text line starts with a two-space indent. A word that is wider
    than the available space is printed alone on its own line.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = ' ' * 2
    line = None
    # Split on single spaces (not arbitrary whitespace) so the tweet's own
    # spacing is kept, as the sibling submissions do.
    for word in tweet_content.split(' '):
        if line is None:
            line = indent + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = indent + word
    if line is not None:
        print(line)
#--------------------------------------------
# 6330482221 (19.95) 312 (2021-02-26 22:32)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Both inputs are de-duplicated first, so repeated words do not distort
    the result. Returns 0.0 when there are no words at all.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    union = get_unique(first + second)
    if not union:
        return 0.0
    shared = sum(1 for word in first if word in second)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs.

    Zero-coefficient tweets are skipped. Ordering is by descending
    coefficient, then ascending tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        neg_coef = -jaccard(tweet_words, norm_query)
        if neg_coef != 0:
            negated.append([neg_coef, tweet_id])
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in sorted(negated)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped into two-space-indented lines."""
    print(" ")
    print(f'#{tweet_id} ({round(jc_coef,2)})')
    line = " "
    # Split on single spaces so the tweet's original spacing survives,
    # as the sibling submissions do.
    for word in tweet_content.split(' '):
        widened = line + " " + word
        if len(widened) > print_width:
            print(line)
            line = "  " + word
        else:
            line = widened
    print(line)
#--------------------------------------------
# 6330483921 (17.60) 313 (2021-03-01 23:46)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The two lists are merged and sorted; every adjacent equal pair is a word
    present in both lists. Returns 0.0 when both lists are empty.
    """
    merged = sorted(words_1 + words_2)
    shared = sum(1 for left, right in zip(merged, merged[1:]) if left == right)
    distinct = len(merged) - shared
    if distinct == 0:
        return 0.0
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are reported. Ties go to the smaller tweet
    id. Fewer than ``n`` pairs are returned when fewer tweets match, and an
    empty list when none do.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    # An explicit two-part key replaces the former trick of nudging each
    # score by tweet_id * 1e-10, which altered the reported coefficients.
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header followed by the tweet wrapped to ``print_width``.

    The header is always printed, so the text never appears without it.
    Text lines are indented by two spaces.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) <= print_width:
            line = candidate
        else:
            print(line)
            line = '  ' + word
    print(line)
#--------------------------------------------
# 6330485121 (20.00) 314 (2021-02-28 22:28)
def get_unique(words):
    """Return the distinct entries of ``words`` in sorted order.

    Works on a sorted copy, so the caller's list is left untouched.
    """
    unique_words = []
    for word in sorted(words):
        # In sorted order duplicates are adjacent, so comparing with the
        # last kept word is enough.
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts entries of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words in both lists. Returns 0.0
    when both lists are empty.
    """
    distinct = get_unique(words_1 + words_2)
    if not distinct:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only tweets with a positive coefficient are included. Ties are ordered
    by ascending tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            # The id is negated so a descending sort still yields
            # ascending ids among equal coefficients.
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        previous = line
        line = line + ' ' + word
        if len(line) > print_width:
            print(previous)
            line = '  ' + word
    print(line)
#--------------------------------------------
# 6330486821 (20.00) 315 (2021-02-28 22:16)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Uses inclusion-exclusion on the distinct counts: the intersection size
    is |A| + |B| - |A union B|. Returns 0.0 when both lists are empty.
    """
    union_size = len(get_unique(words_1 + words_2))
    if union_size == 0:
        return 0.0
    size_sum = len(get_unique(words_1)) + len(get_unique(words_2))
    return (size_sum - union_size) / union_size
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are omitted; ties go to the smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        jc_coef = jaccard(tweet_words, norm_query)
        if jc_coef > 0:
            negated.append([-jc_coef, tweet_id])
    negated.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in negated[:n]]
#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces, which count toward the width.
    """
    print(f'\n#{tweet_id} ({round(jc_coef,2)})')
    text = ' '
    for word in tweet_content.split(' '):
        before = text
        text += ' ' + word
        if len(text) > print_width:
            print(before)
            text = '  ' + word
    print(text)
#--------------------------------------------
# 6330487421 (20.00) 316 (2021-03-01 22:49)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    seen = []
    for each in words:
        if each not in seen:
            seen.append(each)
    return seen


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is len(words_1) + len(words_2) - shared, so inputs are
    expected not to repeat words. Returns 0 when both lists are empty.
    """
    shared = 0
    for each in words_1:
        if each in words_2:
            shared += 1
    total = len(words_1) + len(words_2) - shared
    return shared / total if total else 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties go to the smaller tweet id.
    """
    ranked = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            ranked.append([tweet_id, coef])
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``."""
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    pending = ''
    for each in tweet_content.split(' '):
        # The printed line is ' ' + pending, hence the -1 on the width.
        if len(pending) + 1 + len(each) <= print_width - 1:
            pending += ' ' + each
        else:
            print(' ' + pending)
            pending = ' ' + each
    if pending:
        print(' ' + pending)
#--------------------------------------------
# 6330488021 (18.01) 317 (2021-02-26 00:09)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(w1, w2):
    """Return the Jaccard coefficient of the word lists ``w1`` and ``w2``.

    The numerator counts entries of ``w1`` present in ``w2``; the
    denominator is the number of distinct words in both lists. Returns 0.0
    when both lists are empty.
    """
    combined = get_unique(w1 + w2)
    if not combined:
        return 0.0
    shared = [word for word in w1 if word in w2]
    return len(shared) / len(combined)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Tweets with coefficient 0 are excluded, in line with the other
    submissions in this file. Ties go to the smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        jac = jaccard(tweet_words, norm_query)
        if jac > 0:
            negated.append([-jac, tweet_id])
    negated.sort()
    return [[tweet_id, -neg_jac] for neg_jac, tweet_id in negated[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``."""
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    text = ' '
    for word in tweet_content.split(' '):
        longer = text + ' ' + word
        if len(longer) > print_width:
            print(text)
            text = '  ' + word
        else:
            text = longer
    print(text)
#--------------------------------------------
# 6330489721 (18.50) 318 (2021-03-01 15:59)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts entries of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words overall. Returns 0.0 when
    both lists are empty.
    """
    # Counting distinct words with get_unique avoids the earlier
    # wrap-around comparison, which produced a zero denominator when every
    # word was the same (e.g. ['a'] against ['a']).
    distinct = get_unique(words_1 + words_2)
    if not distinct:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; ties go to the smaller tweet id.
    Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    negated = sorted(
        [-jaccard(tweet_words, norm_query), tweet_id]
        for tweet_id, tweet_words in enumerate(norm_tweets)
    )
    matches = [[tweet_id, -neg] for neg, tweet_id in negated if -neg > 0]
    # Slicing (rather than indexing 0..n-1) tolerates short result lists.
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped to ``print_width``.

    Text lines start with a two-space indent. A word too wide for the line
    is printed alone on its own line.
    """
    print(' ')
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = None
    for word in tweet_content.split(" "):
        if line is None:
            line = "  " + word
        elif len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    if line is not None:
        print(line)
#--------------------------------------------
# 6330491921 (19.51) 319 (2021-03-01 22:26)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts entries of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words overall. Returns 0 when
    both lists are empty.
    """
    unique = get_unique(words_1 + words_2)
    if len(unique) == 0:
        return 0
    interception = sum(1 for word in words_1 if word in words_2)
    return interception / len(unique)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Every zero-coefficient tweet is dropped, not only when the best score
    is zero. An empty ``norm_tweets`` yields an empty list. Ties keep
    ascending tweet-id order.
    """
    matches = []
    for index, tweet_words in enumerate(norm_tweets):
        result = jaccard(tweet_words, norm_query)
        if result > 0:
            matches.append([index, result])
    # sorted() is stable, so equal coefficients stay in ascending id order.
    matches = sorted(matches, key=lambda pair: pair[1], reverse=True)
    return matches[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines carry a two-space indent that counts toward the width.
    """
    print("")
    print("#{} ({})".format(tweet_id, round(jc_coef, 2)))
    current = []
    used = 2  # width already consumed by the indent
    for word in tweet_content.split(" "):
        needed = len(word) + (1 if current else 0)
        if current and used + needed > print_width:
            print("  " + " ".join(current))
            current = []
            used = 2
            needed = len(word)
        current.append(word)
        used += needed
    print("  " + " ".join(current))
#--------------------------------------------
# 6330492521 (18.01) 320 (2021-03-01 21:04)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Both the intersection and the union are de-duplicated, so repeated
    words in either input do not affect the result. Returns 0.0 when both
    lists are empty.
    """
    uni = get_unique(words_1 + words_2)
    if not uni:
        return 0.0
    # One pass over words_1 finds every shared word; a second pass over
    # words_2 could not add anything new.
    inter = get_unique([word for word in words_1 if word in words_2])
    return len(inter) / len(uni)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are excluded, in line with the other
    submissions in this file. Ties keep ascending tweet-id order.
    """
    top_n = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    # list.sort is stable, so equal coefficients stay in id order.
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``."""
    print('\n#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    st = ' '
    for word in tweet_content.split(' '):
        if len(st) + 1 + len(word) <= print_width:
            st = st + ' ' + word
        else:
            print(st)
            st = '  ' + word
    print(st)
#--------------------------------------------
# 6330494821 (18.01) 321 (2021-02-27 23:17)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is len(words_1) + len(words_2) - shared, so inputs are
    expected not to repeat words. Returns 0.0 when both lists are empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are excluded and ties go to the smaller tweet
    id. An empty ``norm_tweets`` yields an empty list.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([coef, tweet_id])
    # A single two-part key replaces the manual grouping of equal scores,
    # which could append a nested list and failed on an empty tweet list.
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return [[tweet_id, coef] for coef, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        if len(line + ' ' + str(word)) <= print_width:
            line += ' ' + str(word)
        else:
            print(line)
            line = '  ' + str(word)
    print(line)
#--------------------------------------------
# 6330495421 (17.95) 322 (2021-03-01 20:27)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is len(words_1) + len(words_2) - shared, so inputs are
    expected not to repeat words. Returns 0.0 when both lists are empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    distinct = len(words_1) + len(words_2) - shared
    if distinct == 0:
        return 0.0
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are excluded, in line with the other
    submissions in this file. Ties go to the smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            negated.append([-coef, tweet_id])
    return [[tweet_id, -neg] for neg, tweet_id in sorted(negated)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, then the header and wrapped tweet as one block.

    Text lines are indented by two spaces. Each word is followed by one
    space, so emitted lines end with a trailing space.
    """
    lines = ["#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")"]
    line = "  "
    # Split on single spaces so the tweet's own spacing is preserved, as
    # the sibling submissions do.
    for word in tweet_content.split(' '):
        # `line` already ends with a separator, so this sum is the width of
        # the line once `word` is appended.
        if len(line) + len(word) > print_width:
            lines.append(line)
            line = "  "
        line += word + " "
    lines.append(line)
    print("")
    print("\n".join(lines))
#--------------------------------------------
# 6330496021 (14.98) 323 (2021-03-01 23:59)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is len(words_1) + len(words_2) - shared, so inputs are
    expected not to repeat words. Returns 0.0 when both lists are empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    distinct = len(words_1) + len(words_2) - shared
    if distinct == 0:
        return 0.0
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    The result honours ``n`` (it used to be capped at four entries).
    Zero-coefficient tweets are dropped before truncation and ties go to
    the smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            negated.append([-coef, tweet_id])
    return [[tweet_id, -neg] for neg, tweet_id in sorted(negated)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped to ``print_width``.

    The wrap limit is ``print_width`` (it used to be fixed at 20). Text
    lines are indented by two spaces. No blank line is printed before the
    header.
    """
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width:
            print(line)
            line = '  ' + word
        else:
            line = candidate
    print(line)
#--------------------------------------------
# 6330497721 (20.00) 324 (2021-02-25 20:15)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts entries of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words overall. Returns 0.0 when
    both lists are empty.
    """
    denominator = len(get_unique(words_1 + words_2))
    if denominator == 0:
        return 0.0
    numerator = sum(1 for word in words_1 if word in words_2)
    return numerator / denominator


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties go to the smaller tweet id.
    """
    info = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            info.append([tweet_id, coef])
    ordered = sorted([-coef, tweet_id] for tweet_id, coef in info)[:n]
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in ordered]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces. The loop walks the words exactly
    once, so it terminates for any ``print_width``.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    output = ' '
    for word in tweet_content.split(' '):
        candidate = output + ' ' + word
        if len(candidate) > print_width:
            print(output)
            output = '  ' + word
        else:
            output = candidate
    print(output)
#--------------------------------------------
# 6330498321 (20.00) 325 (2021-02-28 22:17)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    seen = []
    for word in words:
        if word not in seen:
            seen.append(word)
    return seen


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts entries of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words overall. Returns 0.0 when
    both lists are empty.
    """
    bot = len(get_unique(words_1 + words_2))
    if bot == 0:
        return 0.0
    top = sum(1 for word in words_1 if word in words_2)
    return top / bot


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties go to the smaller tweet id.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        # Compute the coefficient once and reuse it for the test and the
        # stored pair.
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header, the wrapped tweet and a trailing blank line.

    Text lines are indented by two spaces. Each word is followed by one
    space, so emitted lines end with a trailing space.
    """
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = '  '
    for word in tweet_content.split(' '):
        if len(line + word) > print_width:
            print(line)
            line = '  '
        line += word + ' '
    print(line + '\n')
#--------------------------------------------
# 6330499021 (19.45) 326 (2021-03-01 11:12)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts entries of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words overall. Returns 0.0 when
    both lists are empty.
    """
    distinct = get_unique(words_2 + words_1)
    if not distinct:
        return 0.0
    shared = [word for word in words_1 if word in words_2]
    return len(shared) / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    The same rule applies for every ``n``, including
    ``n == len(norm_tweets)``, which previously raised UnboundLocalError.
    Zero-coefficient tweets are dropped and ties keep ascending tweet-id
    order.
    """
    norm_holder = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        jaccard_co = jaccard(tweet_words, norm_query)
        if jaccard_co > 0:
            norm_holder.append([jaccard_co, tweet_id])
    # sorted() is stable, so equal coefficients stay in ascending id order.
    norm_sort = sorted(norm_holder, key=lambda pair: pair[0], reverse=True)
    return [[tweet_id, coef] for coef, tweet_id in norm_sort[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines start with a two-space indent. A word too wide for the line
    is printed alone on its own line.
    """
    print('\n#{} ({})'.format(tweet_id, round(jc_coef, 2)))
    lines = []
    line = None
    # Split on single spaces so the tweet's own spacing is preserved, as
    # the sibling submissions do.
    for word in tweet_content.split(' '):
        if line is None:
            line = '  ' + word
        elif len(line) + 1 + len(word) > print_width:
            lines.append(line)
            line = '  ' + word
        else:
            line += ' ' + word
    if line is not None:
        lines.append(line)
    print('\n'.join(lines))
#--------------------------------------------
# 6330500921 (20.00) 327 (2021-03-01 23:10)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is len(words_1) + len(words_2) - shared, so inputs are
    expected not to repeat words. Returns 0.0 when both lists are empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + (len(words_2) - shared)
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped while collecting, so nothing has to
    be removed from the list afterwards. Ties go to the smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        jacc = jaccard(tweet_words, norm_query)
        if jacc != 0:
            negated.append([-jacc, tweet_id])
    negated.sort()
    return [[tweet_id, -neg] for neg, tweet_id in negated[0:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The last line is flushed once after the loop. Detecting the last word
    by comparing values misfired when an earlier word equalled the final
    one.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        if len(line) + len(word) <= print_width - 1:
            line = line + ' ' + word
        else:
            print(line)
            line = '  ' + word
    print(line)
#--------------------------------------------
# 6330501521 (20.00) 328 (2021-02-27 10:44)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union is ``words_1`` plus the entries of ``words_2`` not already in
    it; the intersection size follows from the three lengths. Returns 0.0
    when both lists are empty.
    """
    union = list(words_1) + [word for word in words_2 if word not in words_1]
    if not union:
        return 0.0
    shared = len(words_1) + len(words_2) - len(union)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties go to the smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        neg_coef = -jaccard(tweet_words, norm_query)
        if neg_coef != 0:
            negated.append([neg_coef, tweet_id])
    negated.sort()
    return [[tweet_id, -neg_coef] for neg_coef, tweet_id in negated[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces; wrapped lines carry no trailing
    separator.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ' '
    for word in tweet_content.split(' '):
        candidate = line + ' ' + word
        if len(candidate) > print_width:
            print(line)
            line = '  ' + word
        else:
            line = candidate
    print(line)
#--------------------------------------------
# 6330502121 (16.12) 329 (2021-03-01 00:21)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order.

    A new list is built; the caller's list is not modified.
    """
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Each entry of ``words_1`` is counted at most once, so repeated words in
    ``words_2`` cannot inflate the numerator. Returns 0.0 when both lists
    are empty.
    """
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are excluded, in line with the other
    submissions in this file. Ties go to the smaller tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            # Negated id: the descending sort then yields ascending ids
            # among equal coefficients.
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines start with a two-space indent. A word wider than the line is
    printed alone on its own line, so the loop always advances.
    """
    print('\n#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is None:
            line = '  ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    if line is not None:
        print(line)
#--------------------------------------------
# 6330503821 (3.94) 330 (2021-03-01 18:36)
def get_unique(words):
    """Return the distinct entries of ``words`` as a set."""
    return set(words)


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word collections.

    Accepts any iterables of hashable words (lists as well as sets); both
    are converted to sets first. Returns 0.0 when both are empty.
    """
    first = set(words_1)
    second = set(words_2)
    union_words = first | second
    if not union_words:
        return 0.0
    return len(first & second) / len(union_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties go to the smaller tweet id.
    """
    top_n = []
    for index, tweet in enumerate(norm_tweets):
        jaccard_coef = jaccard(tweet, norm_query)
        if jaccard_coef > 0:
            top_n.append([index, jaccard_coef])
    # Negating the score gives "descending score, ascending id" without
    # relying on a large magic constant.
    top_n.sort(key=lambda item: (-item[1], item[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines start with a two-space indent. A word wider than the line is
    printed alone on its own line, so the loop always advances.
    """
    print(f"\n#{tweet_id} ({round(jc_coef,2)})")
    line = None
    # Split on single spaces so the tweet's own spacing is preserved, as
    # the sibling submissions do.
    for word in tweet_content.split(' '):
        if line is None:
            line = "  " + word
        elif len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = "  " + word
    if line is not None:
        print(line)
# --------------------------------------------
# 6330504421 (20.00) 331 (2021-03-01 01:21)
def get_unique(words):
    """Return the distinct entries of ``words`` in sorted order.

    Sorting is done on a copy, so the caller's list keeps its order.
    """
    unique_words = []
    previous = None
    for position, word in enumerate(sorted(words)):
        # After sorting, duplicates sit next to each other.
        if position == 0 or word != previous:
            unique_words.append(word)
        previous = word
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union is the longer list plus the entries of the shorter one that
    are missing from it. Returns 0 when both lists are empty.
    """
    if len(words_1) > len(words_2):
        longer, shorter = words_1, words_2
    else:
        longer, shorter = words_2, words_1
    shared = 0
    union = list(longer)
    for word in shorter:
        if word in longer:
            shared += 1
        else:
            union.append(word)
    if len(union) == 0:
        return 0
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties keep ascending tweet-id order.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([coef, tweet_id])
    # sorted() is stable, so equal coefficients stay in ascending id order.
    scored = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return [[tweet_id, coef] for coef, tweet_id in scored[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    show = ' '
    for word in tweet_content.split(' '):
        pre = show + ' ' + word
        if len(pre) <= print_width:
            show = pre
        else:
            print(show)
            show = '  ' + word
    print(show)
#--------------------------------------------
# 6330505021 (15.75) 332 (2021-03-01 19:26)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is len(words_1) + len(words_2) - shared, so inputs are
    expected not to repeat words. Returns 0.0 when both lists are empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    distinct = len(words_1) + len(words_2) - shared
    if distinct == 0:
        return 0.0
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only strictly positive coefficients are kept, so the result is simply
    empty when nothing matches. Ties go to the smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            negated.append([-coef, tweet_id])
    negated.sort()
    return [[tweet_id, -neg] for neg, tweet_id in negated[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces, which count toward the width.
    The final line is flushed once after the loop instead of by comparing
    each word with the last one.
    """
    print()
    print("#" + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    budget = print_width - 2  # room left after the indent
    current = []
    used = 0
    for word in tweet_content.split(" "):
        needed = len(word) + (1 if current else 0)
        if current and used + needed > budget:
            print("  " + ' '.join(current))
            current = []
            used = 0
            needed = len(word)
        current.append(word)
        used += needed
    print("  " + ' '.join(current))
#--------------------------------------------
# 6330507321 (19.95) 333 (2021-02-28 16:27)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is len(words_1) + len(words_2) - shared, so inputs are
    expected not to repeat words. Returns 0.0 when both lists are empty.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties go to the smaller tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        # Evaluate the coefficient once per tweet.
        coef = jaccard(norm_query, tweet_words)
        if coef > 0:
            scored.append([coef, -tweet_id])
    scored = sorted(scored, reverse=True)[0:n]
    return [[-neg_id, coef] for coef, neg_id in scored]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces, which count toward the width.
    Empty content prints an indent-only line instead of raising.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    current = []
    used = 2  # width already consumed by the indent
    # Split on single spaces so the tweet's own spacing is preserved, as
    # the sibling submissions do.
    for word in tweet_content.split(' '):
        needed = len(word) + (1 if current else 0)
        if current and used + needed > print_width:
            print('  ' + ' '.join(current))
            current = []
            used = 2
            needed = len(word)
        current.append(word)
        used += needed
    print('  ' + ' '.join(current))
#--------------------------------------------
# 6330508021 (17.70) 334 (2021-03-01 09:52)
def get_unique(words):
    """Return the distinct entries of ``words`` in sorted order.

    An empty input yields an empty list.
    """
    unique_words = []
    for word in sorted(words):
        # After sorting, duplicates are adjacent.
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    ``words_1`` may be ``None``, which is treated as an empty list. Returns
    0.0 when there are no words at all.
    """
    if words_1 is None:
        words_1 = []
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    same = sum(1 for word in words_1 if word in words_2)
    return same / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Coefficients are ranked and returned at full precision; rounding is
    left to the display code. Zero-coefficient tweets are skipped and ties
    go to the smaller tweet id.
    """
    scored = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            scored.append([coef, -tweet_id])
    best = sorted(scored, reverse=True)[:n]
    return [[-neg_id, coef] for coef, neg_id in best]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``."""
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    sentence = ' '
    for word in tweet_content.split(' '):
        longer = sentence + ' ' + word
        if len(longer) <= print_width:
            sentence = longer
        else:
            print(sentence)
            sentence = '  ' + word
    print(sentence)
#--------------------------------------------
# 6330509621 (18.82) 335 (2021-03-01 02:26)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    Both inputs are de-duplicated first. Returns 0.0 when there are no
    words at all.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    union = get_unique(first + second)
    if not union:
        return 0.0
    common = [word for word in first if word in second]
    return len(common) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties go to the smaller tweet id.
    """
    matches = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            matches.append([-coef, tweet_id])
    return [[tweet_id, -neg] for neg, tweet_id in sorted(matches)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped to ``print_width``.

    Text lines are indented by two spaces. The separator is counted only
    between words, so a line that fills the width exactly is not wrapped
    early.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    used = 2  # width already consumed by the indent
    current = []
    for word in tweet_content.split(' '):
        needed = len(word) + (1 if current else 0)
        if current and used + needed > print_width:
            print('  ' + ' '.join(current))
            current = []
            used = 2
            needed = len(word)
        current.append(word)
        used += needed
    print('  ' + ' '.join(current))
#--------------------------------------------
# 6330510121 (20.00) 336 (2021-02-27 10:18)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Entries of ``words_2`` are split into those shared with ``words_1`` and
    those that are new; the union is ``words_1`` plus the new ones. Returns
    0.0 when both lists are empty.
    """
    shared = 0
    only_second = 0
    for word in words_2:
        if word in words_1:
            shared += 1
        else:
            only_second += 1
    union_size = only_second + len(words_1)
    if union_size == 0:
        return 0.0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Zero-coefficient tweets are skipped; ties keep ascending tweet-id order.
    """
    top_n = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            top_n.append([tweet_id, coef])
    # Stable sort on the negated score: best first, ids ascending on ties.
    top_n = sorted(top_n, key=lambda pair: -pair[1])
    return top_n[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The last line is flushed once after the loop. Detecting the last word
    by comparing values misfired when an earlier word equalled the final
    one.
    """
    print('\n#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ') ')
    line = ' '
    for word in tweet_content.split(' '):
        if len(line) + len(word) + 1 <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = '  ' + word
    print(line)
#--------------------------------------------
# 6330511821 (18.44) 337 (2021-02-27 23:46)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The shorter list is scanned for entries present in the other; the
    denominator is the number of distinct words overall. Returns 0.0 when
    both lists are empty.
    """
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    if len(words_1) < len(words_2):
        shorter, longer = words_1, words_2
    else:
        shorter, longer = words_2, words_1
    sum_of_same = sum(1 for word in shorter if word in longer)
    return sum_of_same / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; ties go to the smaller tweet id.
    Fewer than ``n`` pairs are returned when fewer tweets match.
    """
    negated = sorted(
        [-jaccard(tweet_words, norm_query), tweet_id]
        for tweet_id, tweet_words in enumerate(norm_tweets)
    )
    the_list = [[tweet_id, -neg] for neg, tweet_id in negated if -neg > 0]
    # Slicing (rather than indexing 0..n-1) tolerates short result lists.
    return the_list[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet wrapped to ``print_width``.

    Text lines start with a two-space indent. A word too wide for the line
    is printed alone on its own line.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = ' ' * 2
    line = None
    # Split on single spaces so the tweet's own spacing is preserved, as
    # the sibling submissions do.
    for word in tweet_content.split(' '):
        if line is None:
            line = indent + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = indent + word
    if line is not None:
        print(line)
#--------------------------------------------
# 6330512421 (16.00) 338 (2021-03-01 22:37)
def get_unique(words):
    """Return the distinct entries of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts entries of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words overall. Returns 0.0 when
    both lists are empty.
    """
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best first.

    Coefficients come from ``jaccard`` rather than a second inline copy of
    the formula. Zero-coefficient tweets are left out and ties go to the
    smaller tweet id.
    """
    negated = []
    for tweet_id, tweet_words in enumerate(norm_tweets):
        coef = jaccard(tweet_words, norm_query)
        if coef > 0:
            negated.append([-coef, tweet_id])
    negated.sort()
    return [[tweet_id, -neg] for neg, tweet_id in negated[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the header and the tweet wrapped to ``print_width``.

    The final line is flushed once after the loop instead of by comparing
    each word with the last one.
    """
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = ' '
    for word in tweet_content.split(' '):
        if len(word) + len(line) + 1 > print_width:
            print(line)
            line = ' '
        line = line + ' ' + word
    print(line)
#--------------------------------------------
# 6330513021 (18.01) 339 (2021-02-26 23:00)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |all distinct words| (0.0 when both are empty)."""
    twords = get_unique(words_1 + words_2)
    if not twords:
        return 0.0
    swords = [word for word in twords if word in words_1 and word in words_2]
    return len(swords) / len(twords)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, highest score first.

    Tweets with a zero score are left out; ties are ordered by ascending id.
    """
    list_all = []
    for i, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:  # unrelated tweets must not pad the result
            list_all.append([-score, i])
    list_all.sort()
    return [[tweet_id, -negated] for negated, tweet_id in list_all[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (score)`` header and the wrapped tweet.

    The lines are computed first and printed afterwards, each one indented by
    two spaces and at most ``print_width`` wide (over-long words excepted).
    """
    usable = print_width - 2  # room left after the indent
    all_role = []
    role = None  # None: nothing placed on the current line yet
    for w in tweet_content.split(' '):
        if role is None:
            role = w
        elif len(role) + 1 + len(w) <= usable:
            role += ' ' + w
        else:
            all_role.append(role)
            role = w
    if role is not None:
        all_role.append(role)
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    for role in all_role:
        print(' ' * 2 + role)
#--------------------------------------------
# 6330514721 (20.00) 340 (2021-03-01 00:18)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words += [word]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient using inclusion-exclusion.

    Both inputs are de-duplicated first so that |A| + |B| - |A u B| really is
    the intersection size; an empty union yields 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    x = len(get_unique(first + second))
    if x == 0:
        return 0.0
    y = len(first) + len(second) - x
    return y / x


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Sorting ``[score, -id]`` in reverse gives descending score and, for ties,
    ascending id.
    """
    w = []
    for tweet_id in range(len(norm_tweets)):
        z = jaccard(norm_tweets[tweet_id], norm_query)
        if z > 0:
            w += [[z, -tweet_id]]
    m = sorted(w, reverse=True)
    return [[-negated_id, score] for score, negated_id in m[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Lines are indented by two spaces and kept within ``print_width``; a word
    that cannot fit is printed alone instead of being preceded by an empty line.
    """
    print("")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    s = tweet_content.split(" ")
    c = "  " + s[0]
    for word in s[1:]:
        d = c
        c = c + " " + word
        if len(c) > print_width:
            print(d)
            c = "  " + word
    print(c)
#--------------------------------------------
# 6330515321 (18.33) 341 (2021-03-01 22:51)
def get_unique(words):
    """Return the distinct words ordered by length, then alphabetically."""
    words_ = []
    for i in words:
        if i not in words_:
            words_.append(i)
    return sorted(words_, key=lambda word: (len(word), word))


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of the two word lists.

    Works on de-duplicated copies, so repeated words inside one list are not
    mistaken for shared words. An empty union yields 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    sum_1 = get_unique(first + second)       # union
    if not sum_1:
        return 0.0
    sum_2 = [i for i in first if i in second]  # intersection
    return len(sum_2) / len(sum_1)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Ordered by descending score, then ascending tweet id.
    """
    jaccard_ = []
    for tweet_id in range(len(norm_tweets)):
        jaccard_.append([jaccard(norm_tweets[tweet_id], norm_query), -tweet_id])
    jaccard_.sort(reverse=True)
    top_n = []
    for jac, negated_id in jaccard_[:n]:
        if jac > 0:
            top_n.append([-negated_id, jac])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Every word is consumed exactly once, so a word longer than
    ``print_width`` is printed on its own line instead of stalling the loop.
    """
    print()
    print("#" + str(tweet_id) + " " "(" + str(round(jc_coef, 2)) + ")")
    words_list = tweet_content.split(" ")
    s = "  " + words_list[0]
    for word in words_list[1:]:
        if len(s) + 1 + len(word) <= print_width:
            s += " " + word
        else:
            print(s)
            s = "  " + word
    if len(s) > 2:
        print(s)
# 6330516021 (16.94) 342 (2021-02-26 23:00)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared distinct words| / |all distinct words|.

    Returns 0.0 for two empty lists; the original left the result unbound in
    that case and raised UnboundLocalError.
    """
    everything = get_unique(words_1 + words_2)
    if not everything:
        return 0.0
    S = [e for e in everything if e in words_1 and e in words_2]
    return len(S) / len(everything)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Sorted by descending score; equal scores are ordered by ascending id.
    """
    top = []
    for i in range(len(norm_tweets)):
        jaccards = jaccard(norm_tweets[i], norm_query)
        if jaccards > 0:
            top.append([jaccards, -i])
    top.sort(reverse=True)
    # top[:n] tolerates fewer than n matches; indexing range(n) raised IndexError.
    return [[-negated_id, jaccards] for jaccards, negated_id in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Text is split on whitespace; lines get a two-space indent and stay within
    ``print_width`` unless a single word is longer.
    """
    Atext = tweet_content.split()
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    Line = []
    for word in Atext:
        if Line and len(' '.join(Line)) + 1 + len(word) > print_width - 2:
            print('  ' + ' '.join(Line))
            Line = []
        Line.append(word)
    if Line:
        print('  ' + ' '.join(Line))
#--------------------------------------------
# 6330517621 (19.68) 343 (2021-03-01 01:18)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    i = 0
    while i < len(words):
        if words[i] not in unique_words:
            unique_words.append(words[i])
        i += 1
    return unique_words
#--------------------------------------------------------
def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words|, or 0.0 when both are empty."""
    y = get_unique(words_1 + words_2)
    if len(y) == 0:
        return 0.0
    x = [e for e in get_unique(words_1) if e in words_2]
    return len(x) / len(y)
#--------------------------------------------------------
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Sorted by descending score, ties by ascending tweet id.
    """
    y = []
    for tweet_id in range(len(norm_tweets)):
        y.append([-float(jaccard(norm_tweets[tweet_id], norm_query)), tweet_id])
    y.sort()
    # Build a new list: removing items from a list while iterating over it
    # skips the element after each removal, so zero scores used to survive.
    top_n = []
    for negated, tweet_id in y[:n]:
        if negated != 0.0:
            top_n.append([tweet_id, -negated])
    return top_n
#--------------------------------------------------------
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (score)`` header and the wrapped tweet.

    ``total_length`` tracks the width of the words gathered for the current
    line so that indent plus text never exceeds ``print_width``.
    """
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    sp = '  '
    c = []
    total_length = 0  # always equals len(' '.join(c))
    for word in tweet_content.split(' '):
        needed = total_length + 1 + len(word) if c else len(word)
        if c and needed > print_width - 2:
            print(sp + ' '.join(c))
            c = [word]
            total_length = len(word)
        else:
            c.append(word)
            total_length = needed
    print(sp + ' '.join(c))
#--------------------------------------------
# 6330518221 (20.00) 344 (2021-02-27 23:25)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Works on a sorted copy; the caller's list is no longer reordered.
    """
    unique_words = list()
    for m in sorted(words):
        if m not in unique_words:
            unique_words.append(m)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 when both are empty)."""
    k = get_unique(words_1 + words_2)
    if not k:
        return 0.0
    n = sum(1 for q in k if q in words_1 and q in words_2)
    return n / len(k)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Highest score first; equal scores are listed by ascending tweet id.
    """
    p_n = [[jaccard(tweet, norm_query), -w] for w, tweet in enumerate(norm_tweets)]
    k = sorted(p_n, reverse=True)[:n]
    top_n = list()
    for score, negated_id in k:
        if score > 0:
            top_n.append([-negated_id, score])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    All lines, including one holding a single over-long word, are indented by
    two spaces.
    """
    print()
    s = round(jc_coef, 2)
    print('#' + str(tweet_id), '(' + str(s) + ')')
    d = print_width - 2
    k = tweet_content.split(' ')
    f = k[0]
    for e in k[1:]:
        if len(f) + 1 + len(e) <= d:
            f += ' ' + e
        else:
            print('  ' + f)
            f = e
    print('  ' + f)
#--------------------------------------------
# 6330519921 (16.94) 345 (2021-03-01 19:07)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient via inclusion-exclusion.

    Each input is de-duplicated first, so the surplus of the concatenation
    over the union counts only words present in both lists. Two empty lists
    give 0.0.
    """
    left = get_unique(words_1)
    right = get_unique(words_2)
    y = len(get_unique(left + right))
    if y == 0:
        return 0.0
    z = len(left) + len(right) - y
    return z / y


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Ordered by descending score and, for ties, ascending tweet id.
    """
    t_x = []
    for i in range(len(norm_tweets)):
        jaccards = jaccard(norm_tweets[i], norm_query)
        if jaccards > 0:
            t_x.append([i, jaccards])
    t_x.sort(key=lambda pair: (-pair[1], pair[0]))
    # Slicing never raises, unlike t_x[i] for i in range(n) with few matches.
    return t_x[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (score)`` header and the wrapped tweet.

    Whitespace-separated words are packed into two-space-indented lines no
    wider than ``print_width``.
    """
    T = tweet_content.split()
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    Line = ''
    for position, word in enumerate(T):
        if position == 0:
            Line = '  ' + word
        elif len(Line) + 1 + len(word) <= print_width:
            Line += ' ' + word
        else:
            print(Line)
            Line = '  ' + word
    if T:
        print(Line)
#--------------------------------------------
# 6330520421 (9.33) 346 (2021-03-01 23:58)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = [i for j, i in enumerate(words) if i not in words[:j]]
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient computed with set operations.

    Returns 0.0 when both inputs are empty instead of dividing by zero.
    """
    a = set(words_1) & set(words_2)
    b = set(words_1) | set(words_2)
    if not b:
        return 0.0
    return len(a) / len(b)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    The original indexed ``norm_tweets`` with a tweet (a list) and rebuilt the
    result on every iteration; scores are now collected once and sorted by
    descending score, then ascending id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        a = jaccard(tweet, norm_query)
        if a > 0:
            scored.append([-a, tweet_id])
    return [[tweet_id, -negated] for negated, tweet_id in sorted(scored)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Lines are indented by two spaces and limited to ``print_width`` columns.
    The leading blank line follows the other submissions in this file.
    """
    s = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    sp = '  ' + s[0]
    for i in s[1:]:
        if len(sp) + 1 + len(i) > print_width:
            print(sp)
            sp = '  ' + i
        else:
            sp += ' ' + i
    print(sp)
#--------------------------------------------
# 6330521021 (16.12) 347 (2021-02-27 14:42)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The intersection size is derived as |A| + |B| - |A u B| on de-duplicated
    inputs; an empty union yields 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    twords = get_unique(first + second)
    if not twords:
        return 0.0
    swords = len(first) + len(second) - len(twords)
    return swords / len(twords)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Sorted by descending score and ascending id. Zero scores are filtered
    while collecting; the original membership test on ``top_n[:n][1]`` never
    matched and raised IndexError for a single tweet.
    """
    top_n = []
    for tid in range(len(norm_tweets)):
        jcard = jaccard(norm_tweets[tid], norm_query)
        if jcard > 0:
            top_n.append([-jcard, tid])
    top_n.sort()
    return [[tid, -negated] for negated, tid in top_n[:n]]


def show_tweet(tweet_id, wordss, jc_coef, length):
    """Print a blank line, the ``#id (score)`` header and ``wordss`` wrapped.

    ``length`` is the maximum line width including the two-space indent.
    """
    jc_coef = round(jc_coef, 2)
    print(end='\n')
    print('#' + str(tweet_id) + ' (' + str(jc_coef) + ')')
    string = []    # finished lines, without indent
    sentence = []  # words of the line being built
    for word in wordss.split(" "):
        if sentence and 2 + len(' '.join(sentence + [word])) > length:
            string.append(' '.join(sentence))
            sentence = []
        sentence.append(word)
    string.append(' '.join(sentence))
    for line in string:
        print('  ' + line)
#--------------------------------------------
# 6330522721 (18.01) 348 (2021-03-01 18:26)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |A n B| / (|A| + |B| - |A n B|) on de-duplicated inputs.

    Returns 0.0 when both lists are empty.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    n1_i_n2 = 0
    for e in first:
        if e in second:
            n1_i_n2 += 1
    total = len(first) + len(second) - n1_i_n2
    if total == 0:
        return 0.0
    return n1_i_n2 / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, best match first.

    Tweets sharing no word with the query are excluded; ties keep ascending id.
    """
    top_n = []
    for tweet_id in range(len(norm_tweets)):
        score = jaccard(norm_query, norm_tweets[tweet_id])
        if score > 0:
            top_n.append([-score, tweet_id])
    return [[top[1], -top[0]] for top in sorted(top_n)][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Each line starts with two spaces and is at most ``print_width`` wide
    unless one word alone is longer.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    tweet_content = tweet_content.split(' ')
    t = '  ' + tweet_content[0]
    for word in tweet_content[1:]:
        if len(t + ' ' + word) > print_width:
            print(t)
            t = '  ' + word
        else:
            t += ' ' + word
    print(t)  # single flush replaces the four-way last-index case analysis
#--------------------------------------------
# 6330523321 (17.72) 349 (2021-02-28 17:36)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| without touching the inputs.

    The original appended the words of ``words_2`` to ``words_1``, which
    silently grew the caller's tweet on every comparison. Two empty lists
    yield 0.0.
    """
    union = get_unique(words_1 + words_2)  # concatenation is a new list
    if not union:
        return 0.0
    words1 = [i for i in get_unique(words_1) if i in words_2]
    return len(words1) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    The sort is stable, so tweets with equal scores stay in ascending id order.
    """
    temp_top_n = []
    for tweet_id, item in enumerate(norm_tweets):
        temp_jaccard = jaccard(item, norm_query)
        if temp_jaccard > 0:
            temp_top_n.append([tweet_id, temp_jaccard])

    def take_jaccard(elem):
        return elem[1]

    return sorted(temp_top_n, key=take_jaccard, reverse=True)[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (score)`` header and the wrapped tweet.

    Lines are collected first; each is a two-space indent followed by as many
    words as fit into ``print_width``.
    """
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    split_word = tweet_content.split(' ')
    all_lines = ['  ' + split_word[0]]
    for word in split_word[1:]:
        cur_lines = all_lines[-1]
        if len(cur_lines) + 1 + len(word) <= print_width:
            all_lines[-1] = cur_lines + ' ' + word
        else:
            all_lines.append('  ' + word)
    for curlines in all_lines:
        print(curlines)
#--------------------------------------------
# 6330524021 (20.00) 350 (2021-02-26 16:09)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient by counting adjacent equal words.

    The two lists are de-duplicated, merged and sorted; each adjacent equal
    pair is then a word present in both. An empty merge yields 0.0.
    """
    words_0 = get_unique(words_1) + get_unique(words_2)
    words_0.sort()
    c = 0
    for i in range(1, len(words_0)):
        if words_0[i] == words_0[i - 1]:
            c += 1
    distinct = len(words_0) - c
    if distinct == 0:
        return 0.0
    return c / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    ``[score, -id]`` sorted in reverse gives best score first and ascending id
    among equal scores.
    """
    top_n = []
    for i in range(len(norm_tweets)):
        score = jaccard(norm_tweets[i], norm_query)
        if score > 0:
            top_n.append([score, -i])
    return [[-e[1], e[0]] for e in sorted(top_n, reverse=True)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    ``print(' ', text)`` supplies the two-space indent (one literal space plus
    print's separator).
    """
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split(' ')
    c = []
    for i in words:
        c.append(i)
        # Only break when something precedes the word; otherwise an over-long
        # first word would produce an empty line.
        if len(c) > 1 and len(' '.join(c)) + 2 > print_width:
            print(' ', ' '.join(c[:-1]))
            c = [i]
    print(' ', ' '.join(c))
#--------------------------------------------
# 6330525621 (18.01) 351 (2021-02-28 19:00)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 for two empty lists)."""
    unq_words = get_unique(words_1)
    jac = 0
    for e in get_unique(words_2):
        if e in words_1:
            jac += 1
        else:
            unq_words.append(e)
    if not unq_words:
        return 0.0
    return jac / len(unq_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Delegates scoring to ``jaccard`` and orders by descending score, then
    ascending tweet id.
    """
    top = []
    for i in range(len(norm_tweets)):
        jaccard_co = jaccard(norm_tweets[i], norm_query)
        if jaccard_co > 0:
            top.append([-jaccard_co, i])
    top.sort()
    return [[tweet_id, -negated] for negated, tweet_id in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (score)`` header and the wrapped tweet.

    The final line is flushed once after the loop; the original compared word
    values with the last word and could flush early.
    """
    tweet_content = tweet_content.split(' ')
    tweet = []
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(float(jc_coef), 2)) + ')')
    for word in tweet_content:
        tweet.append(word)
        if len(tweet) > 1 and 2 + len(' '.join(tweet)) > print_width:
            print('  ' + ' '.join(tweet[:-1]))
            tweet = [word]
    print('  ' + ' '.join(tweet))
#--------------------------------------------
# 6330526221 (17.75) 352 (2021-02-27 15:37)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    A sorted copy is scanned for value changes; the argument is left as is.
    """
    ordered = sorted(words)
    x = []
    for position, e in enumerate(ordered):
        if position == 0 or e != ordered[position - 1]:
            x.append(e)
    return x


def jaccard(words_1, words_2):
    """Return |A n B| / |A u B| on de-duplicated inputs (0.0 if both empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    a = 0
    for i in first:
        if i in second:
            a += 1
    b = len(first) + len(second) - a
    if b == 0:
        return 0.0
    return a / b


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Sorted by descending score. Ties are ordered by ascending id; a plain
    reverse sort on ``[score, id]`` listed the larger id first.
    """
    x = []
    for i in range(len(norm_tweets)):
        coefficient = jaccard(norm_tweets[i], norm_query)
        if coefficient > 0:
            x.append([i, coefficient])
    x.sort(key=lambda pair: (-pair[1], pair[0]))
    return x[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Lines are indented by two spaces, fit in ``print_width`` and carry no
    trailing blank.
    """
    a = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    c = a[0]
    for word in a[1:]:
        if len(c) + 1 + len(word) <= print_width - 2:
            c += ' ' + word
        else:
            print('  ' + c)
            c = word
    print('  ' + c)
#--------------------------------------------
# 6330527921 (19.95) 353 (2021-03-01 20:00)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Builds a new list; the original sorted and pruned the caller's list in
    place.
    """
    unique_words = []
    for word in sorted(words):
        if not unique_words or unique_words[-1] != word:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient via |A| + |B| - |A u B|.

    Uses the return value of ``get_unique`` rather than an in-place side
    effect; an empty union yields 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    mix = get_unique(first + second)
    if not mix:
        return 0.0
    return (len(first) + len(second) - len(mix)) / len(mix)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Descending score, ties by ascending tweet id.
    """
    top = sorted(
        [[jaccard(norm_tweets[tweet_id], norm_query), -tweet_id]
         for tweet_id in range(len(norm_tweets))],
        reverse=True,
    )
    return [[-negated_id, score] for score, negated_id in top[:n] if score > 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    ``m`` marks the first word of the current line; a line is emitted as soon
    as adding the next word would exceed ``print_width``.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    k = tweet_content.split()
    m = 0
    for i in range(len(k)):
        if i > m and len('  ' + ' '.join(k[m:i + 1])) > print_width:
            print('  ' + ' '.join(k[m:i]))
            m = i
    if k[m:]:
        print('  ' + ' '.join(k[m:]))
#-------------------------------------------
# 6330528521 (20.00) 354 (2021-03-01 00:19)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 for two empty lists)."""
    words_12 = get_unique(words_1 + words_2)                  # union
    words1_2 = [i for i in get_unique(words_1) if i in words_2]  # intersection
    if not words_12:
        return 0.0
    return len(words1_2) / len(words_12)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Each score is computed once; ordering is descending score, ascending id.
    """
    list_ = []
    for i in range(len(norm_tweets)):
        score = jaccard(norm_tweets[i], norm_query)
        if score > 0:
            list_.append([-score, i])
    list_.sort()
    return [[tweet_id, -negated] for negated, tweet_id in list_[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Words are used verbatim. The original recovered the word from
    ``str(list)[2:-2]``, which alters words containing quotes or backslashes,
    and sliced with ``[:-len(e)]``, which drops the line when ``e`` is empty.
    """
    earth = tweet_content.split(' ')
    print('\n' + '#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    c = '  ' + earth[0]
    for e in earth[1:]:
        if len(c) + 1 + len(e) > print_width:
            print(c)
            c = '  ' + e
        else:
            c += ' ' + e
    print(c)
#--------------------------------------------
# 6330529121 (18.01) 355 (2021-03-01 16:03)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order.

    Operates on a sorted copy so the caller's list keeps its order.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered)):
        if i == 0 or ordered[i] != ordered[i - 1]:
            unique_words.append(ordered[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words|.

    Two empty lists give 0.0; the original indexed element 0 of the empty
    merge and raised IndexError.
    """
    suan = get_unique(words_1 + words_2)
    if not suan:
        return 0.0
    sade = [ch for ch in get_unique(words_1) if ch in words_2]
    return len(sade) / len(suan)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Ordered by descending score, ties by ascending id; zero scores are
    excluded.
    """
    top_n = []
    for i in range(len(norm_tweets)):
        jaccard_coef = jaccard(norm_tweets[i], norm_query)
        if jaccard_coef > 0:
            top_n.append([jaccard_coef, -i])
    top_n.sort(reverse=True)
    return [[-negated_id, score] for score, negated_id in top_n[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    ``sentence`` holds the current output line including its two-space indent.
    """
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    tweet_content = tweet_content.split(' ')
    sentence = '  ' + tweet_content[0]
    for ch in tweet_content[1:]:
        if len(sentence) + len(ch) + 1 <= print_width:
            sentence = sentence + ' ' + ch
        else:
            print(sentence)
            sentence = '  ' + ch
    print(sentence)
#--------------------------------------------
# 6330530721 (20.00) 356 (2021-02-28 23:04)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 for two empty lists)."""
    union_words = []
    for i in words_1 + words_2:
        if i not in union_words:
            union_words.append(i)
    if not union_words:
        return 0.0
    intersec_words = [x for x in union_words if x in words_1 and x in words_2]
    return len(intersec_words) / len(union_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Each tweet is scored once. The stable reverse sort on the score keeps
    ascending id order among ties.
    """
    a = []
    for i in range(len(norm_tweets)):
        score = jaccard(norm_tweets[i], norm_query)
        if score > 0:
            a.append([i, score])
    a.sort(key=lambda x: x[1], reverse=True)
    return a[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    Words are kept verbatim (no round trip through ``str(list)``), and lines
    are emitted without a trailing blank.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    pieces = tweet_content.split(' ')
    gathered = []
    for piece in pieces:
        if gathered and len('  ' + ' '.join(gathered + [piece])) > print_width:
            print('  ' + ' '.join(gathered))
            gathered = []
        gathered.append(piece)
    print('  ' + ' '.join(gathered))
#--------------------------------------------
# 6330531321 (19.95) 357 (2021-03-01 02:02)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words
#------------------------------------------------------------------------#
def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 for two empty lists)."""
    hewkaew = get_unique(words_1 + words_2)
    if not hewkaew:
        return 0.0
    yahoo1 = 0
    for e in get_unique(words_1):
        if e in words_2:
            yahoo1 += 1
    return yahoo1 / len(hewkaew)
#------------------------------------------------------------------------#
def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    The list is sorted by id and then stably by descending score, so ties
    stay in ascending id order.
    """
    lizze = []
    for i in range(len(norm_tweets)):
        jaccardy = jaccard(norm_tweets[i], norm_query)
        if jaccardy > 0:
            lizze.append([i, jaccardy])

    def somchai(spj):
        return spj[0]

    def som(chai):
        return chai[1]

    lizze.sort(key=somchai)
    lizze.sort(reverse=True, key=som)  # plain True replaces the chain of "not"
    return lizze[:n]
#------------------------------------------------------------------------#
def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    The leading blank line matches the other submissions in this file. A
    first word that exactly fills the line is accepted and no empty line is
    produced for an over-long first word.
    """
    print("")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    tweety = tweet_content.split()
    tweeter = []
    ig = []
    for e in tweety:
        if tweeter and len(e) + len("0".join(tweeter)) >= print_width - 2:
            ig.append(tweeter)
            tweeter = []
        tweeter.append(e)
    if tweeter != []:
        ig.append(tweeter)
    for e in ig:
        print(" ", " ".join(e))
#--------------------------------------------
# 6330532021 (18.50) 358 (2021-02-28 22:38)
def get_unique(words):
    """Return the distinct items of ``words``, keeping each last occurrence.

    The original compared the first sorted item with the last one (index -1),
    so a single-item or all-equal list came back empty, and it removed items
    from the caller's list. This version builds a new list.
    """
    unique_words = []
    for position, word in enumerate(words):
        if word not in words[position + 1:]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |A n B| / (|A| + |B| - |A n B|) on de-duplicated inputs.

    Returns 0 when nothing is shared, including two empty lists.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    n1 = 0
    for i in first:
        if i in second:
            n1 += 1
    if n1 == 0:
        return 0
    return n1 / (len(first) + len(second) - n1)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Descending score, ties by ascending id. Zero scores are filtered before
    slicing, which also removes the ``top_n[n-i-1]`` IndexError that occurred
    when fewer than ``n`` tweets exist.
    """
    x = []
    for i in range(len(norm_tweets)):
        score = jaccard(norm_tweets[i], norm_query)
        if score != 0:
            x.append([-score, i])
    y = sorted(x)
    return [[tweet_id, -negated] for negated, tweet_id in y[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    ``n`` counts the characters already used on the current line, indent
    included.
    """
    print()
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    x = tweet_content.split(' ')
    st = []
    n = 2
    for word in x:
        grow = len(word) + (1 if st else 0)
        if st and n + grow > print_width:
            print('  ' + ' '.join(st))
            st = [word]
            n = 2 + len(word)
        else:
            st.append(word)
            n += grow
    print('  ' + ' '.join(st))
#--------------------------------------------
# 6330533621 (20.00) 359 (2021-02-26 23:02)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |A n B| / (|A| + |B| - |A n B|) on de-duplicated inputs.

    Returns 0.0 when both lists are empty instead of dividing by zero.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    words_same = [i for i in first if i in second]
    total = len(first) + len(second) - len(words_same)
    if total == 0:
        return 0.0
    return len(words_same) / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    One ``jaccard`` call per tweet; the stable sort preserves ascending id
    among equal scores.
    """
    list_ = []
    for i, tweet in enumerate(norm_tweets):
        score = jaccard(tweet, norm_query)
        if score > 0:
            list_.append([i, score])
    list_.sort(key=lambda x: x[1], reverse=True)
    return list_[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    The output lines are assembled in a list and printed at the end; words
    are never passed through ``str(list)``, so quotes and backslashes survive.
    """
    print('\n' + '#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    pieces = tweet_content.split(' ')
    lines = ['  ' + pieces[0]]
    for piece in pieces[1:]:
        if len(lines[-1]) + 1 + len(piece) > print_width:
            lines.append('  ' + piece)
        else:
            lines[-1] += ' ' + piece
    for t in lines:
        print(t)
#--------------------------------------------
# 6330534221 (18.01) 360 (2021-03-01 16:04)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return |A n B| / (|A| + |B| - |A n B|) on de-duplicated inputs.

    Two empty lists give 0.0.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    n = 0
    for e in first:
        if e in second:
            n += 1
    total = len(first) + len(second) - n
    if total == 0:
        return 0.0
    return n / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs, best score first.

    Tweets scoring zero are skipped; equal scores are ordered by ascending id.
    """
    top = []
    for i in range(len(norm_tweets)):
        jac = jaccard(norm_tweets[i], norm_query)
        if jac > 0:
            top.append([-jac, i])
    top.sort()
    return [[e[1], -e[0]] for e in top[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    A word too long for the line is printed once, on a line of its own.
    """
    print()
    print('#' + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    word = tweet_content.split(' ')
    case = []
    for e in word:
        case.append(e)
        a = len("  " + " ".join(case))
        if a > print_width and len(case) > 1:
            print("  " + " ".join(case[:-1]))
            case = [e]
    print("  " + " ".join(case))
#--------------------------------------------
# 6330535921 (20.00) 361 (2021-02-27 04:56)
def get_unique(words):
    """Return the distinct words ordered by length, then alphabetically."""
    x = sorted(words, key=lambda word: (len(word), word))
    y = []
    for word in x:
        if not y or word != y[-1]:
            y.append(word)
    return y


def jaccard(words_1, words_2):
    """Return |A n B| / (|A| + |B| - |A n B|) on de-duplicated inputs.

    A membership test replaces the original pairwise double loop, and two
    empty lists give 0.0 instead of ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    same = sum(1 for word in first if word in second)
    total = (len(first) - same) + len(second)
    if total == 0:
        return 0.0
    return same / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Candidates are generated in id order and sorted stably by descending
    score, so ties remain in ascending id order.
    """
    ids = [[i, jaccard(norm_tweets[i], norm_query)] for i in range(len(norm_tweets))]
    topn = [pair for pair in ids if pair[1] > 0]
    topn = sorted(topn, key=lambda x: x[1], reverse=True)
    return topn[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    ``s`` is the line under construction, starting with the two-space indent.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    lis = tweet_content.split(' ')
    s = '  ' + lis[0]
    for i in lis[1:]:
        if len(s) + 1 + len(i) > print_width:
            print(s)
            s = '  ' + i
        else:
            s += ' ' + i
    print(s)
#--------------------------------------------
# 6330536521 (20.00) 362 (2021-03-01 08:05)
def get_unique(words):
    """Return the distinct items of ``words``, keeping each last occurrence.

    Same result as the original delete-and-rescan loop, without rebuilding
    the list on every removal.
    """
    unique_words = []
    for i, word in enumerate(words):
        if word not in words[i + 1:]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 for two empty lists)."""
    wo1 = get_unique(words_1)
    wo2 = get_unique(words_2)
    total = len(get_unique(wo1 + wo2))
    if total == 0:
        return 0.0
    n = 0
    for e in wo1:
        if e in wo2:
            n += 1
    return n / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Each tweet is scored once; ordering is descending score, ascending id.
    """
    top_n = []
    for i in range(len(norm_tweets)):
        score = jaccard(norm_tweets[i], norm_query)
        if score > 0:
            top_n += [[-score, i]]
    top_n = [[k[1], -k[0]] for k in sorted(top_n)]
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, the ``#id (score)`` header and the wrapped tweet.

    ``n`` is the total length of the words in ``con``; together with one
    separator per gap and the two-space indent it gives the line width.
    """
    tweet_content = tweet_content.split(" ")
    print(" ")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    con = []
    n = 0
    for word in tweet_content:
        # width if word is added: indent + letters + one blank per gap
        if con and 2 + n + len(word) + len(con) > print_width:
            print("  " + " ".join(con))
            con = []
            n = 0
        con += [word]
        n += len(word)
    print("  " + " ".join(con))
#--------------------------------------------
# 6330537121 (20.00) 363 (2021-02-28 20:49)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 for two empty lists)."""
    union_word = get_unique(words_1 + words_2)
    x = len(union_word)
    if x == 0:
        return 0.0
    intersect_word = [i for i in union_word if i in words_1 and i in words_2]
    y = len(intersect_word)
    return y / x


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Scores are computed once per tweet and sorted stably in descending order,
    so equal scores keep ascending id order.
    """
    list_ = []
    for e in range(len(norm_tweets)):
        score = jaccard(norm_tweets[e], norm_query)
        if score > 0:
            list_.append([e, score])
    list_.sort(key=lambda x: x[1], reverse=True)
    return list_[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the ``#id (score)`` header and the wrapped tweet.

    As in the original, the header is preceded by two line breaks (an empty
    ``print`` plus a leading newline in the header string). Words are used
    verbatim rather than recovered from ``str(list)``.
    """
    con = tweet_content.split(' ')
    print('')
    print('\n' + '#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    p = ''
    started = False  # becomes True once the first word has been placed
    for word in con:
        if not started:
            p = '  ' + word
            started = True
        elif len(p) + 1 + len(word) > print_width:
            print(p)
            p = '  ' + word
        else:
            p += ' ' + word
    print(p)
#--------------------------------------------
# 6330538821 (17.97) 364 (2021-03-01 12:10)
def get_unique(words):
    """Return the distinct items of ``words`` in sorted order."""
    words = sorted(words)
    unique_words = []
    for i in range(len(words)):
        if i == 0 or words[i] != words[i - 1]:
            unique_words += [words[i]]
    return unique_words


def jaccard(words_1, words_2):
    """Return |shared words| / |distinct words| (0.0 for two empty lists)."""
    ww = get_unique(words_1 + words_2)
    down = len(ww)
    if down == 0:
        return 0.0
    up = len([word for word in ww if word in words_1 and word in words_2])
    return up / down


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, jaccard]`` pairs with positive score.

    Sorted by descending score, then ascending tweet id.
    """
    b = []
    for i in range(len(norm_tweets)):
        b.append([-jaccard(norm_tweets[i], norm_query), i])
    top_n = []
    for negated, tweet_id in sorted(b)[:n]:
        if negated < 0:
            top_n.append([tweet_id, -negated])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, the ``#id (score)`` header and the wrapped tweet.

    ``cc`` is the length of the text gathered in ``con``. The last line is
    printed after the loop, not when a word merely equals the final word.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    c = tweet_content.split(" ")
    cc = 0
    con = []
    for i in c:
        new_cc = cc + len(i) + 1 if con else len(i)
        if con and new_cc > print_width - 2:
            print('  ' + " ".join(con))
            con = [i]
            cc = len(i)
        else:
            con += [i]
            cc = new_cc
    print('  ' + " ".join(con))
#--------------------------------------------
# 6330539421 (20.00) 365 (2021-03-01 04:01)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists (0 when both are empty)."""
    # De-duplicate each side first: the counting trick below is only valid
    # when neither list repeats a word.
    a = get_unique(words_1) + get_unique(words_2)
    b = len(get_unique(a))  # size of the union
    if b == 0:
        return 0
    # Every shared word appears twice in ``a``, so the surplus over the
    # union size is the intersection size.
    return (len(a) - b) / b


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    b = []
    for i, tweet in enumerate(norm_tweets):
        a = jaccard(tweet, norm_query)
        if a > 0:
            b.append([-a, i])
    # Negated coefficient: ascending sort = best first, ties by ascending id.
    return [[i, -neg] for neg, i in sorted(b)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns.

    Runs of spaces are treated as a single separator.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    line = indent
    for word in tweet_content.split(' '):
        if not word:
            continue  # produced by consecutive spaces
        if line == indent:
            line += word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = indent + word
    print(line)
#--------------------------------------------
# 6330540021 (18.33) 366 (2021-02-28 20:00)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique = []
    for word in words:
        if word not in unique:
            unique.append(word)
    return unique


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    words_1, words_2 = get_unique(words_1), get_unique(words_2)
    both = [word for word in words_1 if check_exist(word, words_2)]
    either = words_1 + [word for word in words_2 if not check_exist(word, words_1)]
    if not either:
        return 0.0
    return len(both) / len(either)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    top_n = []
    for i, norm_tweet in enumerate(norm_tweets):
        jc_coef = jaccard(norm_tweet, norm_query)
        if jc_coef > 0:
            top_n.append([i, jc_coef])
    top_n.sort(key=lambda tweet: (-tweet[1], tweet[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print("\n#" + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    # print(" ", text) emits two leading columns, hence the -2.
    real_print_width = max(print_width - 2, 1)
    while tweet_content:
        # "<=": text that exactly fills the line must not be indexed past its end.
        if len(tweet_content) <= real_print_width:
            print(" ", tweet_content)
            break
        if tweet_content[real_print_width] == ' ':
            cut = real_print_width
        else:
            cut = tweet_content[:real_print_width].rfind(' ') + 1
            if cut == 0:
                # No space inside the window: the word is longer than the
                # line. Emit it whole so the loop always makes progress.
                cut = tweet_content.find(' ', real_print_width)
                if cut == -1:
                    cut = len(tweet_content)
        print(" ", tweet_content[:cut])
        tweet_content = tweet_content[cut:].lstrip()
#--------------------------------------------
def check_exist(word, l):
    """Return True when ``word`` occurs in ``l``."""
    return word in l
# 6330541621 (18.13) 367 (2021-03-01 19:57)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    y = get_unique(words_1 + words_2)  # union
    if not y:
        return 0.0
    x = [j for j in y if j in words_1 and j in words_2]  # intersection
    return len(x) / len(y)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    top = []
    for z, a in enumerate(norm_tweets):
        Nrm = [z, jaccard(a, norm_query)]
        if Nrm[1] > 0:
            top.append(Nrm)
    top.sort(key=lambda Text: (-Text[1], Text[0]))
    return top[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    usable = print_width - 2  # two columns are taken by the indent
    used = 0  # columns consumed on the current line, trailing space included
    print('  ', end='')
    for i in tweet_content.split():
        if used and used + len(i) > usable:
            print('')
            print('  ', end='')
            used = 0
        print(i, end=' ')
        # Count the word just printed so the next line's budget is right.
        used += len(i) + 1
    print('')
#--------------------------------------------
# 6330542221 (20.00) 368 (2021-03-01 18:39)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order."""
    unique_words = []
    for i in sorted(words):
        # Compare with the last kept item rather than a '' sentinel, so an
        # empty-string word is not silently dropped.
        if not unique_words or i != unique_words[-1]:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    s = get_unique(words_1 + words_2)
    if not s:
        return 0.0
    same_words = [i for i in get_unique(words_1) if i in words_2]
    return len(same_words) / len(s)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    ranked = []
    for p, i in enumerate(norm_tweets):
        j = jaccard(i, norm_query)
        if j > 0:
            ranked.append([-j, p])
    # Negated coefficient: ascending sort = best first, ties by ascending id.
    return [[p, -neg] for neg, p in sorted(ranked)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    line = indent
    p = len(indent)  # columns used so far on the current line
    for e in tweet_content.split(' '):
        # Only break when the line already holds a word; otherwise an
        # over-long first word would produce an empty indented line.
        if p + len(e) > print_width and line != indent:
            print(line)
            line = indent
            p = len(indent)
        line += e + ' '
        p += len(e) + 1
    print(line)
#--------------------------------------------
# 6330543921 (18.82) 369 (2021-02-28 00:08)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    s = 0
    for i in first:
        if i in second:
            s = s + 1
    # Inclusion-exclusion; valid because both sides are de-duplicated above.
    union = len(first) + len(second) - s
    if union == 0:
        return 0.0
    return s / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef != 0, best first."""
    top_n = []
    for tweet_id, r in enumerate(norm_tweets):
        jc_coef = jaccard(r, norm_query)
        if jc_coef != 0:
            top_n.append([-jc_coef, tweet_id])
    # Negated coefficient: ascending sort = best first, ties by ascending id.
    return [[tweet_id, -neg] for neg, tweet_id in sorted(top_n)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print(' \n#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    lines = []
    content = indent  # current line; every word is followed by one space
    for word in tweet_content.split(' '):
        # ``content`` already ends with a separator, so the visible width
        # after adding ``word`` is exactly len(content) + len(word).
        if content != indent and len(content) + len(word) > print_width:
            lines.append(content)
            content = indent
        content += word + ' '
    lines.append(content)
    print('\n'.join(lines))
#--------------------------------------------
# 6330544521 (18.82) 370 (2021-02-28 00:06)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    left = get_unique(words_1)
    right = get_unique(words_2)
    n = sum(1 for word in right if word in left)
    # Inclusion-exclusion; valid because both sides are de-duplicated above.
    union = len(left) + len(right) - n
    if union == 0:
        return 0.0
    return n / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef != 0, best first."""
    candidates = []
    tweet_id = 0
    while tweet_id < len(norm_tweets):
        jaccard_no = jaccard(norm_tweets[tweet_id], norm_query)
        if jaccard_no != 0:
            candidates.append([-jaccard_no, tweet_id])
        tweet_id += 1
    # Negated coefficient: ascending sort = best first, ties by ascending id.
    return [[top[1], -top[0]] for top in sorted(candidates)][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print(' \n#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    finished = []
    content = indent  # current line; every word is followed by one space
    contents = tweet_content.split(' ')
    count = 0
    while count < len(contents):
        word = contents[count]
        # Visible width after adding the word, trailing space excluded.
        if content != indent and len(content) + len(word) > print_width:
            finished.append(content)
            content = indent
        content += word + ' '
        count += 1
    finished.append(content)
    print('\n'.join(finished))
#--------------------------------------------
# 6330545121 (18.13) 371 (2021-03-01 02:47)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    union, intersect = [], []
    for i in words_1 + words_2:
        if i not in union:
            union.append(i)
            # First sighting of this word: record it if both sides have it.
            if i in words_1 and i in words_2:
                intersect.append(i)
    if not union:
        return 0.0
    return len(intersect) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    ranked = []
    for k, i in enumerate(norm_tweets):
        data = [k, jaccard(i, norm_query)]
        if data[1] > 0:
            ranked.append(data)
    ranked.sort(key=lambda Text: (-Text[1], Text[0]))
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    usable = print_width - 2  # two columns are taken by the indent
    used = 0  # columns consumed on the current line, trailing space included
    print('  ', end='')
    for word in tweet_content.split():
        if used and used + len(word) > usable:
            print('')
            print('  ', end='')
            used = 0
        print(word, end=' ')
        # Count the word that opens a new line too, so later lines cannot overflow.
        used += len(word) + 1
    print('')
#--------------------------------------------
# 6330547421 (18.72) 372 (2021-03-01 22:41)
def get_unique(words):
    """Return the distinct items of ``words``, keeping each word's last occurrence."""
    unique_words = []
    for i in range(len(words)):
        if words[i] not in words[i + 1:]:
            unique_words.append(words[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = 0
    for word in union:
        if word in words_1 and word in words_2:
            shared += 1
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef != 0, best first."""
    tnn = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)  # one call per tweet
        if coef != 0:
            tnn.append([-coef, i])
    # Negated coefficient: ascending sort = best first, ties by ascending id.
    tpn = sorted(tnn)
    return [[pair[1], -pair[0]] for pair in tpn][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    usable = print_width - 2  # two columns are taken by the indent
    ss = []
    for word in tweet_content.split(' '):
        candidate = ' '.join(ss + [word])
        if ss and len(candidate) > usable:
            print('  ' + ' '.join(ss))
            ss = [word]
        else:
            ss.append(word)
    # Whatever is pending when the words run out is the last line.
    print('  ' + ' '.join(ss))
#--------------------------------------------
# 6330548021 (20.00) 373 (2021-02-28 20:27)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def jaccard(words_1, words_2):
    """Return (# items of ``words_1`` found in ``words_2``) / |union|, or 0 if empty.

    Callers are expected to pass de-duplicated lists, as
    ``top_n_similarity`` does.
    """
    inters = 0
    for inter in words_1:
        if inter in words_2:
            inters += 1
    union = get_unique(words_1 + words_2)
    if len(union) == 0:
        return 0
    return inters / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    query = get_unique(norm_query)  # loop-invariant
    Ns = []
    for e in range(len(norm_tweets)):
        jaccard_coef = jaccard(query, get_unique(norm_tweets[e]))
        if jaccard_coef > 0:
            Ns.append([-jaccard_coef, e])
    # Negated coefficient: ascending sort = best first, ties by ascending id.
    return [[N[1], -N[0]] for N in sorted(Ns)][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    B = indent
    couns = len(indent)  # columns used on the current line
    for word in tweet_content.split(' '):
        # Skip the break for an empty line so an over-long first word does
        # not emit a blank indented line.
        if couns + len(word) > print_width and B != indent:
            print(B)
            B = indent
            couns = len(indent)
        B += word + ' '
        couns += len(word) + 1
    print(B)
#--------------------------------------------
# 6330549721 (18.50) 374 (2021-03-01 05:34)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    The input list is left untouched.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered)):
        # i == 0 is handled explicitly: ordered[i - 1] would wrap around to
        # the last element and wrongly drop the first word.
        if i == 0 or ordered[i] != ordered[i - 1]:
            unique_words.append(ordered[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    l = get_unique(list(words_1) + list(words_2))
    if not l:
        return 0.0
    m = 0
    for e in l:
        if e in words_1 and e in words_2:
            m += 1
    return m / len(l)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef != 0, best first."""
    a = []
    for tweet_id, e in enumerate(norm_tweets):
        a.append([-jaccard(e, norm_query), tweet_id])
    a.sort()
    top_n = []
    # Slice instead of range(n): n may exceed the number of tweets.
    for jac_num, tweet_id in a[:n]:
        if jac_num != 0:
            top_n.append([tweet_id, -jac_num])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print('')
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ')')
    s = ''
    for m in tweet_content.split(' '):
        if not s:
            s = '  ' + m + ' '
        elif len(s) + len(m) <= print_width:
            # ``s`` ends with a separator, so this is the visible width.
            s += m + ' '
        else:
            print(s)
            s = '  ' + m + ' '
    print(s)
#--------------------------------------------
# 6330550221 (18.50) 375 (2021-03-01 00:49)
def get_unique(words):
    """Return the distinct items of ``words``, keeping each word's last occurrence."""
    unique_words = []
    for i in range(len(words)):
        if words[i] not in words[i + 1:]:
            unique_words.append(words[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    f = get_unique(words_1 + words_2)  # union
    if not f:
        # The counters used to be assigned only inside the loop, so two
        # empty lists raised NameError.
        return 0.0
    h = [w for w in f if w in words_1 and w in words_2]  # intersection
    return len(h) / len(f)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef != 0, best first."""
    p = []
    for i in range(len(norm_tweets)):
        k = jaccard(norm_tweets[i], norm_query)
        p.append([-k, i])
    p.sort()
    top_n = []
    # Slice instead of range(n): n may exceed the number of tweets.
    for neg, i in p[:n]:
        if neg != 0:
            top_n.append([i, -neg])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    tweet_content = tweet_content.split(' ')
    print(' ')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    d = ' '  # together with the joining space this gives a 2-column indent
    for word in tweet_content:
        if d == ' ' or len(d) + len(word) < print_width:
            d += ' ' + word
        else:
            print(d)
            d = '  ' + word
    print(d)
#--------------------------------------------
# 6330551921 (17.92) 376 (2021-02-26 17:38)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty).

    The value is returned unrounded; rounding here made different scores
    tie in the ranking. Rounding is applied only for display.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    same = [e for e in first if e in second]
    union = len(first) + len(second) - len(same)
    if union == 0:
        return 0.0
    return len(same) / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    # Sorting [coef, -id] in descending order ranks by coefficient and
    # breaks ties in favour of the smaller id.
    k = sorted(
        [[jaccard(norm_tweets[i], norm_query), -i] for i in range(len(norm_tweets))],
        reverse=True,
    )[:n]
    return [[-f[1], f[0]] for f in k if f[0] > 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print(' ')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    show = [' ']  # joined with ' ' this yields the 2-column indent
    for a in tweet_content.split(' '):
        left = print_width - len(' '.join(show))
        if left - 1 >= len(a) or len(show) == 1:
            show.append(a)
        else:
            print(' '.join(show))
            show = [' ', a]
    # Flush after the loop; comparing each word with the last one printed
    # early whenever the final word also appeared earlier in the tweet.
    print(' '.join(show))
#--------------------------------------------
# 6330552521 (14.57) 377 (2021-03-01 19:00)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for k in words:
        if k not in unique_words:
            unique_words.append(k)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    merged_words = []
    merged_words.extend(words_1)
    merged_words.extend(words_2)
    gu = get_unique(merged_words)
    if not gu:
        return 0.0
    same = [x for x in gu if x in words_1 and x in words_2]
    return len(same) / len(gu)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    mix = []
    for tweet_id in range(len(norm_tweets)):
        jcc = jaccard(norm_tweets[tweet_id], norm_query)
        if jcc > 0:
            # Keep each coefficient with its own tweet id. Two nested loops
            # over separate id/coef lists paired every id with every score.
            mix.append([jcc, tweet_id])
    # Highest coefficient first; ties resolved by the smaller tweet id.
    mix.sort(key=lambda w: (-w[0], w[1]))
    return [w[::-1] for w in mix][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns.

    Lines break between words, never inside one.
    """
    print('\n')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    mns = print_width - 2  # two columns are taken by the indent
    tw = []
    for word in tweet_content.split(' '):
        if tw and len(' '.join(tw)) + 1 + len(word) > mns:
            print('  ' + ' '.join(tw))
            tw = [word]
        else:
            tw.append(word)
    print('  ' + ' '.join(tw))
#--------------------------------------------
# 6330553121 (17.00) 378 (2021-03-01 21:32)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    same = [e for e in first if e in second]
    union = len(first) + len(second) - len(same)
    if union == 0:
        return 0.0
    return len(same) / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    chaos = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:
            chaos.append([-coef, i])
    order = sorted(chaos)
    # Slice rather than index 0..n-1: there may be fewer than n matches.
    return [[i, -neg] for neg, i in order[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    twt_li = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    s_words = []
    len_sen = 0  # sum of len(word) + 1 over the words on the current line
    for word in twt_li:
        if not s_words or len(word) + 1 + len_sen <= print_width - 1:
            s_words.append(word)
            len_sen += len(word) + 1
        else:
            print('  ' + ' '.join(s_words))
            s_words = [word]
            len_sen = len(word) + 1
    print('  ' + ' '.join(s_words))
#--------------------------------------------
# 6330554821 (18.33) 379 (2021-02-26 23:48)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in range(len(words)):
        if words[i] not in unique_words:
            unique_words.append(words[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    w = []  # union, built incrementally
    c = 0   # intersection size
    for word in get_unique(words_1):
        w.append(word)
        if word in words_2:
            c += 1
    for word in words_2:
        if word not in w:
            w.append(word)
    if not w:
        return 0.0
    return c / len(w)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    t = []
    for j in range(len(norm_tweets)):
        jaccard1 = jaccard(norm_tweets[j], norm_query)
        if jaccard1 > 0:
            t.append([-jaccard1, j])
    t.sort()
    out = [[d[1], -d[0]] for d in t]
    return out[:int(n)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    t = indent
    for word in tweet_content.split(' '):
        # A word that does not fit even on an empty line is placed anyway;
        # retrying it on a fresh line forever was an infinite loop.
        if t != indent and len(t) + len(word) > int(print_width):
            print(t)
            t = indent
        t += word + ' '
    if t != indent:
        print(t)
#--------------------------------------------
# 6330555421 (9.33) 380 (2021-03-01 23:59)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    The input list is left untouched.
    """
    ordered = sorted(words)
    if len(ordered) == 0:
        return []
    unique_words = [ordered[0]]
    for i in range(len(ordered) - 1):
        if ordered[i] != ordered[i + 1]:
            # Wrap in a list: ``+= some_string`` would append its characters.
            unique_words += [ordered[i + 1]]
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0 if both are empty)."""
    result = get_unique(words_1 + words_2)  # union
    results = [w for w in result if w in words_1 and w in words_2]  # intersection
    try:
        # Intersection over union; the ratio used to be inverted and was
        # computed outside the try, so the handler could never fire.
        return len(results) / len(result)
    except ZeroDivisionError:
        return 0


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    a = []
    for tweet_id in range(len(norm_tweets)):
        j = jaccard(norm_tweets[tweet_id], norm_query)
        if j > 0:
            a.append([-j, tweet_id])
    a.sort()
    return [[i[1], -i[0]] for i in a[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    y = tweet_content.split(' ')
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    a = []
    for word in y:
        if a and len(' '.join(a)) + 1 + len(word) > print_width - 2:
            print('  ' + ' '.join(a))
            a = [word]
        else:
            a.append(word)
    print('  ' + ' '.join(a))
#--------------------------------------------
# 6330556021 (20.00) 381 (2021-02-27 06:22)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words += [i]
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    both = get_unique(words_1 + words_2)  # union
    if len(both) == 0:
        # Two empty lists share nothing. The former huge sentinel value
        # made such a tweet rank above every real match.
        return 0.0
    # Scan the shorter list and probe the longer one.
    if len(words_1) < len(words_2):
        shorter, longer = words_1, words_2
    else:
        shorter, longer = words_2, words_1
    same = []
    for c in shorter:
        if c in longer and c not in same:
            same += [c]
    return len(same) / len(both)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    top_n = []
    for i in range(len(norm_tweets)):
        jac = jaccard(norm_tweets[i], norm_query)
        if jac > 0:
            top_n += [[i, jac]]
    # One keyed sort replaces the descending sort plus per-group re-sort.
    top_n.sort(key=lambda pair: (-pair[1], pair[0]))
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    indent = "  "
    show = indent
    for word in tweet_content.split(" "):
        if show != indent and len(show) + len(word) > print_width:
            print(show)
            show = indent
        show += word + " "
    # Always emit the pending line; it used to be skipped when its length
    # (trailing space included) exceeded print_width.
    print(show)
#--------------------------------------------
# 6330557721 (11.48) 382 (2021-03-01 21:31)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for i in range(len(words)):
        if words[i] not in unique_words:
            unique_words.append(words[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    down1 = get_unique(list(words_1) + list(words_2))  # union
    if not down1:
        return 0.0
    # One membership test per distinct word, so duplicates are not counted
    # once per matching pair.
    up = [w for w in down1 if w in words_1 and w in words_2]
    return len(up) / len(down1)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    ja = []
    for i in range(len(norm_tweets)):
        j1 = jaccard(norm_tweets[i], norm_query)
        if j1 > 0:
            ja.append([i, j1])
    # Stable sort: equal coefficients keep their ascending tweet-id order.
    ja.sort(key=lambda x: x[1], reverse=True)
    return ja[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    t = tweet_content.split(' ')
    print()
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    c = [" "]  # joined with " " this yields the 2-column indent
    for word in t:
        # Measure the rendered line, not the number of list items.
        if len(c) == 1 or len(" ".join(c)) + 1 + len(word) <= print_width:
            c.append(word)
        else:
            print(" ".join(c))
            c = [" ", word]
    print(" ".join(c))
#--------------------------------------------
# 6330558321 (18.33) 383 (2021-02-28 21:13)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_list = []
    for w in words:
        if not contains(unique_list, w):
            unique_list.append(w)
    return unique_list


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    words_1 = get_unique(words_1)
    words_2 = get_unique(words_2)
    intersect = [w for w in words_2 if contains(words_1, w)]
    union_count = len(words_1) + len(words_2) - len(intersect)
    if union_count == 0:
        return 0.0
    return len(intersect) / union_count


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    top_n = []
    for i, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # evaluated once per tweet
        if coef > 0:
            top_n.append([i, coef])
    return sorted(top_n, key=lambda x: (-x[1], x[0]))[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print(f'\n#{tweet_id} ({round(jc_coef, 2)})')
    width = max(print_width - 2, 1)  # two columns are taken by the indent
    while len(tweet_content):
        # "<=": text that exactly fills the line must not be indexed past its end.
        if len(tweet_content) <= width:
            print(f'  {tweet_content}')
            break
        if tweet_content[width] == ' ':
            w_index = width
        else:
            w_index = tweet_content[:width].rfind(' ') + 1
            if w_index == 0:
                # No space inside the window: the word is longer than the
                # line. Emit it whole so the loop always makes progress.
                w_index = tweet_content.find(' ', width)
                if w_index == -1:
                    w_index = len(tweet_content)
        print(f'  {tweet_content[:w_index]}')
        tweet_content = tweet_content[w_index:].lstrip()
#--------------------------------------------
def contains(l, w):
    """Return True when some element of ``l`` equals ``w``."""
    for words in l:
        if words == w:
            return True
    return False
# 6330559021 (15.72) 384 (2021-02-28 00:23)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    Works on a sorted copy; the caller's list is neither reordered nor
    extended with a sentinel.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered)):
        # Keep the last element of every run of equal words.
        if i == len(ordered) - 1 or ordered[i] != ordered[i + 1]:
            unique_words.append(ordered[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    words_3 = sorted(first + second)
    # After de-duplicating each side, an adjacent equal pair in the merged
    # sorted list means the word occurs on both sides.
    j = [words_3[i] for i in range(len(words_3) - 1) if words_3[i] == words_3[i + 1]]
    union = len(first) + len(second) - len(j)
    if union == 0:
        return 0.0
    return len(j) / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    x = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:  # tweets sharing no word with the query are not results
            x.append([coef, -i])
    # Descending on [coef, -id]: best coefficient first, smaller id on ties.
    y = sorted(x, reverse=True)
    return [[-number[1], number[0]] for number in y][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print(" ")
    t = tweet_content.split(" ")
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    x = []
    c = 0  # length of " ".join(x)
    for i in t:
        needed = len(i) if not x else len(i) + 1
        if x and c + needed > print_width - 2:
            print("  " + " ".join(x))
            x = [i]
            c = len(i)
        else:
            x.append(i)
            c += needed
    print("  " + " ".join(x))
#--------------------------------------------
# 6330560521 (20.00) 385 (2021-03-01 01:04)
def get_unique(words):
    """Return the distinct items of ``words`` in ascending order.

    Sorts a copy so the caller's list keeps its order.
    """
    ordered = sorted(words)
    unique_words = []
    for i in range(len(ordered)):
        if i == 0 or ordered[i - 1] != ordered[i]:
            unique_words.append(ordered[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0 if both are empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    # Scan the shorter list; the union is the longer list plus whatever
    # the shorter one contributes on its own.
    if len(first) < len(second):
        shorter, longer = first, second
    else:
        shorter, longer = second, first
    only_shorter = [e for e in shorter if e not in longer]
    same = len(shorter) - len(only_shorter)
    total = len(only_shorter) + len(longer)
    if total == 0:
        return 0
    return same / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    top_n = []
    for tweet_id in range(len(norm_tweets)):
        jaccard_ = jaccard(norm_tweets[tweet_id], norm_query)
        if jaccard_ > 0:
            top_n.append([tweet_id, jaccard_])
    # Stable sort: equal coefficients keep their ascending tweet-id order.
    top_n.sort(key=lambda x: x[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    usable = print_width - 2  # two columns are taken by the indent
    line = []
    for e in tweet_content.split(' '):
        each_line = ' '.join(line)
        if line and len(each_line) + 1 + len(e) > usable:
            print('  ' + each_line)
            line = [e]
        else:
            line.append(e)
    print('  ' + ' '.join(line))
#--------------------------------------------
# 6330561121 (17.03) 386 (2021-02-25 22:21)
def get_unique(words):
    """Return the distinct items of ``words``, keeping each word's last occurrence.

    The input list is only read; popping from it emptied the caller's data.
    """
    unique_words = []
    for i in range(len(words)):
        if words[i] not in words[i + 1:]:
            unique_words.append(words[i])
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    down = get_unique(words_1 + words_2)  # union
    if not down:
        return 0.0
    up = [w for w in down if w in words_1 and w in words_2]  # intersection
    return len(up) / len(down)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    top_a = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:  # tweets sharing no word with the query are not results
            top_a.append([coef, -i])
    # Descending on [coef, -id]: best coefficient first, smaller id on ties.
    top_b = sorted(top_a, reverse=True)
    return [[-d[1], d[0]] for d in top_b][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    nt = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    word = []
    wdc = 0  # length of " ".join(word)
    for item in nt:
        needed = len(item) if not word else len(item) + 1
        if word and wdc + needed > print_width - 2:
            print('  ' + " ".join(word))
            word = [item]
            wdc = len(item)
        else:
            word.append(item)
            wdc += needed
    if len(word) != 0:
        print('  ' + " ".join(word))
#--------------------------------------------
# 6330562821 (16.30) 387 (2021-03-01 00:51)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for e in words:
        if e not in unique_words:
            unique_words.append(e)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    words_rep = [e for e in first if e in second]
    union = len(first) + len(second) - len(words_rep)
    if union == 0:
        return 0.0
    return len(words_rep) / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef != 0, best first."""
    top = []
    for tweet_id in range(len(norm_tweets)):
        Jacc = jaccard(norm_tweets[tweet_id], norm_query)
        top.append([-Jacc, tweet_id])
    top.sort()
    top_n = []
    # Slice instead of range(n): n may exceed the number of tweets.
    for neg, tweet_id in top[:n]:
        if neg != 0:
            top_n.append([tweet_id, -neg])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    tweet_content = tweet_content.split(' ')
    print()
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    show = ' '  # together with the joining space this gives a 2-column indent
    for e in tweet_content:
        # The line grows by one space plus the word; allow it to reach
        # print_width exactly instead of stopping one column short.
        if show == ' ' or len(show) + 1 + len(e) <= print_width:
            show += ' ' + e
        else:
            print(show)
            show = '  ' + e
    print(show)
#--------------------------------------------
# 6330563421 (20.00) 388 (2021-02-28 01:17)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for k in words:
        if k not in unique_words:
            unique_words.append(k)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    first = get_unique(words_1)
    second = get_unique(words_2)
    words_intersect = [k for k in first if k in second]
    union_size = len(first + second) - len(words_intersect)
    if union_size == 0:
        return 0.0
    return len(words_intersect) / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    list_n = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)  # evaluated once per tweet
        if coef > 0:
            list_n.append([-coef, i])
    # Negated coefficient: ascending sort = best first, ties by ascending id.
    return [[k[1], -k[0]] for k in sorted(list_n)][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    usable = print_width - 2  # two columns are taken by the indent
    print_output = '  '
    used = 0  # letters and separators already on the current line
    for word in tweet_content.split(' '):
        if used and used + len(word) > usable:
            print(print_output)
            print_output = '  '
            used = 0
        print_output += word
        used += len(word)
        # Add a separator only while there is still room for one.
        if used <= usable:
            print_output += ' '
            used += 1
    print(print_output)
#--------------------------------------------
# 6330565721 (18.01) 389 (2021-03-01 00:49)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    return list(dict.fromkeys(words))


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of two word lists (0.0 if both are empty)."""
    # De-duplicate each side so the surplus-count trick below is valid.
    word1plus2 = get_unique(words_1) + get_unique(words_2)
    unique_words = get_unique(word1plus2)
    if not unique_words:
        return 0.0
    # Shared words appear twice in the concatenation; the surplus over the
    # union size is the intersection size.
    return (len(word1plus2) - len(unique_words)) / len(unique_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coef]`` pairs with coef > 0, best first."""
    a = []
    for i in range(len(norm_tweets)):
        coef = jaccard(norm_tweets[i], norm_query)
        if coef > 0:  # tweets sharing no word with the query are not results
            a.append([-coef, i])
    a = sorted(a)
    return [[pair[1], -pair[0]] for pair in a][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the tweet header, then its text wrapped to ``print_width`` columns."""
    print()
    x = "#" + str(tweet_id)
    y = "(" + str(round(jc_coef, 2)) + ")"
    print(x, y)
    words = tweet_content.split(' ')
    ans = '  ' + words[0]
    for z in words[1:]:
        if len(ans) + len(' ' + z) <= print_width:
            ans += ' ' + z
        else:
            print(ans)
            ans = '  ' + z
    print(ans)
#--------------------------------------------
# 6330566321 (20.00) 390 (2021-02-28 17:15)
def get_unique(words):
    """Return the distinct items of ``words`` as a new, sorted list.

    The argument is left untouched (the earlier version removed items from
    the caller's list while iterating over it).
    """
    return sorted(set(words))


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word collections.

    Duplicates are ignored.  Two empty inputs give 0.0 instead of raising
    ZeroDivisionError.  Neither argument is modified.
    """
    left = set(words_1)
    right = set(words_2)
    union = left | right
    if not union:
        return 0.0
    return len(left & right) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are dropped; ties are ordered by ascending
    tweet id.
    """
    # Negating the id lets one descending sort order ties by ascending id.
    ranked = sorted(
        ([jaccard(tweet, norm_query), -tweet_id]
         for tweet_id, tweet in enumerate(norm_tweets)),
        reverse=True,
    )
    return [[-neg_id, coef] for coef, neg_id in ranked[:n] if coef != 0]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text."""
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split(' '):
        if line is None:
            line = ' ' + word
        elif len(line) + len(word) + 1 <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    print(line)
#--------------------------------------------
# 6330567021 (14.68) 391 (2021-02-28 18:03)
def get_unique(words):
    """Return the distinct items of ``words`` as a new list, sorted descending.

    The argument is not modified.  The earlier version overwrote duplicates
    in the caller's list with a ``'0'`` marker, which also discarded any
    genuine word sorting below ``'0'``.
    """
    return sorted(set(words), reverse=True)


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Matching pairs are counted and the union size is
    ``len(words_1) + len(words_2) - matches``.  An empty union gives 0.0
    (previously a misspelled variable raised NameError here).  The value is
    returned unrounded so that ranking is not distorted by false ties.
    """
    matches = 0
    for left in words_1:
        for right in words_2:
            if left == right:
                matches += 1
    union_size = len(words_1) + len(words_2) - matches
    if union_size == 0:
        return 0.0
    return matches / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = float(jaccard(tweet, norm_query))  # computed once per tweet
        if coef > 0:
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The coefficient is shown rounded to two decimals.  Text is split on
    whitespace and wrapped *before* a line would exceed ``print_width``.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split():
        if line is None:
            line = ' ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    if line is not None:
        print(line)
#--------------------------------------------
# 6330568621 (11.50) 392 (2021-02-27 22:27)
def get_unique(words):
    """Return the distinct items of ``words`` as a new, sorted list.

    The caller's list is neither sorted in place nor extended with a
    sentinel item.
    """
    ordered = sorted(words)
    unique_words = []
    for position, word in enumerate(ordered):
        # In sorted order duplicates are adjacent; keep the first of a run.
        if position == 0 or word != ordered[position - 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists, unrounded.

    The numerator counts items of the shorter list that occur in the
    longer one; the denominator is the number of distinct words overall.
    Inputs are not modified, and two empty lists give 0.0.
    """
    if len(words_1) > len(words_2):
        longer, shorter = words_1, words_2
    else:
        longer, shorter = words_2, words_1
    shared = sum(1 for word in shorter if word in longer)
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept, ties are ordered by ascending
    tweet id, and asking for more results than exist is not an error.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([tweet_id, coef])
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return scored[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, a ``#id (coef)`` header and the wrapped text.

    Every text line carries the same indent and no word is dropped, even
    when it fits a line exactly or is longer than ``print_width``.
    """
    print(' ')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = None
    for word in tweet_content.split():
        if line is None:
            line = ' ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    if line is not None:
        print(line)
#--------------------------------------------
# 6330570821 (20.00) 393 (2021-02-28 20:22)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words
#--------------------------------------------------------#


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts items of ``words_2`` found in ``words_1``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give 0.0 instead of raising ZeroDivisionError.
    """
    total_words = len(get_unique(words_1 + words_2))
    if total_words == 0:
        return 0.0
    shared = sum(1 for item in words_2 if item in words_1)
    return shared / total_words
#--------------------------------------------------------#


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending
    tweet id using a single keyed sort.
    """
    matches = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            matches.append([tweet_id, coef])
    matches.sort(key=lambda pair: (-pair[1], pair[0]))
    return matches[:n]
#--------------------------------------------------------#


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print the header and the tweet text wrapped to ``print_width``.

    The final line is printed once, after the loop, so a tweet in which
    the last word also occurs earlier is no longer printed prematurely.
    """
    print('\n')
    print("#" + str(tweet_id) + " " + "(" + str(round(jc_coef, 2)) + ")")
    line = None
    for word in tweet_content.split(' '):
        if line is None:
            line = ' ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    print(line)
#--------------------------------------------
# 6330571421 (19.10) 394 (2021-02-28 11:59) def get_unique( words ): unique_words = [] for e in words: if e not in unique_words: unique_words.append(e) return unique_words def jaccard(words_1, words_2): sameword = 0 allword = [] for e in words_1: if e in words_2: sameword += 1 for e in words_1: if e not in allword: allword.append(e) for e in words_2: if e not in allword: allword.append(e) jaccard_coef = sameword/len(allword) return jaccard_coef def top_n_similarity(norm_tweets, norm_query, n): all_n = [] for i in range(len(norm_tweets)): x = jaccard(norm_tweets[i],norm_query) if x>0: all_n.append([i,x]) top_n = [[-all_n[1],all_n[0]] for all_n in sorted(([all_n[1],-all_n[0]] for all_n in all_n),reverse = True)][:n] return top_n def show_tweet(tweet_id,tweet_content,jc_coef,print_width): c = '' b = [] a = '#'+str(tweet_id)+' '+'('+str(round(jc_coef,2))+')' tweet_content = tweet_content.split(' ') for e in tweet_content: if len(c) == 0: if len(e) >= print_width-2: b.append(e) else: c = c + e + ' ' else: if len(c)+len(e) <= print_width-2: c = c + e + ' ' else: b.append(c) if len(e) >= print_width-2: b.append(e) else: c = e + ' ' if c != ' ': b.append(c) print('') print(a) for e in b: print(' '+e) #--------------------------------------------
# 6330572021 (17.00) 395 (2021-03-01 09:59)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union is built as (only in ``words_1``) + (only in ``words_2``) +
    (shared).  Two empty lists give 0.0 instead of raising
    ZeroDivisionError.
    """
    shared = [word for word in words_1 if word in words_2]
    only_first = [word for word in words_1 if word not in words_2]
    only_second = [word for word in words_2 if word not in words_1]
    union_size = len(only_first) + len(only_second) + len(shared)
    if union_size == 0:
        return 0.0
    return len(shared) / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are left out, ties are ordered by ascending
    id, and asking for more results than exist is not an error.
    """
    keyed = sorted(
        [-jaccard(tweet, norm_query), tweet_id]
        for tweet_id, tweet in enumerate(norm_tweets)
    )
    ranked = [[tweet_id, -neg] for neg, tweet_id in keyed if neg < 0]
    return ranked[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Lines carry no trailing blank (which could exceed ``print_width``) and
    an over-long first word does not produce a whitespace-only line.
    """
    words = tweet_content.split(' ')
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    line = ' ' + words[0]
    for word in words[1:]:
        if len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    print(line)
#--------------------------------------------
# 6330573721 (17.01) 396 (2021-02-26 18:28)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = list()
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    shared = [item for item in words_1 if any(item == other for other in words_2)]
    union_size = len(words_1) + len(words_2) - len(shared)
    if union_size == 0:
        return 0.0
    return len(shared) / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are left out.  The sort is stable, so tweets
    with equal coefficients stay in ascending id order.
    """
    top_n = []
    for index, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            top_n.append([index, coef])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Each line is measured on its own, so line lengths do not drift past
    ``print_width`` as the tweet gets longer.
    """
    print()
    print("#{} ({})".format(tweet_id, round(jc_coef, 2)))
    lines = []
    for word in tweet_content.split():
        if lines and len(lines[-1]) + 1 + len(word) <= print_width:
            lines[-1] += ' ' + word
        else:
            lines.append(' ' + word)
    for line in lines:
        print(line)
#--------------------------------------------
# 6330574321 (19.78) 397 (2021-03-01 19:04)
def get_unique(words):
    """Return the distinct items of ``words`` as a new, sorted list.

    The caller's list is not sorted in place.
    """
    unique_words = []
    for word in sorted(words):
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts items of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give 0.0 instead of raising ZeroDivisionError.
    """
    distinct = get_unique(words_1 + words_2)
    if not distinct:
        return 0.0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / len(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    All candidates are collected and sorted once, which also makes
    ``n == 0`` return an empty list instead of raising IndexError.
    """
    candidates = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            candidates.append([tweet_id, coef])
    candidates.sort(key=lambda pair: (-pair[1], pair[0]))
    return candidates[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a separator line, a ``#id (coef)`` header and the wrapped text."""
    print(" ")
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    words = tweet_content.split(" ")
    line = " " + words[0]
    for word in words[1:]:
        if len(line) + len(" " + word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = " " + word
    print(line)
# --------------------------------------------
# 6330576621 (20.00) 398 (2021-02-28 18:17)
def get_unique(words):
    """Return the distinct items of ``words`` as a new, sorted list.

    The caller's list is not sorted in place.
    """
    ordered = sorted(words)
    unique_words = []
    for position, word in enumerate(ordered):
        if position == 0 or word != ordered[position - 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator is the number of distinct shared words; the denominator
    is ``words_1`` plus the words of ``words_2`` not already present.  Two
    empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    union = list(words_1)
    shared = []
    for word in words_1:
        if word in words_2 and word not in shared:
            shared.append(word)
    for word in words_2:
        if word not in union:
            union.append(word)
    if not union:
        return 0.0
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    Candidates are sorted once, and ``n == 0`` returns an empty list
    instead of raising IndexError.
    """
    scored = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            scored.append([coef, -tweet_id])
    scored.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in scored][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The whole output is assembled first and emitted with a single print.
    """
    lines = ['#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')']
    words = tweet_content.split(' ')
    current = ' ' + words[0]
    for word in words[1:]:
        if len(current) + len(word) + 1 <= print_width:
            current += ' ' + word
        else:
            lines.append(current)
            current = ' ' + word
    lines.append(current)
    print('\n' + '\n'.join(lines))
#--------------------------------------------
# 6330577221 (18.01) 399 (2021-02-28 07:09)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts items of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give 0.0 instead of raising ZeroDivisionError.
    """
    both_words = [item for item in words_1 if item in words_2]
    unique_words = get_unique(get_unique(words_1) + get_unique(words_2))
    if not unique_words:
        return 0.0
    return len(both_words) / len(unique_words)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are left out; ties are ordered by ascending
    tweet id.
    """
    jc_list = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(norm_query, tweet)
        if coef > 0:
            jc_list.append([coef, -tweet_id])
    jc_list.sort(reverse=True)
    return [[-neg_id, coef] for coef, neg_id in jc_list][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``#id (coef)`` header followed by the wrapped tweet text."""
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    line = ''
    for word in tweet_content.split(' '):
        # ``line`` is empty only before the first word of a line, because
        # every started line begins with the indent.
        if line and len(line) + len(word) + 1 > print_width:
            print(line)
            line = ''
        line += ' ' + word
    if line != '':
        print(line)
#--------------------------------------------
# 6330578921 (15.18) 400 (2021-02-26 16:06)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists as a float.

    The numerator counts items of ``words_2`` found in ``words_1``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give 0.0 instead of raising ZeroDivisionError.
    """
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    shared = sum(1 for item in words_2 if item in words_1)
    return float(shared / distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are left out; ties are ordered by ascending
    tweet id.
    """
    ranked = sorted(
        ([jaccard(tweet, norm_query), -tweet_id]
         for tweet_id, tweet in enumerate(norm_tweets)),
        reverse=True,
    )
    return [[-neg_id, coef] for coef, neg_id in ranked if coef > 0][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Lines are kept within ``print_width`` and the last line is printed
    exactly once, even when the final word also occurs earlier.
    """
    print()
    print("#" + str(tweet_id), "(" + str(round(jc_coef, 2)) + ")")
    pending = None
    for word in tweet_content.split():
        if pending is None:
            pending = " " + word
        elif len(pending) + 1 + len(word) <= print_width:
            pending += " " + word
        else:
            print(pending)
            pending = " " + word
    if pending is not None:
        print(pending)
    return
#--------------------------------------------
# 6330579521 (18.01) 401 (2021-03-01 00:53)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for position, word in enumerate(words):
        if word not in words[:position]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    intersect = sum(1 for word in words_2 if word in words_1)
    union = len(words_1) + len(words_2) - intersect
    if union == 0:
        return 0.0
    return intersect / union


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are left out.  The sort is stable, so equal
    coefficients stay in ascending id order.
    """
    scored = [[tweet_id, jaccard(tweet, norm_query)]
              for tweet_id, tweet in enumerate(norm_tweets)]
    matches = [pair for pair in scored if pair[1] > 0]
    return sorted(matches, key=lambda pair: pair[1], reverse=True)[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Every line, including the first, is measured by what is actually
    printed (indent plus words) against ``print_width``.
    """
    # NOTE(review): the earlier first-line count reserved two columns for a
    # one-character indent; whitespace may have been collapsed - confirm.
    print('')
    print('#' + str(tweet_id) + ' ' + '(' + str(round(jc_coef, 2)) + ')')
    count = 0
    for position, word in enumerate(tweet_content.split(' ')):
        piece = ' ' + word
        if position > 0 and len(piece) > print_width - count:
            print('')
            count = 0
        print(piece, end='')
        count += len(piece)
    print('')
#--------------------------------------------
# 6330580021 (15.45) 402 (2021-02-27 15:43)
def get_unique(words):
    """Return the distinct items of ``words`` as a new, sorted list.

    The caller's list is not sorted in place; an empty input gives ``[]``.
    """
    unique_words = []
    for word in sorted(words):
        # Duplicates are adjacent after sorting; compare with the last kept.
        if not unique_words or word != unique_words[-1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts items of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give 0.0 instead of raising IndexError.
    """
    distinct = len(get_unique(words_1 + words_2))
    if distinct == 0:
        return 0.0
    shared = sum(1 for word in words_1 if word in words_2)
    return shared / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending
    tweet id (previously the larger id came first).
    """
    positive = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            positive.append([tweet_id, coef])
    positive.sort(key=lambda pair: (-pair[1], pair[0]))
    return positive[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The space-joined words of a line never exceed ``print_width - 2`` unless
    a single word is longer; no whitespace-only line is produced.
    """
    # NOTE(review): two columns are reserved for the indent but the literal
    # is a single space; whitespace may have been collapsed - confirm.
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    group = []
    letters = 0
    for word in tweet_content.split(" "):
        # len(group) is the number of separators needed once word is added.
        if group and letters + len(word) + len(group) > print_width - 2:
            print(' ' + ' '.join(group))
            group, letters = [], 0
        group.append(word)
        letters += len(word)
    print(' ' + ' '.join(group))
#--------------------------------------------
# 6330583021 (18.50) 403 (2021-02-27 21:57)
def get_unique(words):
    """Return the distinct items of ``words`` as a new, sorted list.

    The caller's list is neither sorted in place nor extended with a
    sentinel item.
    """
    ordered = sorted(words)
    return [word for position, word in enumerate(ordered)
            if position == 0 or word != ordered[position - 1]]


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    After sorting the concatenation, the number of distinct words is the
    union size and the remaining (repeated) items are the shared words.
    Two empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    merged = sorted(words_1 + words_2)
    distinct = sum(1 for position, word in enumerate(merged)
                   if position == 0 or word != merged[position - 1])
    if distinct == 0:
        return 0.0
    repeated = len(merged) - distinct
    return float(repeated) / float(distinct)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``int(n)`` ``[tweet_id, coefficient]`` pairs, best first.

    Only positive coefficients are kept; ties are ordered by ascending
    tweet id.  Asking for more results than exist is not an error.
    """
    scored = [[tweet_id, jaccard(tweet, norm_query)]
              for tweet_id, tweet in enumerate(norm_tweets)]
    positive = [pair for pair in scored if pair[1] > 0]
    positive.sort(key=lambda pair: (-pair[1], pair[0]))
    return positive[:int(n)]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The space-joined words of a line never exceed ``print_width - 2`` unless
    a single word is longer; no whitespace-only line is produced.
    """
    # NOTE(review): two columns are reserved for the indent but the literal
    # is a single space; whitespace may have been collapsed - confirm.
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    text = None
    for word in tweet_content.split(' '):
        if text is None:
            text = word
        elif len(text) + 1 + len(word) <= print_width - 2:
            text += ' ' + word
        else:
            print(' ' + text)
            text = word
    print(' ' + text)
#--------------------------------------------
# 6330585221 (20.00) 404 (2021-03-01 00:42)
def get_unique(words):
    """Return the distinct items of ``words`` as a new, sorted list."""
    ordered = sorted(words)
    unique_words = []
    for position, word in enumerate(ordered):
        # Keep the last item of every run of equal neighbours.
        if position == len(ordered) - 1 or word != ordered[position + 1]:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    The union size is ``len(words_1)`` plus the words of ``words_2`` that
    are missing from ``words_1``.  Two empty lists give 0.0 instead of
    raising ZeroDivisionError.
    """
    shared = 0
    extra = 0
    for word in words_2:
        if word in words_1:
            shared += 1
        else:
            extra += 1
    union_size = len(words_1) + extra
    if union_size == 0:
        return 0.0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    return [[tweet_id, -neg] for neg, tweet_id in sorted(keyed)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The space-joined words of a line never exceed ``print_width - 2`` unless
    a single word is longer; no whitespace-only line is produced.
    """
    # NOTE(review): two columns are reserved for the indent but the literal
    # is a single space; whitespace may have been collapsed - confirm.
    print('\n' + '#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    chunk = []
    width = 0
    for word in tweet_content.split(' '):
        if chunk and 2 + width + 1 + len(word) > print_width:
            print(' ' + ' '.join(chunk))
            chunk, width = [], 0
        width = width + 1 + len(word) if chunk else len(word)
        chunk.append(word)
    print(' ' + ' '.join(chunk))
#--------------------------------------------
# 6330586921 (18.01) 405 (2021-02-26 22:49)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    seen = []
    for word in words:
        if word not in seen:
            seen.append(word)
    return seen


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Both inputs are de-duplicated first.  Two empty lists give 0.0 instead
    of raising ZeroDivisionError.
    """
    first = get_unique(words_1)
    second = get_unique(words_2)
    shared = sum(1 for word in second if word in first)
    union_size = len(first) + len(second) - shared
    if union_size == 0:
        return 0.0
    return shared / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are left out.  The sort is stable, so equal
    coefficients stay in ascending id order.
    """
    top_n = []
    for tweet_id, tweet in enumerate(norm_tweets):
        jaccard_coef = jaccard(tweet, norm_query)
        if jaccard_coef > 0:
            top_n.append([tweet_id, jaccard_coef])
    top_n.sort(key=lambda pair: pair[1], reverse=True)
    return top_n[:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Lines carry no trailing blank and an over-long first word does not
    produce a whitespace-only line.
    """
    print()
    print("#" + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    lines = []
    for word in tweet_content.split(' '):
        if lines and len(lines[-1]) + 1 + len(word) <= print_width:
            lines[-1] += ' ' + word
        else:
            lines.append(" " + word)
    for line in lines:
        print(line)
#--------------------------------------------
# 6330587521 (20.00) 406 (2021-02-28 22:43)
def get_unique(words):
    """Return the distinct items of ``words``.

    Each item is kept at the position of its *last* occurrence.
    """
    return [word for position, word in enumerate(words)
            if word not in words[position + 1:]]


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    shared = sum(1 for word in words_1 if word in words_2)
    total = len(words_1) + len(words_2) - shared
    if total == 0:
        return 0.0
    return shared / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only non-zero coefficients are kept; ties are ordered by ascending id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -neg] for neg, tweet_id in keyed][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Text lines start with a two-space indent and stay within
    ``print_width`` unless a single word is longer.  The running length is
    tracked instead of re-joining the line for every word, and an
    over-long word no longer causes a whitespace-only line.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    indent = '  '  # same text the joined leading ' ' element used to yield
    parts = []
    length = len(indent)
    for word in tweet_content.split(' '):
        if parts and length + 1 + len(word) > print_width:
            print(indent + ' '.join(parts))
            parts, length = [], len(indent)
        length = length + 1 + len(word) if parts else length + len(word)
        parts.append(word)
    print(indent + ' '.join(parts))
#--------------------------------------------
# 6330588121 (18.44) 407 (2021-03-01 17:32)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    common = sum(1 for item in words_1 if item in words_2)
    union_size = len(words_1) + len(words_2) - common
    if union_size == 0:
        return 0.0
    return common / union_size


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Zero coefficients are skipped, ties are ordered by ascending id, and
    ``n`` may exceed the number of tweets without raising IndexError.
    """
    keyed = sorted(
        [-1 * jaccard(tweet, norm_query), tweet_id]
        for tweet_id, tweet in enumerate(norm_tweets)
    )
    top_n = []
    for neg, tweet_id in keyed[:n]:
        if neg * -1 != 0:
            top_n.append([tweet_id, neg * -1])
    return top_n


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Every text line uses the same two-space indent (previously only the
    first line did) and stays within ``print_width`` where possible.
    """
    print('')
    print('#' + str(tweet_id), '(' + str(round(jc_coef, 2)) + ')')
    indent = '  '
    line = indent
    for word in tweet_content.split():
        if line == indent:
            line += word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = indent + word
    print(line)
#--------------------------------------------
# 6330589821 (19.95) 408 (2021-02-28 23:06)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words
#--------------------------------------------------------


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts items of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give 0.0 instead of raising ZeroDivisionError.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    intersect = [word for word in words_1 if word in words_2]
    return len(intersect) / len(union)
#--------------------------------------------------------


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only non-zero coefficients are kept; ties are ordered by ascending id.
    """
    found = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef != 0:
            found.append([tweet_id, coef])
    found.sort(key=lambda pair: (-pair[1], pair[0]))
    return found[:n]
#--------------------------------------------------------


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Text is split on whitespace.  Content without any word prints only the
    header instead of raising IndexError.
    """
    print()
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    words = tweet_content.split()
    if not words:
        return
    line = ' ' + words[0]
    for word in words[1:]:
        if len(word) + 1 <= print_width - len(line):
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    print(line)
#--------------------------------------------
# 6330591021 (20.00) 409 (2021-02-27 22:27)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    up = sum(1 for item in words_1 if item in words_2)
    down = len(words_1) + len(words_2) - up
    if down == 0:
        return 0.0
    return float(up / down)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])
    return [[tweet_id, -neg] for neg, tweet_id in sorted(keyed)[:n]]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The space-joined words of a line never exceed ``print_width - 2`` unless
    a single word is longer.  Exactly one newline ends the output, also
    when the last word fills its line completely.
    """
    # NOTE(review): two columns are reserved for the indent but the literal
    # is a single space; whitespace may have been collapsed - confirm.
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    room = print_width - 2
    row = []
    taken = 0
    for word in tweet_content.split(" "):
        if row and taken + 1 + len(word) > room:
            print(" " + " ".join(row))
            row, taken = [], 0
        taken = taken + 1 + len(word) if row else len(word)
        row.append(word)
    print(" " + " ".join(row))
#--------------------------------------------
# 6330592621 (18.01) 410 (2021-02-28 14:21)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two duplicate-free word lists.

    Two empty lists give 0.0 instead of raising ZeroDivisionError.
    """
    common = sum(1 for word in words_1 if word in words_2)
    total = len(words_2) + len(words_1) - common
    if total == 0:
        return 0.0
    return common / total


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Tweets with coefficient 0 are left out; ties are ordered by ascending
    tweet id.
    """
    # Sort on [-coefficient, id]: highest coefficient first, lowest id first.
    keyed = sorted(
        [-jaccard(tweet, norm_query), tweet_id]
        for tweet_id, tweet in enumerate(norm_tweets)
    )
    return [[tweet_id, -neg] for neg, tweet_id in keyed if neg < 0][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The space-joined words of a line never exceed ``print_width - 2`` unless
    a single word is longer.  Every line is indented, including the one
    that follows an over-long word.
    """
    # NOTE(review): two columns are reserved for the indent but the literal
    # is a single space; whitespace may have been collapsed - confirm.
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    text = None
    for word in tweet_content.split(" "):
        if text is None:
            text = word
        elif 2 + len(text) + 1 + len(word) <= print_width:
            text += " " + word
        else:
            print(" " + text)
            text = word
    print(" " + text)
#--------------------------------------------
# 6330593221 (20.00) 411 (2021-02-28 21:05)
def get_unique(words):
    """Return the distinct items of ``words``, ordered by length then value."""
    return sorted(set(words), key=lambda word: (len(word), word))


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts matching pairs between the lists; the denominator
    is the number of distinct words overall.  Two empty lists give 0.
    """
    distinct = len(get_unique(list(words_1) + list(words_2)))
    if distinct == 0:
        return 0
    pairs = sum(1 for left in words_1 for right in words_2 if left == right)
    return pairs / distinct


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # computed once per tweet
        if coef > 0:
            keyed.append([-coef, tweet_id])
    keyed.sort()
    return [[tweet_id, -neg] for neg, tweet_id in keyed][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Each line is stripped of surrounding blanks before the indent is added.
    An over-long first word does not produce a whitespace-only line.
    """
    print('')
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    output = []
    current = []
    size = 1  # length of the indent
    for word in tweet_content.split(" "):
        if current and size + 1 + len(word) > print_width:
            output.append(" ".join(current).strip())
            current, size = [], 1
        size = size + 1 + len(word) if current else size + len(word)
        current.append(word)
    output.append(" ".join(current).strip())
    for text in output:
        print(" " + text)
# --------------------------------------------
# 6330594921 (20.00) 412 (2021-03-01 14:52)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts items of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give 0.0 instead of raising ZeroDivisionError.
    """
    lower = get_unique(words_1 + words_2)
    if not lower:
        return 0.0
    upper = [item for item in words_1 if item in words_2]
    return len(upper) / len(lower)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            keyed.append([-coef, tweet_id])  # negated: best score sorts first
    return [[tweet_id, -neg] for neg, tweet_id in sorted(keyed)][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    Lines carry no trailing blank (which could exceed ``print_width``) and
    an over-long first word does not produce a whitespace-only line.
    """
    print("")
    print("#" + str(tweet_id) + " (" + str(round(jc_coef, 2)) + ")")
    line = None
    for word in tweet_content.split(' '):
        if line is None:
            line = " " + word
        elif len(line) + 1 + len(word) <= print_width:
            line += " " + word
        else:
            print(line)
            line = " " + word
    print(line)
#--------------------------------------------
# 6330595521 (15.83) 413 (2021-03-01 23:57)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    distinct = []
    for item in words:
        if item not in distinct:
            distinct.append(item)
    return distinct


def jaccard(words_1, words_2):
    """Return the Jaccard coefficient of two word lists.

    The numerator counts items of ``words_1`` found in ``words_2``; the
    denominator is the number of distinct words in both lists.  Two empty
    lists give the number 0.0 (previously the boolean ``False``).
    """
    union = get_unique(list(words_1) + list(words_2))
    if not union:
        return 0.0
    shared = sum(1 for item in words_1 if item in words_2)
    return shared / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept.  The sort is stable, so equal
    coefficients stay in ascending id order.
    """
    list_tweet = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)  # computed once per tweet
        if coef > 0:
            list_tweet.append([tweet_id, coef])
    top_n = sorted(list_tweet, key=lambda pair: -pair[1])
    return top_n[0:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a ``#id (coef)`` header followed by the wrapped tweet text.

    The coefficient is shown rounded to two decimals.  Text is wrapped at
    word boundaries to ``print_width`` (previously a fixed 48 characters,
    cutting through words).
    """
    print('#{:} ({:})'.format(tweet_id, round(jc_coef, 2)))
    line = None
    for word in tweet_content.split(' '):
        if line is None:
            line = ' ' + word
        elif len(line) + 1 + len(word) <= print_width:
            line += ' ' + word
        else:
            print(line)
            line = ' ' + word
    print(line)
#--------------------------------------------
# 6331138621 (19.90) 414 (2021-03-01 23:50)
def get_unique(words):
    """Return the distinct items of ``words`` in first-seen order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def jaccard(words_1, words_2):
    """Return |intersection| / |union| of the two word lists.

    Both the shared words and the union are de-duplicated.  Two empty
    lists give 0.0 instead of raising ZeroDivisionError.
    """
    union = get_unique(words_1 + words_2)
    if not union:
        return 0.0
    shared = get_unique([item for item in words_1 if item in words_2])
    return len(shared) / len(union)


def top_n_similarity(norm_tweets, norm_query, n):
    """Return up to ``n`` ``[tweet_id, coefficient]`` pairs, best match first.

    Only positive coefficients are kept; ties are ordered by ascending id.
    """
    keyed = []
    for tweet_id, tweet in enumerate(norm_tweets):
        coef = jaccard(tweet, norm_query)
        if coef > 0:
            keyed.append([(-1) * coef, tweet_id])
    return [[tweet_id, -neg] for neg, tweet_id in sorted(keyed)][:n]


def show_tweet(tweet_id, tweet_content, jc_coef, print_width):
    """Print a blank line, a ``#id (coef)`` header and the wrapped tweet text.

    The header has a space between the id and the parenthesised
    coefficient.  Lines carry no trailing blank and an over-long first
    word does not produce a whitespace-only line.
    """
    print('')
    print('#' + str(tweet_id) + ' (' + str(round(jc_coef, 2)) + ')')
    show_output = None
    for word in tweet_content.split(' '):
        if show_output is None:
            show_output = " " + word
        elif len(show_output) + 1 + len(word) <= print_width:
            show_output += " " + word
        else:
            print(show_output)
            show_output = " " + word
    print(show_output)
#--------------------------------------------