0.0% ≤ diff ≤ 50.0%

18 clusters, 50 submissions

ALL: cluster #1 (2)

# 6330303621 (2021-03-21 22:13) %diff = 12.08 file_name = input('File name = ',) a = input("Use feature hashing ? (y,Y,n,N) ",) while a != 'n' and a != 'N' and a != 'y' and a != 'Y' : print('Try again.') a = input("Use feature hashing ? (y,Y,n,N) ",) if a == 'y' or a == 'Y' : b = int(input('M = ',)) print('-------------------') def fhash(w, M) : n = 0 for i in range(len(w)) : n = n+(ord(w[i])*37**i) n %= M return n def count_words(w) : count5 = 0 for i in range(len(g)): if g[i] == w : count5 += 1 else : count5 += 0 return count5 sw = open('stopwords.txt', 'r') f = open(file_name, 'r') ff1 = f.read().strip() ff = ff1.split() fff = " ".join(ff) sww = sw.read().strip().split() count1 = 0 for line in ff1 : if line == '\n' : count1+=0 else : count1 += len(line) print('char count =', count1) count2 = 0 x=[] z=[] for line in ff : for i in range(len(line)) : if line[i].isalnum()==True : count2 += 1 x.append(line[i]) y = ''.join(x) else : continue z.append(y) x=[] print('alphanumeric count =', count2) count3 = 0 f = open(file_name, 'r') for line in f : count3 += 1 print('line count =', count3) count4 = 0 for i in range(len(fff)) : if fff[i]==fff[0] : continue if fff[i].isalnum()==False and fff[i].isalnum() != fff[i-1].isalnum() : count4 += 1 else : continue print('word count =', count4) BoW = [] BoW0 = [] BoW1 = [] BoW2 = [] g = " ".join(z).lower().split() if a == 'y' or a == 'Y' : for i in range(len(g)) : if g[i] not in sww and g[i] not in BoW0 : BoW0.append(g[i]) BoW1.append([fhash(g[i],b), count_words(g[i])]) k = sorted(BoW1) for i in range(len(k)) : if i < len(k)-1 : for j in range(i+1,len(k)) : if k[i][0]==k[j][0] : k[i][1]+=k[j][1] for i in range(len(k)) : if k[i][0]==k[i-1][0] : continue else : BoW.append(k[i]) print('BoW =', BoW) elif a == 'n' or a == 'N' : for i in range(len(g)) : if g[i] not in sww : BoW2.append([g[i], count_words(g[i])]) k = sorted(BoW2) for i in range(len(k)) : if k[i][0]==k[i-1][0] : continue else : BoW.append(k[i]) print('BoW =', BoW) 
f.close() sw.close()# 6330565721 (2021-03-22 21:11) %diff = 12.08 file_name = input('File name = ',) x = input("Use feature hashing ? (y,Y,n,N) ",) while x != 'n' and x != 'N' and x != 'y' and x != 'Y' : print('Try again.') a = input("Use feature hashing ? (y,Y,n,N) ",) if x == 'y' or x == 'Y' : y = int(input('M = ',)) print('-------------------') def fhash(o, p) : f = 0 for i in range(len(o)) : f = f+(ord(o[i])*37**i) f %= p return f def count_words(o) : ct5 = 0 for i in range(len(h)): if h[i] == o : ct5 += 1 else : ct5 += 0 return ct5 s = open('stopwords.txt', 'r') ; z = open(file_name, 'r') ; z2 = z.read().strip() ; z3 = z2.split() ; z4 = " ".join(z3) ss = s.read().strip().split() ct1 = 0 for line in z2 : if line == '\n' : ct1+=0 else : ct1 += len(line) print('char count =', ct1) ct2 = 0 ; a=[] ; b=[] for line in ff : for i in range(len(line)) : if line[i].isalnum()==True : ct2 += 1 a.append(line[i]) b = ''.join(a) else : continue c.append(b) a=[] print('alphanumeric count =', ct2) ct3 = 0 z = open(file_name, 'r') for line in z : ct3 += 1 print('line count =', ct3) ct4 = 0 ; b=[] for i in range(len(z4)) : if z4[i]==z4[0] : b.append(z4[i]) continue if z4[i].isalnum()==False and z4[i].isalnum() != z4[i-1].isalnum() : ct4 += 1 else : b.append(z4[i]) continue print('word count =', ct4) BW = [] ; BW0 = [] ; BW1 = [] ; BW2 = [] h = " ".join(c).lower().split() if x == 'y' or x == 'Y' : for i in range(len(h)) : if h[i] not in ss and h[i] not in BW0 : BW0.append(h[i]) BW1.append([fhash(h[i],b), count_words(h[i])]) q = sorted(BW1) for i in range(len(q)) : if i < len(q)-1 : for j in range(i+1,len(q)) : if q[i][0]==q[j][0] : q[i][1]+=q[j][1] for i in range(len(q)) : if q[i][0]==q[i-1][0] : continue else : BW.append(q[i]) print('BoW =', BW) elif x == 'n' or x == 'N' : for i in range(len(h)) : if h[i] not in ss : BW2.append([h[i], count_words(h[i])]) q = sorted(BoW2) for i in range(len(q)) : if q[i][0]==q[i-1][0] : continue else : BoW.append(k[i]) print('BoW =', BW) z.close() 
s.close()

ALL: cluster #2 (2)

# 6330426121 (2021-03-22 18:01) %diff = 13.88 def fhash(w,M): x = 0 for i in range(len(w)): x += ord(w[i])*37**i return x%M def to_alpha(s): i = 0 for c in s: if c.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': i += 1 return i def check_word(s): x = [] w = '' for c in s: if c.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': w += c else: if w != '': x.append(w) w = '' if w != '': x.append(w) return x file_name = input('File name = ') checkfh = input('Use feature hashing ? (y,Y,n,N) ') while True: if checkfh == 'y' or checkfh == 'Y': M = int(input('M = ')) checkfh = True break elif checkfh == 'n' or checkfh == 'N': checkfh = False break else: print('Try again.') checkfh = input('Use feature hashing ? (y,Y,n,N) ') print('-------------------') stopword = open('stopwords.txt', 'r') t = open(file_name, 'r') stw = [] for line in stopword: for e in line.strip().split(): stw.append(e.lower()) stopword.close() count_line = 0 count_c = 0 count_alp = 0 count_word = 0 for line in t: count_line += 1 count_c += len(line) count_alp += to_alpha(line) count_word += len(check_word(line)) count_c -= count_line-1 t.close() t = open(file_name, 'r') BoW = [] if checkfh == True: W_in_BoW = [] for line in t: for e in check_word(line): if e.lower() not in stw: if fhash(e.lower(),M) not in W_in_BoW: W_in_BoW.append(fhash(e.lower(),M)) BoW.append([fhash(e.lower(),M),1]) else: for i in range(len(BoW)): if BoW[i][0] == fhash(e.lower(),M): BoW[i][1] += 1 BoW.sort() else: W_in_BoW = [] for line in t: for e in check_word(line): if e.lower() not in stw: if e.lower() not in W_in_BoW: W_in_BoW.append(e.lower()) BoW.append([e.lower(),1]) else: for i in range(len(BoW)): if BoW[i][0] == e.lower(): BoW[i][1] += 1 t.close() print('char count =',count_c) print('alphanumeric count =',count_alp) print('line count =',count_line) print('word count =',count_word) print('BoW =',BoW)# 6330433521 (2021-03-22 16:52) %diff = 13.88 def fhash(w,M): nword = [] f = 0 for e in w: nword.append(e[:len(e)+1]) for i in 
range(len(nword)): f += (ord(nword[i])*37**i) return f%M def alphabet(t): c = 0 for e in t: if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': c += 1 return c def chword(t): x = [] w = '' for e in t: if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': w += e else: if w != '': x.append(w) w = '' return x file_name = input('File name = ') feature = input('Use feature hashing ? (y,Y,n,N) ') while True: if feature == 'y' or feature == 'Y': M = int(input('M = ')) feature = True break elif feature == 'n' or feature == 'N': feature = False break else: print('Try again.') feature = input('Use feature hashing ? (y,Y,n,N) ') print('-------------------') stopword = open('stopwords.txt', 'r') fle = open(file_name,'r') stop = [] for line in stopword: for e in line.strip().split(): stop.append(e.lower()) stopword.close chcount = 0 alcount = 0 lincount = 0 wcount = 0 for line in fle: lincount += 1 chcount += len(line) alcount += alphabet(line) wcount += len(chword(line)) chcount = chcount - lincount +1 fle.close() fle = open(file_name, 'r') bow = [] if feature == True: wbow = [] for line in fle: for e in chword(line): if e.lower() not in stop: if fhash(e.lower(),M) not in wbow: wbow.append(fhash(e.lower(),M)) bow.append([fhash(e.lower(),M), 1]) else: for i in range(len(bow)): if bow[i][0] == fhash(e.lower(),M): bow[i][1] += 1 bow.sort() else: wbow = [] for line in fle: for e in chword(line): if e.lower() not in stop: if e.lower() not in wbow: wbow.append(e.lower()) bow.append([e.lower(),1]) else: for i in range(len(bow)): if bow[i][0] == e.lower(): bow[i][1] += 1 fle.close() print('char count = ', chcount) print('alphanumeric count = ', alcount) print('line count = ', lincount) print('word count = ', wcount) print('BoW = ', bow)

ALL: cluster #3 (5)

# 6330362621 (2021-03-22 00:39) %diff = 26.74 filename=input('File name = ') feature=input('Use feature hashing ? (y,Y,n,N) ') while feature not in 'yYnN': print('Try again.') feature=input('Use feature hashing ? (y,Y,n,N) ') if feature in "yY": M=int(input("M = ")) print('-------------------') file= open(filename).read().lower().strip('\n') file2= open('stopwords.txt').read().lower() charcount =0 alphanumericcount=0 linecount=1 word=[] word2=[] a='' b='' def fhash(w,M): ans=0 for i in range(len(w)): ans+=ord(w[i])*(37**i) return ans%M for e in file: if e!='\n': charcount+=1 else: linecount+=1 if 'a'<=e<='z' or '0'<= e<='9': alphanumericcount+=1 a+=e else: if a!='': word.append(a) a='' print('char count =',charcount) print('alphanumeric count =',alphanumericcount) print('line count =',linecount) if a!='': word.append(a) wordcount=len(word) print('word count =',wordcount) for x in file2: if 'a'<=x<='z' or '0'<=x<='9': b+=x else: if b!='': word2.append(b) b='' if b!='': word2.append(b) for e in word2: for i in range(word.count(e)): word.remove(e) if feature in "yY": for i in range(len(word)): word[i]=fhash(word[i],M) BoW =[] a=[] for e in word: if e not in a: BoW.append([e,word.count(e)]) a.append(e) BoW.sort() print('BoW =',BoW)# 6330375821 (2021-03-22 22:25) %diff = 26.74 file_name=input('File name = ') ft=input('Use feature hashing ? (y,Y,n,N) ') while ft not in 'yYnN': print('Try again.') ft=input('Use feature hashing ? 
(y,Y,n,N) ') if ft in "yY": M=int(input("M = ")) print('-------------------') file= open(file_name).read().lower().strip('\n') file2= open('stopwords.txt').read().lower() Ch_c,Ap_c,L_c,word,word2,A,B =0,0,1,[],[],'','' def fhash(w,M): ans=0 for i in range(len(w)): ans+=ord(w[i])*(37**i) return ans%M def PrBow(word): Bow,A =[],[] for e in word: if e not in A: Bow.append([e,word.count(e)]) A.append(e) Bow.sort() print('BoW =',Bow) return(Bow) for e in file: if e!='\n': Ch_c+=1 else: L_c+=1 if 'A'<=e<='z' or '0'<= e<='9': Ap_c+=1 A+=e else: if A!='': word.append(A) A='' print('char count =',Ch_c) print('alphanumeric count =',Ap_c) print('line count =',L_c) if A!='': word.append(A) word_count=len(word) print('word count =',word_count) for x in file2: if 'A'<=x<='z' or '0'<=x<='9': B+=x else: if B!='': word2.append(B) B='' if B!='': word2.append(B) for e in word2: for i in range(word.count(e)): word.remove(e) if ft in "yY": for i in range(len(word)): word[i]=fhash(word[i],M) PrBow(word)# 6330355221 (2021-03-22 23:03) %diff = 34.41 #--------------------------------------- def fhash(w,M): confhash = 0 for i in range(len(w)): confhash += ord(w[i])*(37**i) return confhash % M #--------------------------------------- vala = '' valb = '' vocab_one = [] vocab_two = [] linecount = 1 sarawordcount = 0 sicticcount = 0 #--------------------------------------- list_Fileimport = input('File name = ') thename_char = input('Use feature hashing ? (y,Y,n,N) ') #--------------------------------------- while thename_char not in 'yYnN': print('Try again.') thename_char = input('Use feature hashing ? 
(y,Y,n,N) ') if thename_char in "yY": case_one = int(input("M = ")) print('-------------------') linefilea = open('stopwords.txt') linefileaa = linefilea.read() open_filetwo = linefileaa.lower() linefile = open(list_Fileimport) linefiles = linefile.read() linefiless = linefiles.lower() open_file = linefiless.strip('\n') #--------------------------------------- for i_e in open_file: if i_e != '\n': sicticcount += 1 else: linecount += 1 if 'a'<= i_e <='z' or '0'<= i_e <='9': sarawordcount += 1 vala += i_e elif vala != '': vocab_one.append(vala) vala = '' print('char count =', sicticcount) print('alphanumeric count =', sarawordcount) print('line count =', linecount) #--------------------------------------- if vala != '': vocab_one.append(vala) countvocab = len(vocab_one) print('word count =',countvocab) for i_j in open_filetwo : if '0'<= i_j <='9' or 'a'<= i_j <='z': valb+=i_j elif valb != '': vocab_two.append(valb) valb = '' if valb!='': vocab_two.append(valb) for i_o in vocab_two: for i in range(vocab_one.count(i_o)): vocab_one.remove(i_o) if thename_char in "yY": for i in range(len(vocab_one)): vocab_one[i] = fhash(vocab_one[i],case_one) #--------------------------------------- vala = [] finalBoW =[] for i_k in vocab_one: if i_k not in vala: finalBoW.append([i_k,vocab_one.count(i_k)]) vala.append(i_k) #--------------------------------------- finalBoW.sort() print('BoW =',finalBoW)# 6330574321 (2021-03-22 22:28) %diff = 42.96 file=input("File name = ") file1="sample.txt" file2="stopwords.txt" fea=input("Use feature hashing ? (y,Y,n,N) ") a=["y","Y","n","N"] b=["y","Y"] while fea not in a: print("Try again.") fea=input("Use feature hashing ? 
(y,Y,n,N) ") if fea in b: M=int(input("M = ")) f1=open(file1,"r").read().lower() f2=open(file2,"r").read().lower() def fhash(j,p): c=0 A=len(j) for e in range(A): c=c+ord(j[e])*(37**e) d=int(c%p) return d k=0 l=1 n=0 w=[] s=[] o=[] r=[] x="" y="" for i in f1: if i!="\n": k=k+1 else: l=l+1 if "0"<=i<="9" or "a"<=i<="z": n=n+1 x=x+i elif x!="": w.append(x) x="" print("-------------------") K=str(k) print("char count = "+K) N=str(n) print("alphanumeric count = "+N) L=str(l) print("line count = "+L) if x!="": word.append(x) x="" W2=str(len(w)) print("word count = "+W2) for e in f2: if "0"<=e<="9" or "a"<=e<="z": y=y+e elif y!="": s.append(y) y="" if y!="": s.append(y) for e in s: for i in range(w.count(e)): w.remove(e) w1=len(w) if fea in b: for i in range(w1): w[i]=fhash(w[i],M) for e in w: if e not in r: o.append([e,w.count(e)]) r.append(e) o.sort() O=str(o) print("BoW = ",O)# 6330467921 (2021-03-21 23:06) %diff = 49.49 def fhash(w,M): s = 0 for i in range(len(w)): s += ord(w[i])*(37**i) fh = s%M return fh def count(word, wordslist): c = 0 for w in wordslist: if w == word: c += 1 return c file_name = input('File name = ') yn = input('Use feature hashing ? (y,Y,n,N) ') while yn not in 'yYnN': print('Try again.') yn = input('Use feature hashing ? 
(y,Y,n,N) ') if yn == "y" or yn == 'Y': M = int(input('M = ')) print('-------------------') stopwords = [] stopfile = open("stopwords.txt","r") for line in stopfile: line = line.lower() if len(line) > 0: stopwords += line.split() stopfile.close() abnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' cc = 0 abc123 = 0 lc = 0 words = [] file = open(file_name,"r") for line in file: l = '' for a in line: if a != '\n': cc += 1 if a not in abnum: l += ' ' else: l += a abc123 +=1 words += l.split() if len(line) > 0: lc += 1 file.close() print('char count =',cc) print('alphanumeric count =',abc123) print('line count =',lc) print('word count =',len(words)) for i in range(len(words)): words[i] = words[i].lower() for i in range(len(stopwords)): stopwords[i] = stopwords[i].lower() cut_words = [] for a in words: if a not in stopwords: cut_words.append(a) if yn == 'y' or yn == 'Y': for i in range(len(cut_words)): cut_words[i] = fhash(cut_words[i],M) bow = [] for e in cut_words: if e not in bow: bow.append(e) for i in range(len(bow)): bow[i] = [bow[i], count(bow[i],cut_words)] bow.sort() print('BoW =',bow)

ALL: cluster #4 (7)

# 6330241321 (2021-03-22 23:59) %diff = 27.07 def fhash(w, M) : a = 0 for i in range(len(w)) : a = a + (ord(w[i]) * ((37)**i)) a = a % M return a file_name = input("File name = ") ans = input("Use feature hashing ? (y,Y,n,N) ") M = 0 while ans != "n" and ans != "N" and ans != "y" and ans != "Y" : print("Try again.") ans = input("Use feature hashing ? (y,Y,n,N) ") if ans.lower() == "n": ans = False else : M = int(input("M = ")) ans = True print("-"*19) l1 = 0 l2 = 0 lineCount = 0 words = [] file_words = open(file_name, "r") for line in file_words : lineCount = lineCount + 1 for c in line : l1 = l1 + 1 if c == "\n" : l1 = l1 - 1 if ("0" <= c <= "9") or ("a" <= c <= "z") or ("A" <= c <= "Z") : l2 = l2 + 1 word = "" for c in line : if ("0" <= c <= "9") or ("a" <= c <= "z") or ("A" <= c <= "Z") : word = word + c else : if len(word) != 0 : words.append(word) word = "" file_words.close() stopwords = [] file_stopwords = open("stopwords.txt", "r") for line in file_stopwords : for w in line.split() : w = w.lower() if w not in stopwords : stopwords.append(w) file_stopwords.close() a = [] for c in words : c = c.lower() if c in stopwords : pass else : have = False if ans : d = fhash(c, M) for i in range(len(a)) : if a[i][0] == d : a[i][1] = a[i][1] + 1 have = True break if not have : a.append([d, 1]) else: for i in range(len(a)) : if a[i][0] == c : a[i][1] = a[i][1] + 1 have = True break if not have : a.append([c, 1]) print("char count =", l1) print("alphanumeric count =", l2) print("line count =", lineCount) print("word count =", len(words)) print("BoW =", a)# 6330257421 (2021-03-22 19:43) %diff = 27.07 def fhash(w, M): G = 37 r = 0 for i in range(len(w)): r += (ord(w[i]) * (G**i)) return r % M tx = input('File name = ') hashing = input('Use feature hashing ? (y,Y,n,N) ') if(hashing == ''): hashing = 'a' while hashing not in 'yYnN ': hashing = input('Use feature hashing ? 
(y,Y,n,N) ') if(hashing == ''): hashing = 'a' print('Try again.') if(hashing in 'yY'): m = int(input('M = ')) print('-------------------') i = 0 j = 0 lc = 0 words = [] file = open(tx, 'r') for l in file: lc=lc+1 for k in l: i=i+1 if(k == '\n'): i=i-1 if('a'<=k<='z')or('A'<=k<='Z')or('0'<=k<='9'): j=j+1 word = '' for k in l: if('a'<=k<='z')or('A'<=k<='Z')or('0'<=k<='9'): word=word+k elif len(word) != 0: words.append(word) word = '' file.close() stopword = [] stop = open('stopwords.txt', 'r') for line in stop: for word in line.strip().split(): word = word.lower() if word not in stopword: stopword.append(word) stop.close() r = [] for c in words: c = c.lower() if c not in stopword: if hashing in 'yY': cEdit = fhash(c, m) for x in range(len(r)): if r[x][0] == cEdit: r[x][1] += 1 break else: r.append([cEdit, 1]) else: for x in range(len(r)): if r[x][0] == c: r[x][1] += 1 break else: r.append([c, 1]) print('char count =', i) print('alphanumeric count =', j) print('line count =', lc) print('word count =', len(words)) print('BoW =', r)# 6330188821 (2021-03-22 23:18) %diff = 33.55 file_name = input("File name = ") BoW = input("feature hashing ? (y,Y,n,N) ") M = - 1 while BoW not in "nNyY": print ("try again") BoW = input("feature hashing ? 
(y,Y,n,N) ") if BoW in "Yy": M = int(input("M = ")) BoW = True else: BoW = False print("-------------------") a = [] stop = open("stopwords.txt" , "r") for line in stop: for x in line.strip().split(): x = x.lower() if x not in a: a.append(x) stop.close() len1 = 0 len2 = 0 linecount = 0 words = [] file = open(file_name , "r") for line in file: linecount += 1 for b in line: len1 += 1 if ("A"<= b <= "Z") or ("a"<= b <="z") or ("0" <= b <= "9"): len2 += 1 if b == "\n": len1 -= 1 word = '' for b in line: if ('A' <= b <= 'Z') or ('a' <= b <= 'z') or ('0' <= b <= '9'): word += b else: if len(word) != 0: words.append(word) word = "" file.close() def get(words, stopWords, isBoW, M): k = [] for p in words: p = p.lower() if p in stopWords: pass else: found = False if BoW: G = 37 r = 0 for i in range(len(x)): p = ord(x[i]) p = p * (G**i) r += p Edit = r % M else: for i in range(len(k)): if k[i][0]==Edit: k[i][1] += 1 found = True break if not found: k.append([Edit, 1]) else: for i in range(len(k)): if k[i][0] == p: k[i][1] += 1 found = True break if not found : k.append([p, 1]) return k print("char count=", len1) print("alphanumeric count", len2) print("line count=", linecount) print("word count =", len(words)) print("BoW =", get(words, a, BoW, M))# 6330487421 (2021-03-22 23:39) %diff = 33.55 def iinput(): M=-1 file_name = input('File name = ') wantfhash = input('Use feature hashing ? (y,Y,n,N) ') while wantfhash not in ['y', 'Y', 'n', 'N']: print('Try again.') wantfhash = input('Use feature hashing ? 
(y,Y,n,N) ') if wantfhash in ['y', 'Y']: M = int(input('M = ')) wantfhash = True else: wantfhash = False print('-------------------') return file_name, wantfhash, M def sstopwords(): x = [] stopWordsFile = open('stopwords.txt', 'r') for line in stopWordsFile: for word in line.strip().split(): word = word.lower() if word not in x: x.append(word) stopWordsFile.close() return x def wwords(file_name): q = 0 p = 0 lines = 0 words = [] wordsFile = open(file_name, 'r') for line in wordsFile: lines += 1 for y in line: q += 1 if y == '\n': q -= 1 if ('A' <= y <= 'Z') or('a' <= y <= 'z') or ('0' <= y <= '9') : p += 1 w = '' for y in line: if ('0' <= y <= '9') or ('A' <= y <= 'Z') or('a' <= y <= 'z') : w += y else: if len(w) != 0: words.append(w) w = '' wordsFile.close() return q, p , lines, words def fhash(w, M): G = 37 x = 0 for i in range(len(w)): x += (ord(w[i])*(G**i)) return x % M def bbow(words, stopWords, wantfhash, M): r = [] for y in words: y = y.lower() if y in stopWords: pass else: found = False if wantfhash: cEdit = fhash(y, M) for i in range(len(r)): if r[i][0] == cEdit: r[i][1] += 1 found = True break if not found: r.append([cEdit, 1]) else: for i in range(len(r)): if r[i][0] == y: r[i][1] += 1 found = True break if not found: r.append([y, 1]) return r #---------------------------------------------------------------------------- file_name, wantfhash, M = iinput() stopWords = sstopwords() q, p , lines, words = wwords(file_name) print('char count =', q) print('alphanumeric count =', p) print('line count =', lines) print('word count =', len(words)) print('BoW =', bbow(words, stopWords, wantfhash, M))# 6330477121 (2021-03-22 23:55) %diff = 38.01 def Input_data(): Count = 0 M = -1 File_name_input = input('File name = ') BoW_num = input('Use feature hashing ? (y,Y,n,N) ') while BoW_num not in ['Y', 'y', 'N', 'n']: print('Try again.') BoW_num = input('Use feature hashing ? 
(y,Y,n,N) ') Count += 1 if BoW_num in ['y', 'Y']: M = int(input('M = ')) BoW_num = True elif BoW_num in ['n' , 'N']: BoW_num = False else: pass print('-------------------') return File_name_input, BoW_num, M def TikTok(w, M): Start = 37 Second = 0 for i in range(len(w)): Second += ((Start**i) * ord(w[i])) Ans = (Second % M) return Ans def Words_Func(File_name_input): Lenght1= 0 Lenght2 = 0 Num_Line = 0 words = [] Count = 0 word = '' wordsFile = open(File_name_input, 'r') for line in wordsFile: Num_Line += 1 for c in line: Lenght1+= 1 if c == '\n': Lenght1-= 1 else: pass #Count += 1 for c in line: if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or ('0' <= c <= '9'): Lenght2 += 1 word += c else: if len(word) != 0: words.append(word) else: False #Count += 1 word = '' wordsFile.close() return Lenght1 , Lenght2 , Num_Line, words def StopWords_Func(): r = [] File_Of_stopWords = open('stopwords.txt', 'r') Count = 0 for line in str(File_Of_stopWords): for i in line.strip().split(): i = i.lower() if i not in r: r.append(i) else: False #Count += 1 File_Of_stopWords.close() return r def BoW_Ans(words, stopWords, BoW_num, M): Ans = [] for j in words: j = j.lower() if j in stopWords: pass else: Check = False if BoW_num: Edit = TikTok(j, M) for i in range(len(Ans)): if Ans[i][0] == Edit: Ans[i][1] += 1 Check = True break else: pass if not Check: Ans.append([Edit, 1]) else: pass else: for i in range(len(Ans)): if Ans[i][0] == j: Ans[i][1] += 1 Check = True break else: False if not Check: Ans.append([j, 1]) else: pass return Ans File_name_input,\ BoW_num, \ M = Input_data() stopWords = StopWords_Func() Lenght1,\ Lenght2 ,\ Num_Line, words = Words_Func(File_name_input) print('char count = ', Lenght1) print('alphanumeric count = ', Lenght2) print('line count = ', Num_Line) print('word count = ', len(words)) print('BoW = ', BoW_Ans(words, stopWords, BoW_num, M))# 6330255121 (2021-03-22 23:47) %diff = 47.63 file_name = input('File name = ') fh = input('Use feature hashing ? 
(y,Y,n,N) ') while fh != 'y' and fh != 'Y' and fh != 'n' and fh != 'N': print('Try again.') fh = input('Use feature hashing ? (y,Y,n,N) ') if fh == 'y' or fh == 'Y': M = int(input('M = ')) print('-------------------') def tostopwords(): stopwordlist = [] stopwords = open('stopwords.txt', 'r') for line in stopwords: for s in line.split(): s = s.lower() if s not in stopwordlist: stopwordlist.append(s) stopwords.close() return stopwordlist lenght = 0 wfile = open(file_name, 'r') for w in wfile: for ww in w: if ww != '\n': lenght += 1 wfile.close() l = 0 words = [] word = '' wfile = open(file_name, 'r') for w in wfile: for ww in w: if ('a'<=ww<='z') or ('A'<=ww<='Z') or ('0'<=ww<='9'): word += ww else: l += len(word) words.append(word) word = '' wfile.close() words2 = [] for w in words: if w != '': words2.append(w) linecount = 0 wfile = open(file_name, 'r') for line in wfile: linecount += 1 wfile.close() def fhash(word,M): G = 37 v = 0 for i in range(len(word)): v += ord(word[i])*(G**i) f = v % M return f #---------------------------------------------------------------- for w in words: w = w.lower() print('char count =', lenght) print('alphanumeric count =', l) print('line count =', linecount) print('word count =', len(words2)) print('BoW =', )# 6330459921 (2021-03-21 17:25) %diff = 47.89 # -------------------------------------------------- def fhash(w,M) : c = [] for i in range(len(w)) : if ("a" <= w[i] <= "z") or ("A" <= w[i] <= "Z") or ("0" <= w[i] <= "9"): c.append(w[i]) a = [] for i in range(len(c)) : x = str(ord(c[i])) a.append(x) ass = 0 for i in range(len(a)): z = int(a[i]) * (37**i ) ass += z ass = ass % M return ass # -------------------------------------------------- file_name = input("File name = ") useBoW = input("Use feature hashing ? 
(y,Y,n,N)") # หา "char count" , "alphanumeric count " , "line count" , "words count" charcount = 0 alphanumericcount = 0 newline = 0 words = [] linecount = 0 file = open(file_name , "r") for line in file : linecount += 1 for c in line : charcount += 1 if c == "\n" : newline += 1 if "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9" : alphanumericcount += 1 word = "" for c in line: if "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9" : word += c else : if len(word) != 0 : words.append(word) word = "" charcount = charcount - newline file.close() #หาstop words stopwords = [] stopw = open( "stopwords.txt" , "r") for line in stopw : for w in line.strip().split() : w = w.lower() if w not in stopwords : stopwords.append(w) while useBoW not in ["y" , "Y" , "n" , "N"] : print("Try again.") useBoW = input("Use feature hashing ? (y,Y,n,N)") BoW = [] if useBoW in ["y" , "Y"] : M = int(input("M = ")) print("-------------------") for c in words : c = c.lower() if c in stopwords : pass else : found = 0 newc = fhash(c,M) for i in range(len(BoW)) : if BoW[i][0] == newc : BoW[i][1] += 1 found = 1 break if not found : BoW.append([newc,1]) if useBoW in ["n" , "N"] : print("-------------------") for c in words : c = c.lower() if c in stopwords : pass else : found = 0 for i in range(len(BoW)) : if BoW[i][0] == c : BoW[i][1] += 1 found = 1 break if not found : BoW.append([c,1]) print("char count = " , charcount) print("alphanumeric count = " , alphanumericcount) print("line count = " , linecount) print("word count = " , len(words)) print("BoW = " , BoW)

ALL: cluster #5 (2)

# 6330234021 (2021-03-22 22:03) %diff = 30.54 def char_count(file_name): ### fn = open(file_name) c = 0 for line in fn : for e in line : if e != '\n' : c += 1 fn.close() return c def alphanumeric_count(file_name) : ### fn = open(file_name) c = '' c_1 = "\"\'/\\,.:; " for line in fn : for e in line : if e not in c_1 : c += e fn.close() return len(c)- line_count(file_name) + 1 def line_count(file_name) : ### fn = open(file_name) c = 0 for line in fn : c += 1 fn.close() return c def word_count(file_name) : ### f = open(file_name) c = '' x = 0 alp = 'abcdefghijklmnopqrstuvwxyz0123456789' for line in f : for g in line : if g.lower() in alp : c += g else : c += ' ' x += len(c.split()) c = '' f.close() return x def BoW(file_name , stopwords) : ### f1 = open(file_name) f2 = open(stopwords) l = [] cc = '' d2 = [] alp = 'abcdefghijklmnopqrstuvwxyz0123456789' c ='' for line in f1 : for g in line : if g.lower() in alp : cc += g.lower() else : cc += ' ' for line in f2 : c += ' ' if line[-1] == '\n' : line = line[:-1] for g in line : c += g.lower() c1 = c.split() c2 = '' for r in cc.split() : if r not in c1 : c2 += r + ' ' d = c2.split() for d1 in d : if d1 not in d2 : d2.append(d1) for e in d2 : c3 = 0 w = 0 while c2.find(e,w) != -1 : c3 += 1 w += c2.find(e,w) + 1 l.append([e,c3]) l.sort() f1.close() f2.close() return l def feature_harshing(l,M) : ### x = [] y = '' for l1 in l : c = 0 c1 = 0 for l3 in l1[0] : c += ord(l3)*(37**c1) c1 += 1 flash = c % M y += str(flash)*l1[1] for i in range(M): c2 = 0 c3 = 0 while y.find(str(i),c3) != -1 : c2 += 1 c3 = y.find(str(i),c3) + 1 if c2 != 0 : x.append([i,c2]) return x def display(file_name , stopwords , x) : print("-------------------") print("char_count =", char_count(file_name)) print("alphanumeric_count =", alphanumeric_count(file_name)) print("line_count =", line_count(file_name)) print("word_count =", word_count(file_name)) if x == '0' : print('BoW =' , BoW(file_name , stopwords) ) if x == '1' : print('BoW =' , 
feature_harshing(BoW(file_name , stopwords),M)) file_name = input("File name = ") x = input("use feature hashing ? (y,Y,n,N)") while x not in ['y','Y','n','N'] : print('Try again.') x = input("use feature hashing ? (y,Y,n,N)") if x == 'y' or x == 'Y' : M = int(input("M = ")) display(file_name , 'stopwords.txt' , '1') else : display(file_name , 'stopwords.txt' , '0')# 6330349521 (2021-03-22 23:53) %diff = 30.54 def char_count(file_name): f = open(file_name) c = 0 d = 0 for line in f: c += len(line) if line[-1::]=='\n': d += 1 f.close() return c-d def alnum_count(file_name): f = open(file_name) c = 0 for line in f: for g in line: if g.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': c += 1 f.close() return c def line_count(file_name): f = open(file_name) c = 0 for line in f: c += 1 f.close() return c def word_count(file_name): f = open(file_name) c = '' wc = 0 for line in f: for g in line: if g.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': c += g else: c += ' ' wc += len(c.split()) c = '' f.close() return wc def BoW(file_name,stopwords): f1 = open(file_name) f2 = open(stopwords) lb = [] lc = [] cfn = '' d2 = [] csw = '' cb = '.' for line in f1: for g in line: if g.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': cfn += g.lower() else: cfn += ' ' for line in f2: csw += ' ' if line[-1::1] == '\n': line = line[0:-1:1] for g in line: csw += g.lower() for r in cfn.split(): if r not in csw.split(): cb += r cb += '.' 
for cdc in cb.split('.'): if cdc not in lc: lc.append(cdc) for e in lc: if e == '': pass else: cnb = 0 w = 0 while cb.find(e,w)!=-1: cnb += 1 w = cb.find(e,w)+1 lb.append([e, cnb]) lb.sort() f1.close() f2.close() return lb def feature_harshing(l,M): s = [] f = '' for l1 in l: c = 0 c1 = 0 for l3 in l1[0]: c += ord(l3)*(37**c1) c1 += 1 fhash = c%M f += (str(fhash)+'.')*l1[1] for i in range(M): c2 = 0 c3 = 0 while f.find(str(i),c3) != -1: c2 += 1 c3 = f.find(str(i),c3)+1 if c2 != 0: s.append([i, c2]) return s def display(file_name,stopwords,x): print('-------------------') print('char count =', char_count(file_name)) print('alphanumeric count =', alnum_count(file_name)) print('line count =', line_count(file_name)) print('word count =', word_count(file_name)) if x=='0': print('BoW =', BoW(file_name,stopwords)) if x=='1': print('BoW =',feature_harshing(BoW(file_name,stopwords),M)) file_name = input("File name = ") x = input("Use feature hashing ? (y,Y,n,N) ").strip() while x not in ['y','Y','n','N']: x = input("Use feature hashing ? (y,Y,n,N) ").strip() if x == 'y' or x == 'Y': M = int(input("M = ").strip()) display(file_name,'stopwords.txt','1') else: display(file_name,'stopwords.txt','0')

ALL: cluster #6 (2)

# 6330340821 (2021-03-21 02:21) %diff = 35.06 file_name=input('File name = ') op=input('Use feature hashing ? (y,Y,n,N) ') while op not in ['y','Y','n','N']: print('Try again.') op=input('Use feature hashing ? (y,Y,n,N) ') if op in ['y','Y']: M=input('M = ') print('-------------------') stop=open('stopwords.txt','r') file=open(file_name,'r') linecount=0 wordcount=0 xyz='' words=[] charcount=0 alphacount=0 for line in file: linecount+=1 charcount+=len(line) for e in line: if e.isalnum(): xyz+=e else: xyz+=" " word=xyz.split() wordcount+=len(word) for i in word: words.append(i.lower()) for e in range(len(word)): for u in word[e]: if u.lower() in'abcdefghijklmnopqrstuvwxyz0123456789': alphacount+=1 charcount=charcount-linecount+1 print('char count =',charcount) print('alphanumeric count =',alphacount) print('line count =',linecount) print('word count =',wordcount) aa=[] stopword=[] for line in stop: n= line.split() for i in n: stopword.append(i.lower()) ####################################### def removepunc(x): y=[] k='' for i in x: for e in i: if e not in '\'\"\(\),\/\\.:;-><+-*=' : k+=e y.append(k) k='' return y ####################################### for i in words: if i not in stopword: aa.append(i) ww= removepunc(aa) ####################################### w=[]#word n=[]#fre for i in range (len(ww)): if ww[i] not in w: w.append(ww[i]) n.append(1) else: n[w.index(ww[i])]+=1 wn=[] for i in range (len(w)): wn.append([w[i],n[i]]) wn.sort() ####################################### def fhash(w,M): G=37 y=0 for i in range (len(w)): y+=ord(w[i])*G**(i) z=y%int(M) return z ####################################### if op.lower()=='y': ss=[] tt=[] for i in range (len(ww)): if fhash(ww[i],M) not in ss: ss.append(fhash(ww[i],M)) tt.append(1) else: tt[ss.index(fhash(ww[i],M))]+=1 fn=[] for i in range (len(ss)): fn.append([ss[i],tt[i]]) fn.sort() print('BoW =',fn) else: print('BoW =',wn) ####################################### stop.close() file.close()# 6330481621 (2021-03-22 
19:51) %diff = 35.06 file_name=input("File_name= ") use=input("Use feature hashing ? (y,Y,n,N) ") while use not in ['y','Y','n','N']: print('Try again.') use=input("Use feature hashing ? (y,Y,n,N) ") if use in ['y','Y']: M=input("M = ") print('-------------------') stop=open('stopwords.txt','r') file=open(file_name,'r') cha=0 alpha=0 stw=[] linecount=0 wordcount=0 sen=''#ประโยคในfile ที่ cleanแล้ว for line in stop: a=line.split() for e in a: stw.append(e) for line in file: linecount+=1 a=line.split() cha+=len(line) for e in line: if e.isalnum(): sen+=e else: sen+=' ' word=sen.split() #[]คำสะอาด wordlow=[] #[]คำสะอาดพิมเล็ก wordcount=len(word) cha=cha-linecount+1 for e in word: wordlow.append(e.lower()) for i in range(len(word)): for e in word[i]: if '0'<=e<='9' or 'a'<=e.lower()<='z': alpha+=1 perfsen=' '.join(wordlow) #ประโยคสวย print('char count =',cha) print('alphanumeric count =',alpha) print('line count =',linecount) print('word count =',wordcount) #----------------------------------- def fhash(w,M): summ=0 G=37 for i in range(len(w)): summ+=ord(w[i])*G**i sol=summ%int(M) return sol #----------------------------------- def clean(s): a=[ '(', ')', '-', '_', '[', ']' ,'"' ,"'" ,';', ':', '>', '<','.' ] c=[] for i in range(len(s)): if s[i] not in a: c.append(s[i]) return c #----------------------------------- nsen=[] for e in wordlow: if e not in stw: nsen.append(e) newsen=clean(nsen) #----------------------------------- x=[] y=[] for i in range(len(newsen)): if newsen[i] not in x: x.append(newsen[i]) y.append(1) else: y[x.index(newsen[i])]+=1 block=[] for i in range(len(x)): block.append([x[i],y[i]]) block.sort() #----------------------------------- if use in ['y','Y']: o=[] p=[] for i in range(len(newsen)): if fhash(newsen[i],M) not in o: o.append(fhash(newsen[i],M)) p.append(1) else: p[o.index(fhash(newsen[i],M))]+=1 q=[] for i in range(len(o)): q.append([o[i],p[i]]) q.sort() print('BoW =',q) else: print('BoW =',block) stop.close() file.close()

ALL: cluster #7 (2)

# 6330248821 (2021-03-22 20:51) %diff = 35.17 alp = 'abcdefghijklmnopqrstuvwxyz' num = '0123456789' file = input('File name = ') x = input('Use feature hashing ? (y,Y,n,N) ').lower() M = '' def nFhash(w): BoW = [] bow = [] count = [] for word in w: if word not in bow: bow.append(word) count.append(int(1)) else : for i in range(len(bow)): if word == bow[i]: count[i] += 1 for j in range(len(bow)): BoW.append([bow[j],count[j]]) return BoW def yFhase(w,m): fhase = [] BoW = [] bow = [] count = [] for word in w: f = 0 for i in range(len(word)): f += ord(word[i]) * (37 ** i) fhase.append(f % int(m)) for e in fhase : if e not in bow: bow.append(e) count.append(1) else: for j in range(len(bow)): if e == bow[j]: count[j] += 1 for k in range(len(bow)): BoW.append([bow[k],count[k]]) return BoW while x not in'ny': print('Try again') x = input('Use feature hashing ? (y,Y,n,N) ').lower() if x == 'y': M = input('M = ') sFile = open('stopwords.txt','r') stop_words = [] for line in sFile: stop_words += line.split() stop_words = list(map(str.lower,stop_words)) sFile.close() wFile = open(file,'r') charCount = 0 alpCount = 0 lineCount = 0 wordCount = 0 words = [] text = '' for line in wFile: lineCount += 1 words += line.split() words = list(map(str.lower,words) ) for char in line.strip(): charCount += 1 for word in words: for alpnum in word: if alpnum in alp or alpnum in num: text += alpnum text += ' ' clearedWords = text.split() print(clearedWords) wordCount += len(clearedWords) for i in range(len(clearedWords)): alpCount += len(clearedWords[i]) print('-------------------') print('char count =',charCount) print('alphanumeric count =',alpCount) print('line count =',lineCount) print('word count =',wordCount) deletedWord = [] for w in clearedWords: if w not in stop_words: deletedWord.append(w) if x == 'y': print('BoW =',sorted(yFhase(deletedWord,M))) else : print('Bow =',sorted(nFhash(deletedWord)))# 6330474221 (2021-03-22 20:52) %diff = 35.17 alpnum = 
'abcdefghijklmnopqrstuvwxyz0123456789' file = input('File name = ') x = input('Use feature hashing ? (y,Y,n,N) ').lower() M = '' def Bag_of_words(words): BoW = [] word_list = [] count = [] for word in words: if word not in word_list: word_list.append(word) count.append(int(1)) else: for i in range(len(word_list)): if word_list[i] == word: count[i]+=1 for i in range(len(word_list)): BoW.append([word_list[i],count[i]]) return sorted(BoW) def fhash_BOW(BoW,M): BoW_hash = [] hash_list = [] hash_count = [] for word, count in BoW: num_hash = fhash(word,M) if num_hash not in hash_list: hash_list.append(num_hash) hash_count.append(count) else: for i in range(len(hash_list)): if num_hash == hash_list[i]: hash_count[i]+=count for i in range(len(hash_list)): BoW_hash.append([hash_list[i],hash_count[i]]) BoW_hash = sorted(BoW_hash) return BoW_hash def fhash(word, M): f = 0 for i,char in enumerate(word): f += ord(char)*(37**i) f = f%int(M) return f while x not in ['n','y']: print('Try again') x = input('Use feature hashing ? 
(y,Y,n,N) ').lower() if x == 'y': M = input('M =') sFile = open('stopwords.txt','r') stop_words = [] for line in sFile: stop_words += line.split() stop_words = list(map(str.lower,stop_words)) sFile.close() wFile = open(file,'r') charCount = 0 alpCount = 0 lineCount = 0 wordCount = 0 words = [] for line in wFile: lineCount+=1 words += line.split() words = list(map(str.lower,words)) charCount+= len(line.strip()) wFile.close() clean_words = [] for word in words: text = '' for char in word: if char in alpnum: text += char alpCount+=1 clean_words.append(text) wordCount += len(clean_words) clean_word_stopword = [] for word in clean_words: if word not in stop_words: clean_word_stopword.append(word) print(clean_word_stopword) BoW = Bag_of_words(clean_word_stopword) if x == 'y': BoW_hash = fhash_BOW(BoW,M) print('-------------------') print('char count =',charCount) print('alphanumeric count =',alpCount) print('line count =',lineCount) print('word count =',wordCount) if x =='y': print('BoW = ', BoW_hash) else: print('BoW = ', BoW)

ALL: cluster #8 (2)

# 6330323121 (2021-03-22 21:47) %diff = 41.78 #..................................................................................... #ให้ w คือคำที่ประกอบด้วยอักขระ c0 c1 c2 ... cn –1 #fhash(w,M) = fhash(c0 c1 c2 ... cn –1, M) = ( ord(c0) + ord(c1)G1 + ord(c2)G2 + ... + ord(cn –1)Gn –1) % M def fhash(w,M) : u=0 G=37 fh=0 for i in range(len(w)): fh+=ord(w[i])*(G**u) u+=1 return fh%M def char_count(file_name): n = -1 c = 0 f = open(file_name) for line in f: n += 1 c += len(line) f.close() c -= n return c return c def a_and_num_count(file_name): f=open(file_name) c=0 alphabet='abcdefghijklmnopqrstuvwxyz' num='0123456789' for line in f: for i in line: if i in alphabet or i in alphabet.upper() or i in num: c+=1 f.close() return c def words_count(file_name): f=open(file_name) s='' alphabet='abcdefghijklmnopqrstuvwxyz' num='0123456789' for line in f: for i in line : if i in alphabet or i in alphabet.upper() or i in num : s+=i else: s+=' ' x=s.split() f.close() return len(x) def line_count(file_name): c = 0 f = open(file_name) for line in f: c += 1 f.close() return c def BoW_Nn(file_name,stop): alphabet = "abcdefghijklmnopqrstuvwxyz" num = "1234567890" f = open(file_name) f2 = "" s2 = "" for line in f: for e in line: if e.lower() in alphabet or e in num: f2 += e else: f2 += " " s = open(stop) for line in s: for e in line: s2 += e f3 = f2.lower().split() s3 = s2.lower().split() x = [] for e in f3: if e not in s3: x.append(e) x.sort() b0 = [x[0]] b1 = [1] for i in range(1,len(x)): if x[i] != x[i-1]: b0.append(x[i]) b1.append(1) else: b1[-1] += 1 b = [] for i in range(len(b0)): b.append([b0[i],b1[i]]) f.close() s.close() return b def BoW_Yy(file_name,stop,M): b=BoW_Nn(file_name,stop) by=[] for i in range(len(b)): by.append(fhash(b[i][0],M)) by.sort() #[1,1,2,3,3,4,5,5,6] by0=[by[0]] by1=[1] for i in range(1,len(by)): if by[i-1]!=by[i]: by0.append(by[i]) by1.append(1) else: by1[-1]+=1 bowyes=[] for i in range(len(by0)): bowyes.append([by0[i],by1[i]]) return bowyes 
#.......................................... file_name=input('File name = ') yn=input('Use feature hashing ? (y,Y,n,N) ') while yn not in ['Y','y','N','n'] : print('Try again.') yn=input('Use feature hashing ? (y,Y,n,N) ') if yn =='N' or yn=='n': print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(a_and_num_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(words_count(file_name))) print('BoW = '+str(BoW_Nn(file_name,'stopword.txt'))) elif yn=='Y' or yn=='y': M=int(input('M = ')) print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(a_and_num_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(words_count(file_name))) print('BoW = '+str(BoW_Yy(file_name,'stopword.txt',M)))# 6330352321 (2021-03-22 03:58) %diff = 41.78 def fhash(w,m): c = [] s = 0 for i in range(len(w)): c.append(ord(w[i])) for i in range(len(w)): s += c[i]*(37**i) fh = s%m return fh def char_count(filename): c_line = 0 c_char = 0 f = open(filename) for line in f: c_line += 1 for i in range(len(line)): c_char += 1 f.close() c_char -= c_line-1 return c_char def alp_count(filename): c = 0 f = open(filename) for line in f: for e in range(len(line)): if ("a" <= line[e].lower() <= "z") or ("0" <= line[e] <= "9"): c += 1 f.close() return c def line_c(filename): c = 0 f = open(filename) for line in f: c += 1 f.close() return c def word_count(filename): alp = "abcdefghijklmnopqrstuvwxyz" num = "1234567890" st = "" f = open(filename) for line in f: for e in line: if (e.lower() not in alp) and (e not in num): st += " " else: st += e f.close() c = st.lower().split() return len(c) def bow_n(filename,stopwords): alp = "abcdefghijklmnopqrstuvwxyz" num = "1234567890" s1 = "" s2 = "" l1 = [] l2 = [] l = [] f1 = open(filename) for line in f1: for e in line: if (e.lower() not in alp) and (e not in num): s1 += " " else: s1 += e f2 = open(stopwords) for line in f2: for e in 
line: s2 += e f1.close() f2.close() l1 = s1.lower().split() l2 = s2.lower().split() for e in l1: if e not in l2: l.append(e) l.sort() bow0 = [l[0]] bow1 = [1] for i in range(1,len(l)): if l[i] == l[i-1]: bow1[-1] += 1 else: bow0.append(l[i]) bow1.append(1) bow = [] for i in range(len(bow0)): bow.append([bow0[i],bow1[i]]) return bow def bow_y(filename,stopwords,m): bow = bow_n(filename,stopwords) for i in range(len(bow)): bow[i][0] = fhash(bow[i][0],m) bow.sort() bowy = [bow[0]] for i in range(1,len(bow)): if bow[i][0] == bow[i-1][0]: bowy[-1][1] += bow[i][1] else: bowy.append(bow[i]) return bowy #--------------------------------------------------------------------- yesno = ["y","Y","n","N"] file_name = input("File name = ") ufh = input("Use feature hashing ? (y,Y,n,N) ") while ufh not in yesno: print("Try again.") ufh = input("Use feature hashing ? (y,Y,n,N) ") if ufh== "n" or ufh == "N": print("-"*19) print("char count = " + str(char_count(file_name))) print("alphanumeric count = " + str(alp_count(file_name))) print("line count = " + str(line_c(file_name))) print("word count = " + str(word_count(file_name))) print("BoW = " + str(bow_n(file_name,"stopword.txt"))) elif ufh == "y" or ufh == "Y": M = int(input("M = ")) print("-"*19) print("char count = " + str(char_count(file_name))) print("alphanumeric count = " + str(alp_count(file_name))) print("line count = " + str(line_c(file_name))) print("word count = " + str(word_count(file_name))) print("BoW = " + str(bow_y(file_name,"stopword.txt",M)))

ALL: cluster #9 (2)

# 6330489721 (2021-03-22 21:01) %diff = 41.94 file_name = input('File name = ') ft = input('Use feature hashing ? (y,Y,n,N) ') uh = False while ft not in ['y','Y','n','N']: print('Try again.') ft = input('Use feature hashing ? (y,Y,n,N) ') if ft in ['y','Y']: M=input('M = ') uh = True print('-------------------') stopwords_list = [] stopwords_file = open('stopwords.txt', 'r') line_count=0 char_count=0 alpha_count=0 word_count=0 for line in stopwords_file: strip_stopwords_file = line.strip() strip_split_stopwords_file = strip_stopwords_file.split() stopwords_list += strip_split_stopwords_file stopwords_file.close() file = open(file_name, 'r') for line in file: strip_line = line.strip().lower() char_count += len(strip_line) file.close() file = open(file_name, 'r') for line in file: strip_line = line.strip().lower() for i in strip_line: isalnum = i.isalnum() if isalnum == True: alpha_count +=1 file.close() file = open(file_name, 'r') for line in file: strip_line = line.strip().lower() line_count +=1 file.close() def find_replace(t): result = "" for c in t: if c in "\"\'/\\,.:;": result += " " else: result += c return result file = open(file_name, 'r') for line in file: strip_line = line.strip().lower() words = find_replace(strip_line) strip_words = words.strip() split_strip_words = strip_words.split() word_count += len(split_strip_words) file.close() print('char count =',char_count) print('alphanumeric count =',alpha_count) print('line count =',line_count) print('word count =',word_count) all_words_list =[] file = open(file_name, 'r') for line in file: strip_line = line.strip().lower() words = find_replace(strip_line) strip_words = words.strip() split_strip_words = strip_words.split() all_words_list += split_strip_words file.close() all_words_withoutstopwords_list = [] for i in all_words_list: if not i in stopwords_list: all_words_withoutstopwords_list.append(i) BoW = [] def add(BoW,d): c = True for i in range(len(BoW)): if BoW[i][0] == d: c = False BoW[i][1] += 1 if 
c == True: BoW.append([d,1]) return BoW def fhash(list_of_word,M): wordhash_list = [] for word in list_of_word: char_count = 0 for i in range(len(word)): char_count += ord(word[i])*(37**i) wordhash_list.append(char_count%int(M)) return wordhash_list if uh == False: for i in all_words_withoutstopwords_list: BoW = addwordToBoW(BoW,i) print('BoW =',sorted(BoW)) if uh == True: wordhash_list = fhash(all_words_withoutstopwords_list,M) BoWhash = [] for i in sorted(wordhash_list): BoWhash = add(BoWhash,i) print('BoW =',BoWhash)# 6330523321 (2021-03-21 19:13) %diff = 41.94 filename = input('File name = ') feature = input('Use feature hashing ? (y,Y,n,N) ') usehash = False while not feature in ['y','Y','n','N']: print('Try again.') feature = input('Use feature hashing ? (y,Y,n,N) ') if feature in ['y','Y']: M = int(input('M = ')) usehash = True print('-------------------') stopwordslist = [] stopwords_file = open('stopwords.txt', 'r') for line in stopwords_file: strip_stopwords_file = line.strip() strip_split_stopwords_file = strip_stopwords_file.split() stopwordslist += strip_split_stopwords_file stopwords_file.close() def find_replace(t): result = "" for c in t: if c in "\"\'/\\,.:;": result += " " else: result += c return result charcount = 0 file = open(filename, 'r') for line in file: strip_line = line.strip().lower() charcount += len(strip_line) file.close() print('char count =',charcount) alphanumericcount = 0 file = open(filename, 'r') for line in file: strip_line = line.strip().lower() for i in strip_line: isalnum = i.isalnum() if isalnum == True: alphanumericcount +=1 file.close() print('alphanumeric count =',alphanumericcount) linecount = 0 file = open(filename, 'r') for line in file: strip_line = line.strip().lower() linecount +=1 file.close() print('line count =',linecount) wordcount = 0 file = open(filename, 'r') for line in file: strip_line = line.strip().lower() words = find_replace(strip_line) strip_words = words.strip() split_strip_words = 
strip_words.split() wordcount += len(split_strip_words) file.close() print('word count =',wordcount) all_words_list =[] file = open(filename, 'r') for line in file: strip_line = line.strip().lower() words = find_replace(strip_line) strip_words = words.strip() split_strip_words = strip_words.split() all_words_list += split_strip_words file.close() all_words_withoutstopwords_list = [] a = [] for i in all_words_list: if not i in stopwordslist: all_words_withoutstopwords_list.append(i) BoW = [] def addwordToBoW(BoW,newword): contain = False for i in BoW: if i[0] == newword: contain = True i[1] +=1 break if contain == False: BoW.append([newword,1]) return BoW if usehash == False: for i in all_words_withoutstopwords_list: BoW = addwordToBoW(BoW,i) print('BoW =',sorted(BoW)) def fhash(word,M): G = 37 numchar = 0 for charindex in range (len(word)): numchar += ord(word[charindex])*(G**charindex) return numchar%M if usehash == True: wordhash_list = [] for word in all_words_withoutstopwords_list: wordhash = fhash(word,M) wordhash_list.append(wordhash) BoWhash = [] for i in sorted(wordhash_list): BoWhash = addwordToBoW(BoWhash,i) print('BoW =',BoWhash)

ALL: cluster #10 (2)

# 6330275721 (2021-03-22 21:44) %diff = 42.03 #-------------------------------------------------------- alphabet = 'abcdefghijklmnopqrstuvwxyz' number = '0123456789' special_char = '!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' stopword_file = 'stopwords.txt' #-------------------------------------------------------- def char_count(x): file = open(x, 'r') c = 0 for line in file: line = line.strip() c += len(line) file.close() return c def alphanumeric_count(x): file = open(x, 'r') c = 0 for line in file: line = line.strip() text = '' for char in line: if char.lower() in alphabet or char in number: text += char c += len(text) file.close() return c def line_count(x): file = open(x, 'r') c = 0 for line in file: c += 1 file.close() return c def word_count(x): file = open(x, 'r') c = 0 for line in file: line = line.strip() text = '' for char in line: if char in special_char: text += ' ' else: text += char c += len(text.split()) file.close() return c def fhash(w, M): sum = 0 for i in range(len(w)): sum += ord(w[i])*(37**i) return sum % M def list_stopwords(x): file = open(x, 'r') stopwords_list=[] for line in file: line = line.strip() pre_stw = '' for char in line: if char in special_char: pre_stw += ' ' else: pre_stw += char pre_stw = pre_stw.strip().split() for i in pre_stw: stopwords_list.append(i) file.close() return stopwords_list def BoW(x, ufh, M): file = open(x, 'r') words=[] for line in file: line = line.strip() pre_words = '' for char in line: if char in special_char: pre_words += ' ' else: pre_words += char pre_words = pre_words.strip().split() for e in pre_words: k = e.lower() if k not in list_stopwords(stopword_file): words.append(k) words.sort() file.close() bow = []; bow_n = []; bow_y = []; n_word = [] if ufh in ['n', 'N']: for e in words: if e in bow_n: n_word[bow_n.index(e)] += 1 else: bow_n.append(e); n_word.append(1) for i in range(len(bow_n)): bow.append([bow_n[i],n_word[i]]) return bow elif ufh in ['y', 'Y']: for e in words: p = fhash(e,M) if p in bow_y: 
n_word[bow_y.index(p)] += 1 else: bow_y.append(p); n_word.append(1) for i in range(len(bow_y)): bow.append([bow_y[i],n_word[i]]) bow.sort() return bow #-------------------------------------------------------- file_name = input('File name = ') ufh = input('Use feature hashing ? (y,Y,n,N) ') while ufh not in ['y','Y','n','N']: print('Try again.') ufh = input('Use feature hashing ? (y,Y,n,N) ') if ufh in ['y', 'Y']: M = int(input('M = ')) elif ufh in ['n', 'N']: M = 0 else: M = 0 print('-'*19) print('char count =', char_count(file_name)) print('alphanumeric count =', alphanumeric_count(file_name)) print('line count =', line_count(file_name)) print('word count =', word_count(file_name)) print('BoW =', BoW(file_name, ufh, M))# 6330281421 (2021-03-21 12:19) %diff = 42.03 #-------------------------------------- #ข้อมูลที่แก้ได้ stopword_file='stopwords.txt' sp_char='!@#$%^&*()_+{}[]:\";\',./<>?\\=-`' al_and_nume='abcdefghijklmnopqrstuvwxyz0123456789' #-------------------------------------- #ส่วนฟังก์ชั่น def c_count(filename): with open(filename,'r') as file: n=0 for line in file: line=line.strip() n+=len(line) return n def alpha_count(filename): with open(filename,'r') as file: n=0 for line in file: line=line.strip() text='' for char in line: #if not(char in sp_char): if char.lower() in al_and_nume : text+=char #text=''.join(text.split()) n+=len(text) return n def line_count(filename): with open(filename,'r') as file: n=0 for line in file: n+=1 return n def word_count(filename): with open(filename,'r') as file: n=0 for line in file: line=line.strip() text='' for char in line: if char in sp_char: text+=' ' else: text+=char n+=len(text.split()) return n def list_of_stopwords(filename): with open(filename,'r') as file: stopwords_list=[] for line in file: line=line.strip() text='' for char in line: if char in sp_char: text+=' ' else: text+=char text=text.split() for i in text: stopwords_list.append(i.lower()) return stopwords_list def fhash(word,m): sum=0 for i in 
range(len(word)): sum+=ord(word[i])*(37**i) return sum%m def BoW(filename,condition,m): with open(filename,'r') as file: words=[] for line in file: line=line.strip() text='' for char in line: if char in sp_char: text+=' ' else: text+=char text=text.split() for i in text: if not(i.lower() in list_of_stopwords(stopword_file)): words.append(i.lower()) words.sort() bag_of_word=[] repit_word=[] bag_of_words=[] if condition.lower() =='n': for i in words: if i in bag_of_word: repit_word[bag_of_word.index(i)]+=1 else: bag_of_word.append(i) repit_word.append(1) for i in range(len(bag_of_word)): bag_of_words.append([bag_of_word[i],repit_word[i]]) return bag_of_words else: for i in words: p=fhash(i,m) if p in bag_of_word: repit_word[bag_of_word.index(p)]+=1 else: bag_of_word.append(p) repit_word.append(1) for i in range(len(bag_of_word)): bag_of_words.append([bag_of_word[i],repit_word[i]]) bag_of_words.sort() return bag_of_words #-------------------------------------- #ส่วนทำงาน file=input('File name = ') feature=input('Use feature hashing ? (y,Y,n,N) ') while not(feature in ['Y','n','N','y']): print('Try again.') feature=input('Use feature hashing ? (y,Y,n,N) ') if feature.lower() == 'y': m=int(input('M = ')) else: m=0 print('-------------------') print('char count =',c_count(file)) print('alphanumeric count =',alpha_count(file)) print('line count =',line_count(file)) print('word count =',word_count(file)) print('BoW =',BoW(file,feature,m))

ALL: cluster #11 (2)

# 6330199721 (2021-03-22 17:35) %diff = 43.85 file_name = input('File name = ') fn = open(file_name.strip(), 'r') a = input('Use feature hashing ? (y,Y,n,N) ') while 1>0: if a in 'nN': break elif a in 'yY': M = input('M = ') break else : print('Try again.') a = input('Use feature hashing ? (y,Y,n,N) ') print('-------------------') s = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890' cc = 1 ac = 0 lc = 0 wc = 0 for i in fn: lc+=1 for b in range(len(i)): cc+=1 if i[b] in s: ac+= 1 if i[b+1] not in s: wc+=1 print('char count =',cc-lc) print('alphanumeric count =',ac) print('line count =',lc) print('word count =',wc) if a in 'nN': fn = open(file_name.strip(), 'r') st = open("stopwords.txt", 'r') s = 'abcdefghijklmnopqrstuvwxyz1234567890' w = '' for l in fn: l = l.strip('\n') l = l.lower() for i in l: if i in s: w += i else: w += ' ' w = w.split() for l in st: l = l.split() for i in l: while i in w: w.remove(i) bow = [] for i in w: x = w.count(i) if [i,x] not in bow: bow += [[i,x]] print('BoW =',bow) def fhash(x,M): a = 0 for i in range(len(x)): c = ord(x[i])*(pow(37,i)) a += c d = a%int(M) return str(d) if a in 'Yy': fn = open(file_name.strip(), 'r') st = open("stopwords.txt", 'r') s = 'abcdefghijklmnopqrstuvwxyz1234567890' w = '' for l in fn: l = l.strip('\n') l = l.lower() for e in l: if e in s: w += e else: w += ' ' w = w.split() for l in st: l = l.split() for i in l: while i in w: w.remove(i) j=[] for i in w: j += fhash(i,M) bow = [] for i in j: x = j.count(i) if [int(i),x] not in bow: bow += [[int(i),x]] bow.sort() print('BoW =',bow) fn.close() st.close()# 6330200621 (2021-03-22 00:15) %diff = 43.85 file_name = input('File name = ') fn = open(file_name.strip(), 'r') fh = input('Use feature hashing ? (y,Y,n,N) ') while True: if fh not in 'yYnN': print('Try again.') fh = input('Use feature hashing ? 
(y,Y,n,N) ') if fh in 'yY': M = input('M = ') break if fh in 'nN': break #------------------------------------------------------------ def everything(fn): ac = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','1','2','3','4','5','6','7','8','9','0'] anc = 0 cc = 0 lc = 0 tap = '' for line in fn: line = line.strip('\n') line = line.lower() lc += 1 for e in line: cc += 1 if e in ac: anc += 1 tap += e else: tap += ' ' tap = tap.split() wc = len(tap) return anc,cc,lc,wc #------------------------------------------------------------- anc,cc,lc,wc = everything(fn) print('-'*len('Use feature hashing')) print('char count =',cc) print('alphanumeric count =',anc) print('line count =',lc) print('word count =',wc) fn.close() #--------------------------------------------------- def bow1(): fn = open(file_name.strip(), 'r') st = open("stopwords.txt", 'r') ac = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','1','2','3','4','5','6','7','8','9','0'] b = '' for line in fn: line = line.strip('\n') line = line.lower() for e in line: if e in ac: b += e else: b += ' ' b = b.split() for line in st: line = line.split() for e in line: while e in b: b.remove(e) bow = [] for e in b: z = b.count(e) if [e,z] not in bow: bow.append([e,z]) fn.close() st.close() return bow #--------------------------------------------------- if fh == 'n' or fh == 'N': bow = bow1() print('BoW =',bow) #--------------------------------------------------- def fhash(a,M): summ=0 for i in range (len(a)): summ += ord(a[i])*(37)**i c = summ % int(M) return str(c) #---------------------------------------- def bow2(): fn = open(file_name.strip(), 'r') st = open("stopwords.txt", 'r') ac = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','1','2','3','4','5','6','7','8','9','0'] b = '' for line in fn: line = line.strip('\n') line = line.lower() for e in line: 
if e in ac: b += e else: b += ' ' b = b.split() for line in st: line = line.split() for e in line: while e in b: b.remove(e) c=[] for f in b: c.append(fhash(f,M)) bow = [] for e in c: z = c.count(e) if [int(e),z] not in bow: bow.append([int(e),z]) bow.sort() fn.close() st.close() return bow #------------------------------------------- if fh == 'y' or fh == 'Y': bbb = bow2() print('BoW =',bbb)

ALL: cluster #12 (2)

# 6330192221 (2021-03-22 23:09) %diff = 44.18 c=0 alpha=0 l=0 w=[] x=[] b=[] Bag=[] logic=['y','Y','n','N'] def char_count(line): c=0 c+=(len(line)-1) return c #------------------------------- def alphanumeric_count(line): c1=0 for t in line: if 'a'<=t<='z' or 'A'<=t<='Z' or '0'<=t<='9': c1+=1 return c1 #------------------------------- def line_count(line): l=0 if len(line)!=0: l+=1 return l #------------------------------- def word_count(line): word=[] wordn=[] s="" for t in line: if 'a'<=t<='z' or 'A'<=t<='Z' or '0'<=t<='9': s+=t else: if s!="": word.append(s.lower()) s="" return word #------------------------------- def BoW(lis): c=[] for i in range(len(lis)): if [lis[i],lis.count(lis[i])] not in c: c.append([lis[i],lis.count(lis[i])]) return c #----------------------------- def fhash(w,M): G=37 s=0 for i in range(len(w)): s+=ord(w[i])*(G**i) s=s%M return s #----------------------------- def hashedBoW(wordlist,M): A=[] for word in wordlist: A.append(fhash(word,M)) return BoW(A) #----------------------------- file_name=input("File name = ") fin = open(file_name,"r") fin2= open("stopwords.txt","r") a=input("Use feature hashing ? (y,Y,n,N) ") for line in fin: c+=char_count(line) alpha+=alphanumeric_count(line) l+=line_count(line) w+=word_count(line) else: c+=1 lw=len(w) for line in fin2: x+=line.split() for i in range(len(w)): if w[i] not in x: b.append(w[i].lower()) if a=='y' or a=='Y': M=int(input("M = ")) print("-------------------") print("char count =",c) print("alphanumeric count =",alpha) print("line count =",l) print("word count =",lw) print("BoW =",hashedBoW(b,M)) elif a=='n' or a=='N': print("-------------------") print("char count =",c) print("alphanumeric count =",alpha) print("line count =",l) print("word count =",lw) print("BoW =",BoW(b)) else: while a not in logic: print("Try again.") a=input("Use feature hashing ? 
(y,Y,n,N) ") if a=='y' or a=='Y': M=int(input("M = ")) print("-------------------") print("char count =",c) print("alphanumeric count =",alpha) print("line count =",l) print("word count =",lw) print("BoW =",hashedBoW(b,M)) elif a=='n' or a=='N': print("-------------------") print("char count =",c) print("alphanumeric count =",alpha) print("line count =",l) print("word count =",lw) print("BoW =",BoW(b))# 6330356921 (2021-03-21 22:58) %diff = 44.18 #--------------------------------------------------------------------- def fhash(w,M) : a = [] b = 0 c = 0 for i in range(len(w)) : a.append(str(ord(w[i]))) for i in range(len(a)) : b += int(a[i])*(37**c) c += 1 b = int(b)%int(M) return b #--------------------------------------------------------------------- def cutpunc(N) : result = "" for c in N: if c in "\"\'/\\,.:;" : result += "" elif c in "\n" : result += " " else : result +=c return result #--------------------------------------------------------------------- def cutword(N) : N = cutpunc(N) N = N.lower() N = N.split() x = "" a = open("stopwords.txt", "r") for lines in a : x += lines b = cutpunc(x) b = b.split() result = "" for c in N : if c in b : result += "" else : result += c+" " return result #--------------------------------------------------------------------- def BOW1(N) : N = cutword(N) N = N.split() N.sort() N.append("") a = [] c = 1 for i in range(len(N)-1) : if N[i]==N[i+1] : c +=1 else : a.append([N[i],c]) c = 1 return a #--------------------------------------------------------------------- def BOW2(N) : N = cutword(N) N = N.split() N.sort() a = [] c = 1 x = [] for i in range(len(N)) : x.append(fhash(N[i],M)) x.sort() x.append("") for i in range(len(x)-1) : if x[i]==x[i+1] : c +=1 else : a.append([x[i],c]) c = 1 return a #--------------------------------------------------------------------- x = "" character_count = 0 line_count = 0 word_count = 0 alphanumeric_count = 0 d = input("File name = ") a = open(d, "r") for lines in a : x += lines character_count 
+= len(lines) line_count +=1 print(x) y = cutpunc(x) z = "".join(y) character_count -=line_count-1 y = cutpunc(x) y = y.split() for i in range(len(y)): if y[i]==y[i] : word_count += 1 h = 0 for i in range(len(z)) : if " "==z[i] : h +=1 alphanumeric_count += len(z)-h a.close() #--------------------------------------------------------------------- b = input("Use feature hashing ? (y,Y,n,N) ") if b=="y" or b=="Y" : M = int(input("M = ")) print("-------------------") print("char count =",character_count) print("alphanumeric count =",alphanumeric_count) print("line count =",line_count) print("word count =",word_count) print("BoW =",BOW2(x)) if b=="n" or b=="N" : print("-------------------") print("char count =",character_count) print("alphanumeric count =",alphanumeric_count) print("line count =",line_count) print("word count =",word_count) print("BoW =",BOW1(x)) while b!="n" and b!="N"and b!="y" and b!="Y" : print("Try again.") b = input("Use feature hashing ? (y,Y,n,N) ") if b=="y" or b=="Y" : M = int(input("M = ")) print("-------------------") print("char count =",character_count) print("alphanumeric count =",alphanumeric_count) print("line count =",line_count) print("word count =",word_count) print("BoW =",BOW2(x)) if b=="n" or b=="N" : print("-------------------") print("char count =",character_count) print("alphanumeric count =",alphanumeric_count) print("line count =",line_count) print("word count =",word_count) print("BoW =",BOW1(x)) break

ALL: cluster #13 (2)

# 6330180721 (2021-03-18 17:39) %diff = 44.88 #Prog-08: Bag-of-words #6330180721 Nichakul Pichitwutikorn def fhash(w,m): a = 0 for e in range(len(w)): a+= ord(w[e])*(37**e) return a%m def num(lis,word): c = 0 for t in lis: if t == word:c+=1 return c def cut_repeat(listt): qr = [] for e in listt: if not e in qr: qr.append(e) return qr file_name = input('File name = ') h = input('Use feature hashing ? (y,Y,n,N) ') while h!='y' and h!='Y' and h!='n' and h!='N': print('Try again.') h = input('Use feature hashing ? (y,Y,n,N) ') if h in 'yY': m= input('M = ') book = open(file_name,'r') stop = open('stopwords.txt','r') char_al = 0; al = 0;l = 0 sen = ''; st = '' for line in book: for i in line: if 'a'<=i<='z' or 'A'<=i<='Z' or '0'<=i<='9': al+=1 sen+=i else: char_al+=1 sen+=' ' l+=1 sen = sen.lower().split() for t in stop: for s in t: if s==' ':st+=' ' else: st+=s st = st.lower().split() bow = []; ans = []; f = [] for p in sen: if not p in st:bow.append(p) print('-------------------') print('char count =',al+char_al-l+1) print('alphanumeric count =',al) print('line count =',l) print('word count =',len(sen)) if h in 'yY': for j in bow: ans.append(fhash(j,int(m))) for q in ans: f.append([q,num(ans,q)]) rrr = cut_repeat(f) rrr.sort() print('BoW =',rrr) else: for j in bow: ans.append([j,num(bow,j)]) rrr = cut_repeat(ans) rrr.sort() print('BoW =',rrr) book.close() stop.close()# 6330572021 (2021-03-22 15:09) %diff = 44.88 def fhash(w,m): p=0 for i in range(len(w)): p+=ord(w[i])*(37**i) fh=p%m return fh def bow(sen): b=[] bow=[] for e in sen : if not e in b: b.append(e) c=[0]*len(b) for i in range(len(sen)): for j in range(len(b)): if sen[i]==b[j]: c[j]+=1 for k in range(len(b)): bow.append([b[k],c[k]]) return bow file_name=input('File name=') f=input('Use feature hashing ? (y,Y,n,N)') while f!='y' and f!='Y' and f!='n' and f!='N': print('Try again.') f=input('Use feature hashing ? 
(y,Y,n,N)') if f=='y' or f=='Y': m=input('M=') file=open(file_name,'r') stopw=open('stopwords.txt','r') lines=stopw.readlines() stopw.close() lines=[line.strip() for line in lines] stw='' for i in range(len(lines)): stw+=str(lines[i].lower())+' ' stop=stw.split() char=0 al=0 l=0 sen='' for line in file: for c in line: if c.isalnum()==True: char+=1 al+=1 sen+=c else: char+=1 sen+=' ' l+=1 s=sen.lower().split() sent=[] for p in s: if p not in stop: sent+=[p] print('-------------------') print('char count = ',char-l+1) print('alphanumeric count = ',al) print('line count = ',l) print('word count = ',len(s)) if f=='y' or f=='Y': bb=[] for q in range(len(sent)): bb+=[fhash(sent[q],int(m))] BoW=bow(bb) BoW.sort() print('BoW = ',BoW) else: BoW=bow(sent) BoW.sort() print('BoW = ',BoW) file.close()

ALL: cluster #14 (7)

# 6330280821 (2021-03-22 22:05) %diff = 45.24 def char_count(fn): file_name = open(fn) c = 0 for e in file_name: for a in e: if a != "\n": c += 1 file_name.close return c def count_line(fn): file_name = open(fn) c = 0 for line in file_name: c += 1 file_name.close() return c def alphanumeric(fn): a = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" file_name = open(fn) c = 0 for e in file_name : for d in e: if d in a: c += 1 file_name.close() return c def word_count(fn): file_name = open(fn) c = "" for e in file_name: for a in e: if a not in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789": c += " " else : c += a file_name.close return c.split() s = open("stopwords.txt") stop = "" for e in s: for d in e: stop += d stop2 = stop.lower().split() s.close() def bow_no_hashing(fn): p = [] q = word_count(fn) for e in q: if e.lower() not in stop2: p.append(e) p.sort() p += "!!" d = p[0] last = [] num = 1 for j in range(1,len(p)) : if d != p[j]: last.append([d,num]) num = 1 d = p[j] else : num += 1 return last def fhash(w,M): c = 0 a = 0 G = 37 for i in w : c += ord(i)*(G**a) a += 1 b = c%M return b def bow_hashing(fn,M): p = [] q = word_count(fn) for e in q: if e.lower() not in stop2: p.append(e) s = [] for i in p : v = fhash(i,int(M)) s.append(v) s.sort() s += "!!" d = s[0] last = [] num = 1 for j in range(1,len(s)) : if d != s[j]: last.append([d,num]) num = 1 d = s[j] else : num += 1 return last print(bow_hashing("sample.txt",10)) x = input("File name = ") b = input("Use feature hashing ? (y,Y,n,N) ") while b not in "yYnN": print("Try again.") b = input("Use feature hashing ? 
(y,Y,n,N) ") if b in "yY": M = input("M = ") print("-------------------") print("char count =",char_count(x)) print("alphanumeric count =",alphanumeric(x)) print("line count =",count_line(x)) print("word count =",len(word_count(x))) print("BoW = ",bow_hashing(x,M)) else : print("-------------------") print("char count =",char_count(x)) print("alphanumeric count =",alphanumeric(x)) print("line count =",count_line(x)) print("word count =",len(word_count(x))) print("BoW = ",bow_no_hashing(x))# 6330485121 (2021-03-21 01:01) %diff = 45.24 def char_count(file_name): words = '' c = 0 fn = open(file_name) for line in fn: words += line for e in words: if e != '\n': c += 1 fn.close() return c def alphanumeric_count(file_name): words = '' c = 0 fn = open(file_name) for line in fn: words += line for e in words: if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': c += 1 fn.close() return c def line_count(file_name): c = 0 fn = open(file_name) for line in fn: c += 1 fn.close() return c def list_of_words(file_name): words = '' listwords = '' fn = open(file_name) for line in fn: words += line for e in words: if e.lower() not in 'abcdefghijklmnopqrstuvwxyz0123456789' or e.lower() == '\n': listwords += ' ' else: listwords += e.lower() listwords = listwords.split() fn.close() return listwords # ['it', 'was', 'the', 'best', 'of', ...] 
def bag_of_words(file_name): listwords = list_of_words(file_name) sw = list_of_words('stopwords.txt') new = [] for e in listwords: if e not in sw: new.append(e) word = []; fr = []; bow = [] for e in new: if e.lower() not in word: word.append(e.lower()) fr.append(int(1)) elif e.lower() in word: fr[word.index(e.lower())] += 1 for i in range(len(word)): bow.append([word[i], fr[i]]) bow.sort() return bow def fhashing(w,m): fhash = 0 g = 37 for i in range(len(w)): fhash += ord(w[i])*(g**i) return fhash%m def feature_hashing(file_name): listwords = list_of_words(file_name) sw = list_of_words('stopwords.txt') new = [] for e in listwords: if e.lower() not in sw: new.append(e.lower()) fhash = []; ordd = []; fr = []; bow = [] for e in new: fhash.append(fhashing(e,m)) for i in range(len(fhash)): if fhash[i] not in ordd: ordd.append(fhash[i]) fr.append(int(1)) elif fhash[i] in ordd: fr[ordd.index(fhash[i])] += 1 for i in range(len(ordd)): bow.append([ordd[i], fr[i]]) bow.sort() return bow x = ['y', 'Y', 'n', 'N' ] file_name = input('File name = ') hashing = input('Use feature hashing ? (y,Y,n,N) ') while hashing not in x: print('Try again.') hashing = input('Use feature hashing ? 
(y,Y,n,N) ') if hashing in 'yY': m = int(input('M = ')) print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(len(list_of_words(file_name)))) print('BoW =',feature_hashing(file_name)) elif hashing in 'nN': print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(len(list_of_words(file_name)))) print('Bow =',bag_of_words(file_name))# 6330311621 (2021-03-18 22:11) %diff = 46.64 def num_all(fn): c = 0 fn = open(fn,'r').read() for i in fn: if i != '\n': c += 1 return c def num_char(fn): out = '' fn = open(fn,'r').read() for i in fn: if 'a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9': out += i return len(out) def num_line(fn): c = 0 fn = open(fn,'r') for line in fn: c += 1 return c def num_word(fn): out = '' fn = open(fn,'r').read() for i in fn: if not('a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9'): i = ' ' out += i word = out.split() return len(word) def listword(fn): out = '' fn = open(fn,'r').read() for i in fn: if not('a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9'): i = ' ' out += i out = out.lower().split() sw = open('stopwords.txt','r').read() t = sw.split() list_word = [] for i in out: if not i in t: list_word.append(i) return list_word def BoW(fn): word = listword(fn) t_word = [] for i in word: if not i in t_word: t_word.append(i) slot = [0]*len(t_word) for i in word: if i in t_word: slot[t_word.index(i)] += 1 f_word = [] for i in range(len(slot)): f_word.append([t_word[i],slot[i]]) return f_word def fhash(w,M): num = 0 for i in range(len(w)): num += ord(w[i])*(37**i) return num%int(M) def BoW_fhash(fn,M): word1st = listword(fn) word = [] for i in word1st: word.append(fhash(i,M)) t_word = [] for i in word: if not i 
in t_word: t_word.append(i) slot = [0]*len(t_word) for i in word: if i in t_word: slot[t_word.index(i)] += 1 f_word = [] for i in range(len(slot)): f_word.append([t_word[i],slot[i]]) return f_word file_name = input('File name = ') choose = input('Use feature hashing ? (y,Y,n,N) ') while not choose in 'nNyY': print('Try again.') choose = input('Use feature hashing ? (y,Y,n,N) ') if choose in 'yY': M = input('M = ') print('-------------------') print('char count =',num_all(file_name)) print('alphanumeric count =',num_char(file_name)) print('line count =',num_line(file_name)) print('word count =',num_word(file_name)) print('BoW =',BoW_fhash(file_name,M)) else: print('-------------------') print('char count =',num_all(file_name)) print('alphanumeric count =',num_char(file_name)) print('line count =',num_line(file_name)) print('word count =',num_word(file_name)) print('BoW =',BoW(file_name))# 6330564021 (2021-03-22 01:34) %diff = 46.64 #---------------------------------------------------------- def char_count(file_name): i = 0 fn = open(file_name, 'r') for line in fn: if line[-1] == '\n': i += len(line[:-1]) else: i += len(line) fn.close() return i def alp_count(file_name): i = 0 fn = open(file_name, 'r') for line in fn: for c in line.lower(): if 'a' <= c <= 'z' or '0' <= c <= '9': i += 1 fn.close() return i def line_count(file_name): i = 0 fn = open(file_name, 'r') for line in fn: i += 1 fn.close() return i def stop_words(stop_name): k = [] fn = open(stop_name, 'r') for line in fn: k += line.lower().strip().split() fn.close() return k def words(file_name): k = [] fn = open(file_name, 'r') for line in fn: d = '' for c in line.lower(): if 'a' <= c <= 'z' or '0' <= c <= '9': d += c else: d += ' ' k += d.strip().split() fn.close() return k def BoW(file_name, stop_name): a = words(file_name) b = stop_words(stop_name) k = [] for c in a: if c in b: k.append(c) p = [] for d in a: if d not in k: p.append(d) word_c = [] word = [] for i in range(len(p)): if p[i] in word_c: j = 
word_c.index(p[i]) word[j] += 1 else: word_c.append(p[i]) word.append(1) r = [] for i in range(len(word)): r.append([word_c[i],word[i]]) return r def f_hashing(file_name, stop_name,M): a = words(file_name) b = stop_words(stop_name) k = [] for c in a: if c in b: k.append(c) p = [] for d in a: if d not in k: p.append(d) word_or1 = [] for e in p: n = 0 for i in range(len(e)): x = ord(e[i]) n += x*((37)**i) word_or1.append(n%M) word_or1 = sorted(word_or1) word_or2 = [] word_num = [] for i in range(len(word_or1)): if word_or1[i] in word_or2: j = word_or2.index(word_or1[i]) word_num[j] += 1 else: word_or2.append(word_or1[i]) word_num.append(1) z = [] for i in range(len(word_num)): z.append([word_or2[i],word_num[i]]) return z #---------------------------------------------------------- stop_name = 'stopwords.txt' file_name = input('File name = ') s = 1 while s == 1: t = input('Use feature hashing ? (y,Y,n,N) ').lower() if t == 'y' or t == 'n': s = 0 else: print('Try again.') s = 1 if t == 'y': M = int(input('M = ')) print('-------------------') print('char count =',char_count(file_name)) print('alphanumeric count =',alp_count(file_name)) print('line count =',line_count(file_name)) print('word count =',len(words(file_name))) print('BoW =',f_hashing(file_name, stop_name,M)) else: print('-------------------') print('char count =',char_count(file_name)) print('alphanumeric count =',alp_count(file_name)) print('line count =',line_count(file_name)) print('word count =',len(words(file_name))) print('BoW =',BoW(file_name, stop_name))# 6330468521 (2021-03-22 16:42) %diff = 49.15 def char_count(file_name) : fin = open(file_name, "r") char_count = 0 for line in fin : a = line.strip() char_count += len(a) fin.close() return char_count def alphanumeric_count(file_name) : fin = open(file_name, "r") alphanumeric_count = 0 for line in fin : for e in line.strip() : if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9" : alphanumeric_count += 1 else : alphanumeric_count += 0 fin.close() 
return alphanumeric_count def line_count(file_name) : fin = open(file_name, "r") line_count = 0 for line in fin : if len(line) > 0 : line_count += 1 fin.close() return line_count def word_count(file_name) : fin = open(file_name, "r") words = "" for line in fin : for e in line.strip() : if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9" : words += e else : words += " " words_list = words.split() word_count = len(words_list) fin.close() return word_count def BOW_list(file_name) : fin = open(file_name, "r") words_of_BOW2 = [] words_of_BOW = [] for line in fin : words_of_BOW1 = "" for e in line.strip() : if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9" : words_of_BOW1 += e else : words_of_BOW1 += " " words_of_BOW2.append(words_of_BOW1.lower().split()) for i in range(len(words_of_BOW2)) : for e in words_of_BOW2[i] : words_of_BOW.append(e) fin.close() fin1 = open("stopwords.txt", "r") stopwords_list1 = [] stopwords_list = [] for line in fin1 : stopwords_list1.append(line.strip().split()) for i in range(len(stopwords_list1)) : for e in stopwords_list1[i] : stopwords_list.append(e) fin1.close() BOW_list = [] BOW_list[:] = words_of_BOW for e in stopwords_list : k = 0 while k < len(BOW_list) : if BOW_list[k] == e : BOW_list.pop(k) else : k += 1 return BOW_list def fhash(w, M) : G = 37 s = 0 for i in range(len(w)) : s += ord(w[i])*(G**i) s1 = s % int(M) return s1 def Bow_yY(p) : B = [] for e in p : y = fhash(e, int(M)) B.append(y) B1 = [] B2 = [] for e in B : x = 0 for i in range(len(B)) : if e == B[i] : x += 1 B1.append([e,x]) B1.sort() for e in B1 : if e not in B2 : B2.append(e) return B2 def BOW_nN(v) : b = [] b[:] = v list1 = [] #before for e in v : c = [] n = 0 for i in range(len(v)) : if b[i] == e : n += 1 c.append(e) c.append(n) list1.append(c) BOW_nN = [] for e in list1 : if e not in BOW_nN : BOW_nN.append(e) return BOW_nN file_name = input("File name = ") u = input("Use feature hashing ? 
(y,Y,n,N) ") while u not in ["y","Y","n","N"] : print("Try again.") u = input("Use feature hashing ? (y,Y,n,N) ") if u == "y" or u == "Y" : M = input("M = ") print("-------------------") print("char count = "+str(char_count(file_name))) print("alphanumeric count = "+str(alphanumeric_count(file_name))) print("line count = "+str(line_count(file_name))) print("word count = "+str(word_count(file_name))) p = BOW_list(file_name) print("BoW = "+str(Bow_yY(p))) elif u == "n" or u == "N" : print("-------------------") print("char count = "+str(char_count(file_name))) print("alphanumeric count = "+str(alphanumeric_count(file_name))) print("line count = "+str(line_count(file_name))) print("word count = "+str(word_count(file_name))) v = BOW_list(file_name) print("BoW = "+str(BOW_nN(v)))# 6330245921 (2021-03-22 20:17) %diff = 49.75 #Prog-08: Bag-of-words #6330245921 Teetat Karuhawanit def somchai(c): v = open(c,'r') x = '' b = [] for j in v.readlines(): b+= [j.strip()] splitted = '' for i in b: splitted += i.lower()+' ' x = splitted.split() v.close() return ' '.join(x) def paisan(file_name): u = open(file_name) x = u.readlines() alphacount = 0 for b in range(len(x)): x[b] = x[b].strip('\n').lower() for n in x[b]: if n in 'abcdefghijklmnopqrstuvwxyz0123456789': alphacount += 1 u.close() return alphacount def chate(file_name): z = 0 c = somchai(file_name) for i in range(len(c)): z+=1 return((z-thanarat(file_name))+1) def thanarat(file_name): f = open(file_name,'r') v = f.readlines() f.close() return len(v) def pannarai(file_name): d = somchai(file_name) c = len(d.split()) return c def sukree(file_name): a = somchai(file_name) b = somchai('stopwords.txt') x = '' for i in a: if i not in 'abcdefghijklmnopqrstuvwxyz0123456789': x += ' ' else: x += i x = x.split() l = [] for i in x: if i in b: l += [] else: l += [i] return l def fhash(W,M): x = 0 s = 0 G = 37 for i in W: x += ord(i)*(G**s) s += 1 d = x % M return d def kirati(): v = sukree(file_name) a = [] b = [] c = [] d = 0 for i 
in v: if i not in a: a.append(i) b.append(1) else: b[a.index(i)] += 1 for i in a: c += [[i,b[d]]] d += 1 return c def parngod(M): a = [] b = [] c = [] d = 0 for i in sukree(file_name): if fhash(i,M) not in a: a += [fhash(i,M)] b.append(1) else: b[a.index(fhash(i,M))] += 1 for i in a: c += [[i,b[d]]] d += 1 return c file_name = input('File name = ') x = input('Use feature hashing ? (y,Y,n,N) ') while x not in 'yYnN': print('Try again.') x = input('Use feature hashing ? (y,Y,n,N) ') if x in 'Yy': M = int(input('M = ')) print('-------------------') print('char count =',chate(file_name)) print('alphanumeric count =',paisan(file_name)) print('line count =',thanarat(file_name)) print('word count =',pannarai(file_name)) print('BoW =',parngod(M)) else: print('-------------------') print('char count =',chate(file_name)) print('alphanumeric count =',paisan(file_name)) print('line count =',thanarat(file_name)) print('word count =',pannarai(file_name)) print('BoW =',kirati())# 6330412321 (2021-03-22 21:13) %diff = 49.8 file_name = input('File name = ') def char_count(a): a = open(file_name,'r') cc = 0 for line in a: cc += int(len(line.strip())) a.close() return cc def alphanumeric_count(a): a = open(file_name,'r') alp = 0 for line in a: for e in line: if '0' <= e <= '9' or 'A' <= e <= 'Z' or 'a' <= e <= 'z': alp += 1 a.close() return alp def line_count(a): a = open(file_name,'r') lc = 0 for line in a: lc += 1 a.close() return lc def word_count(a): a = open(file_name,'r') k = '' wc = 0 for line in a: for e in line: if (e not in 'abcdefghijklmnopqrstuvwxyz') and (e not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') and(e not in '0123456789'): k += ' ' else: k += e words = k.split() wc += len(words) a.close() return wc def BoW(a): a = open(file_name, "r") stop_words = open("stopwords.txt","r") new = '' for line in a: line = line.lower() for e in line: if (e in 'abcdefghijklmnopqrstuvwxyz') or (e in '0123456789'): new += e else: new += ' ' new1 = new.split(' ') sw = '' for line in stop_words: 
line = line.lower() for e in line: if (e in 'abcdefghijklmnopqrstuvwxyz') or (e in '0123456789'): sw += e else: sw += ' ' sw1 = sw.split(' ') new2 = [] for e in new1: if e in sw1: new2.append('') else: new2.append(e) new3 = [] for e in new2: if e != '': new3.append(e) u = [] v= [] for e in new3: if e not in u: u.append(e) v.append([e,1]) else: t = u.index(e) v[t] = [e,v[t][1]+1] a.close() stop_words.close() return v def fhash(w,M): s = 0 for i in range(len(w)): s += int(ord(w[i])*((37)**i)) fhash = s%M return fhash def new_bow(a): s = BoW(a) u = [] v = [] for e in s: i = fhash(e[0],M) j = e[1] if i not in u: u.append(i) v.append([i,j]) else: k = u.index(i) v[k] = [i,v[k][1]+j] v.sort() return v fh = input('Use feature hashing ? (y,Y,n,N) ') while fh != 'n' and fh != 'N' and fh != 'y' and fh != 'Y': print('Try again.') fh = input('Use feature hashing ? (y,Y,n,N) ') if fh == 'n' or fh == 'N': print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(word_count(file_name))) print('BoW = '+str(BoW(file_name))) elif fh == 'y' or fh == 'Y': M = int(input('M = ')) print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(word_count(file_name))) print('BoW = '+str(new_bow(file_name)))

ALL: cluster #15 (2)

# 6330319721 (2021-03-22 17:04) %diff = 46.04 def fhash(w,M): G = 37 frac = 0 letters = list(w) for i in letters: frac += ord(i)*(G**(len(letters)-1)) ans = frac % M return ans #============================================================= def stopwords(): b = [] stop = open('stopwords.txt') for line in stop: if line != "\n": line1 = line.strip('\n') line2 = line1.split(' ') for j in range(len(line2)): b.append(line2[j]) stop.close() return b #============================================================= def text(file): file = open(file) a = '' for line in file: if line != "\n": line = line.lower() l = line.strip('\n') a += ''.join(l)+' ' file.close() return a #============================================================= def char(file): file = open(file) char = '' for line in file: linex = line.strip() if linex != "\n": line = line.lower() l1 = line.strip('\n') char += ''.join(l1) file.close() ans = len(char) return ans #============================================================= def alphanum(cn): ans = '' for i in cn: if i == ' ': ans += ' ' elif 48<=ord(i)<=57 or 97<=ord(i)<=122 or 65<=ord(i)<=90: ans += i else: ans += ' ' return ans #============================================================= def line(file_name): file = open(file_name) ans = 0 r = file.read() r1 = r.strip('\n') r2 = r1.split('\n') for i in r2: ans += 1 file.close() return ans #============================================================= def BoW(file_name): a1 = file_name.split() ans = [] num = 0 for i in a1: for k in range(len(a1)): if i == a1[k]: num += 1 a2 = [i,num] if a2 in ans: num = 0 else: ans.append([i,num]) num = 0 return ans #============================================================= def BoWfhash(w,m): a1 = w.split() ans = [] list1 = [] for i in a1: feh = fhash(i,m) list1.append(feh) num = 0 for j in list1: for k in range(len(list1)): if j == list1[k]: num+=1 a2 = [j,num] if a2 in ans: num = 0 else: ans.append(a2) num = 0 return ans 
#============================================================= file_name = input('File name = ') yn = input('use feature hashing ? (y,Y,n,N) ') do = 0 b = stopwords() a = text(file_name) cn1 = alphanum(a) cn2 = ''.join(cn1.split()) cut = ' '.join([i for i in cn1.split() if i not in b]) while yn != 'y' or yn != 'Y': if yn == 'n' or yn == 'N': break elif yn == 'y' or yn == 'Y': do = 1 m = input('M = ') break else: print('Try again.') yn = input('Use feature hashing ? (y,Y,n,N) ') if do == 1: print('-------------------') print('char count =', char(file_name)) print('alphanumeric count =', len(cn2)) print('line count =', line(file_name)) print('word count =', len(a.split())) print('BoW =', BoWfhash(cut,int(m))) else: print('-------------------') print('char count =', char(file_name)) print('alphanumeric count =', len(cn2)) print('line count =', line(file_name)) print('word count =', len(cn1.split())) print('BoW =', BoW(cut))# 6330354621 (2021-03-22 23:28) %diff = 46.04 file_name=input('File name = ') #------------------------------------------------------------------------------ def stopwordtolist(): b=[] z=open('stopword.txt') for line in z: if line != "\n": line1=line.strip('\n') line2=line1.split(' ') for j in range(len(line2)): b.append(line2[j]) #word chec use for b.o.w z.close() return b #------------------------------------------------------------------------------ def alphanum(word): text='' for i in word: #alphanumeric if i==' ': text+=' ' elif i in 'abcdefghijklmnopqrstuvwxyz0123456789': text+=i else: text+=' ' return text #------------------------------------------------------------------------------ def linecount(k): file=open(k) line_count = 0 x=file.read() x1=x.strip('\n') x2=x1.split('\n') for i in x2: line_count+=1 file.close() #for line in file: # if line != "\n": #line_count+=1 return line_count #------------------------------------------------------------------------------ def texttosent(file): file=open(file) a='' for line in file: if line != "\n": 
line=line.lower() l1=line.strip('\n') a+=''.join(l1)+' ' file.close() return a #------------------------------------------------------------------------------ def charcount(file): file=open(file) charcount ='' for line in file: linex=line.strip() if linex != "\n": line=line.lower() l1=line.strip('\n') charcount+=''.join(l1) ans=len(charcount) file.close() return ans #------------------------------------------------------------------------------ def allChar(l1): word_stick=''.join(l1.split())#find char count ans=alphanum(word_stick) return ans #------------------------------------------------------------------------------ def BoW(word):#word= alpha a1=word.split() ans=[] num=0 for i in a1: for k in range(len(a1)): if i == a1[k]: num+=1 a2=[i,num] if a2 in ans: num=0 else: ans.append([i,num]) num=0 return ans #feature hashing--------------------------------------------------------------- def BoWfe(w,m):#cut_word='best times worst times age wisdom 555' a1=w.split() listall=[] list1=[] for i in a1: feh=fe(i,m) list1.append(feh) num=0 for j in list1: for k in range(len(list1)): if j == list1[k]: num+=1 a2=[j,num] if a2 in listall: num=0 else: listall.append(a2) num=0 return listall #------------------------------------------------------------------------------ def fe(w,m): sum1=0 k=0 for i in w: sum1+=(ord(i)*(37**(w.find(i,0+k)))) k+=1 ans=sum1 % m return ans #------------------------------------------------------------------------------ chose=0 choice=input('use feature hashing ? (y,Y,n,N) ') while choice!='n' or choice!='N': if choice=='y' or choice=='Y': chose=1 break if choice=='n' or choice=='N': chose=0 break else: print('Try again.') choice=input('use feature hashing ? 
(y,Y,n,N) ') if chose == 1: m=input('M = ') print('-------------------') a=texttosent(file_name) n=linecount(file_name) b=stopwordtolist() alpha=alphanum(a) alpha2=''.join(alpha.split()) cut_word =' '.join([i for i in alpha.split() if i not in b]) print('char count =', charcount(file_name)) print('alphanumeric count =',len(alpha2)) print('line count =', n) print('word count =', len(alpha.split())) print('BoW =',BoWfe(cut_word,int(m))) elif chose == 0: print('-------------------') a=texttosent(file_name) n=linecount(file_name) b=stopwordtolist() alpha=alphanum(a) alpha2=''.join(alpha.split()) cut_word =' '.join([i for i in alpha.split() if i not in b]) #word that already cut stopwords usr for b.o.w print('char count =', charcount(file_name)) print('alphanumeric count =',len(alpha2)) print('line count =', n) print('word count =', len(alpha.split())) print('BoW =',BoW(cut_word))

ALL: cluster #16 (3)

# 6330250021 (2021-03-20 11:58) %diff = 48.31 def fhash(w,M): n=0 for i in range(len(w)): n+=ord(w[i])*37**i return n%M file_name=input('File name = ') fh=input('Use feature hashing ? (y,Y,n,N) ').lower() while fh not in 'yn': print('Try again.') fh=input('Use feature hashing ? (y,Y,n,N) ').lower() if fh=='y': M=int(input('M = ')) stop=open('stopwords.txt','r') sw=[] for line in stop: if len(line.strip())!=0: sw+=line.split() stop.close() ch=0 al=0 li=0 word='' f=open(file_name,'r') for line in f: li+=1 ch+=len(line.strip()) for a in line.lower(): if '0'<=a<='9' or 'a'<=a<='z': word+=a al+=1 else: word+=' ' wordlist=word.split() wd=len(wordlist) fhlist=[] bow=[] if fh=='n': for w in wordlist: if [w,wordlist.count(w)] not in bow and w not in sw: bow.append([w,wordlist.count(w)]) elif fh=='y': for w in wordlist: if w not in sw: fhlist.append(fhash(w,M)) for x in fhlist: if [x,fhlist.count(x)] not in bow: bow.append([x,fhlist.count(x)]) bow.sort() print('-------------------') print('char count =',ch) print('alphanumeric count =',al) print('line count =',li) print('word count =',wd) print('BoW =',bow)# 6330507321 (2021-03-18 21:59) %diff = 48.31 def fhash(w,M): c=0 for i in range(len(w)): c += ord(w[i])*(37**(i)) c=c%int(M) return c x=input('File name = ',) y=input('Use feature hashing ? (y,Y,n,N) ',) while y not in ['y','Y','n','N']: print('Try again.') y=input('Use feature hashing ? 
(y,Y,n,N) ',) if y.lower()=='y': M=input('M = ',) k=[] File=open(x,'r') a='' lc=0 chc=0 for line in File: for e in line.strip(): if (('a'<=e.lower() and e.lower()<='z') or ('0'<= e<='9')): a+=e else: a+=' ' chc+=1 lc+=1 a=a.lower().split() wc=len(a) File.close() stop=open('stopwords.txt','r') b='' for line in stop: b += line+' ' b=b.split() alm=0 for e in a: l=len(e) for i in range(len(e)): if not (('a'<=e[i].lower() and e[i].lower()<='z') or ('0'<= e[i]<='9')): l-=1 alm+=l stop.close() File=open(x,'r') B=[] for e in a: if not e in b: B.append(e) B.sort() B.append(' ') h=1 j=[] if y in['N','n']: for i in range(len(B)-1): if B[i]==B[i+1]: h+=1 else: j.append([B[i],h]) h=1 else: k=[] B.remove(' ') for e in B: k.append(fhash(e,M)) k.sort() k.append(111) for i in range(len(k)-1): if k[i]==k[i+1]: h+=1 else: j.append([k[i],h]) h=1 File.close() print('-------------------') print('char count =',chc) print('alphanumeric count =',alm) print('line count =',lc) print('word count =',wc) print('BoW =',j)# 6330434121 (2021-03-22 17:39) %diff = 48.36 def flash(w,m): c = 0 for i in range(len(w)): c = c + (ord(w[i])*(37**i)) number = c%int(m) return number file_name = input("File name = ") feature = input("Use feature hashing ? (y,Y,n,N) ") m = 0 if not feature == "y" and feature == "Y" and feature == "n" and feature == "N": a = 0 while a == 0: print("Try again") feature = input("Use feature hashing ? 
(y,Y,n,N) ") if feature == "y" or feature == "Y" or feature == "n" or feature == "N": if feature == "y" or feature == "Y": m = input("M = ") break else: if feature == "y" or feature == "Y": m = input("M = ") stopword = [] stopwords = open("stopwords.txt", "r") for line in stopwords: words = line.strip().split() for i in range(len(words)): stopword.append(words[i]) stopwords.close() file = [] newword = "" line_count = 0 word_count = 0 char_count = 0 alphanumeric_count = 0 files = open(file_name, "r",encoding="utf-8") for line in files: line_count += 1 char_count += len(line.strip("\n"))+1 check_word = line.strip().lower() for i in range(len(check_word)): if check_word[i] not in "abcdefghijklmnopqrstuvwxyz0123456789": newword = newword+" " else: newword = newword + check_word[i] alphanumeric_count += 1 file = file + newword.strip().split() newword = "" files.close() char_count = char_count - line_count word_count = len(file) new_file = [] for i in range(len(file)): if not file[i] in stopword: new_file.append(file[i]) new_file.sort() bow = [] bow1 = [] c = 1 for i in range(len(new_file)-1): if new_file[i]==new_file[i+1]: c +=1 else: bow.append([new_file[i],c]) c = 1 if not m == 0: e = "" f = [] for i in range(len(new_file)): e = flash(new_file[i],m) f.append(e) f.sort() for i in range(len(f)-1): if f[i]==f[i+1]: c +=1 else: bow1.append([f[i],c]) c = 1 bow1.append([f[-1],c]) print("-------------------") print("char count = ",char_count) print("alphanumeric count = ",alphanumeric_count) print("line count = ",line_count) print("word count = ",word_count) if m == 0: print("BoW = ",bow) else: print("BoW = ",bow1)

ALL: cluster #17 (2)

# 6330402021 (2021-03-21 23:25) %diff = 48.69 def fhash(w,m): c = 0 for i in range(len(w)) : c += ord(w[i])*(37**i) return c % m #------------------------------------------------------------------------------------- file_name = input("File name = ") hashing = input("Use feature hashing ? (y,Y,n,N) ") while hashing not in ["Y","y","n","N"]: print("Try again.") hashing = input("Use feature hashing ? (y,Y,n,N) ") if hashing in ["Y","y"]: m = int(input("M = ")) print("-------------------") stopword = open("stopword.txt","r") sw = [] for i in stopword: word_char1 = "" for a in i: if a.isalpha() == True : word_char1 += a.lower() elif word_char1 != "" : sw.append(word_char1) word_char1 = "" else: word_char1 = "" if word_char1.isalpha() == True : sw.append(word_char1) stopword.close() line = 0 char = 0 alnum = 0 file = open(file_name,"r") for i in file: line += 1 char += len(i)-1 for a in i: if a.isalnum() == True : alnum += 1 h = a if h != "\n" : char += 1 file.close() print("char count = ",char) print("alphanumeric count = ",alnum) print("line count = ",line) file = open(file_name,"r") word_char = [] for i in file : word_char1 = "" for a in i: if a.isalnum() == True : word_char1 += a.lower() elif word_char1 != "" : word_char.append(word_char1) word_char1 = "" else: word_char1 = "" if word_char1.isalnum() == True : word_char.append(word_char1) file.close() word = len(word_char) print("word count = ",word) word_clear = [] for i in word_char: if i not in sw : word_clear.append(i) BoW = [] for i in word_clear: bow_count = 0 for a in range(len(word_clear)): if i == word_clear[a]: bow_count += 1 if [i,bow_count] not in BoW : BoW.append([i,bow_count]) if hashing in "Nn": print("BoW = ",BoW) elif hashing in "Yy": BoW_fhash = [] for i in BoW : BoW_fhash.append([fhash(i[0],m),i[1]]) BoW_fhash_clear = [] for i in BoW_fhash: bow_fhash_count = 0 for a in range(len(BoW_fhash)): if i[0] == BoW_fhash[a][0]: bow_fhash_count += BoW_fhash[a][1] if [i[0],bow_fhash_count] not in 
BoW_fhash_clear : BoW_fhash_clear.append([i[0],bow_fhash_count]) print("BoW = ",BoW_fhash_clear)# 6330577221 (2021-03-22 02:52) %diff = 48.69 #Prog-08: Bag-of-Words #6330577221 Name Akrachai Kovittayanun def fhash (w,M): allord=0 for i in range(len(w)): o=ord(w[i])*37**i allord+=o return allord%M filename=input('File name = ') bow=input('Use feature hashing ? (y,Y,n,N)) ') while bow not in ('y','Y','n','N'): print('Try again.') bow=input('Use feature hashing ? (y,Y,n,N)) ') if bow == 'y' or bow == 'Y': m=int(input('M = ')) print('-------------------') stopwords=[] s_file=open('stopwords.txt','r') for line in s_file: for w in line.strip().split(): stopwords.append(w) s_file.close() text='' textalnum='' wordstext='' line_count=0 file = open(filename,'r') for line in file: for e in line.strip(): text+=e text+=' ' line_count+=1 char=len(text)-line_count print('char count =',char) for e in text: if e.isalnum()!=True: textalnum+='' else: textalnum+=e alnum=len(textalnum) print('alphanumeric count =',alnum) print('line count =',line_count) for e in text: if e.isalnum()==True: wordstext+=e.lower() else: wordstext+=' ' wordslist=wordstext.strip().split() wordcount=len(wordslist) print('word count =',wordcount) uniquelist=[] for e in wordslist: if e not in stopwords: uniquelist.append(e) fhashlist=[] if bow=='y' or bow=='Y': for e in uniquelist: fhashwords=fhash(e,m) fhashlist.append(fhashwords) uniquelist=fhashlist output=[] point=0 for i in range(len(uniquelist)): if uniquelist[i] not in uniquelist[i+1:] and uniquelist[i] not in uniquelist[:i]: output.append([uniquelist[i],1]) if uniquelist[i] in uniquelist[i+1:] and uniquelist[i] not in uniquelist[:i]: point+=1 j=i+1 while j in range(len(uniquelist)) and uniquelist[i] in uniquelist[j:]: point+=1 j=uniquelist[j:].index(uniquelist[i])+j+1 output.append([uniquelist[i],point]) point=0 print('BoW =',output) file.close()

ALL: cluster #18 (2)

# 6330266021 (2021-03-22 23:55) %diff = 49.41
def fhash(w,M):
    """Return the feature-hash bucket of the word ``w`` among ``M`` buckets.

    The hash is the sum of ``ord(w[i]) * 37**i`` over every character of
    ``w``, reduced modulo ``M``.
    """
    total = 0
    for index, ch in enumerate(w):
        # The exponent is the character's own position. The original used
        # index-1 for every character after the first, which gave the first
        # two characters the same weight (37**0).
        total += ord(ch) * 37 ** index
    return total % M

file_name = input("File name = ")
# Re-prompt until the answer is one of y, Y, n, N.
while True:
    feature = input("Use feature hashing ? (y,Y,n,N) ")
    if feature in ('Y', 'y', 'N', 'n'):
        break
    print("Try again.")
use_hashing = feature in ('Y', 'y')
if use_hashing:
    M = int(input("M = "))
# 19 dashes, the same separator the other submissions print.
print("-------------------")

# Stop words are the whitespace-separated tokens of stopwords.txt.
with open('stopwords.txt', 'r') as stopwords:
    ban = set(stopwords.read().split())

charCount = 0
alnumCount = 0
lineCount = 0
wordCount = 0
temp = []  # cleaned, non-stop words in order of appearance

# The input file is opened in a with-block so it is always closed; the
# original rebound file_name to the handle and never closed it.
with open(file_name, "r") as text_file:
    for text in text_file:
        lineCount += 1
        charCount += len(text)  # the line terminator is included in this count
        alnumCount += sum(1 for t in text if t.isalnum())
        textArray = text.lower().split()
        wordCount += len(textArray)
        for raw in textArray:
            # Drop punctuation inside the token, then test BOTH spellings
            # against the stop list so that "the," is filtered just like "the"
            # (the original filtered before stripping punctuation).
            cleaned = ''.join(c for c in raw if c.isalnum())
            if not cleaned or raw in ban or cleaned in ban:
                continue
            temp.append(cleaned)

# print() inserts the separating space itself; "alphanumeric" was misspelled
# "alphanumberic" in the original label.
print("char count =", charCount)
print("alphanumeric count =", alnumCount)
print("line count =", lineCount)
print("word count =", wordCount)

# Count by word or by hash bucket in order of first occurrence (dict keeps
# insertion order) and print [key, count] lists rather than tuples.
tally = {}
for word in temp:
    key = fhash(word, M) if use_hashing else word
    tally[key] = tally.get(key, 0) + 1
res = [[key, count] for key, count in tally.items()]
print("BoW =", res)

# 6330269021 (2021-03-21 21:55) %diff = 49.41
def fhash(w, m):
    """Return the feature-hash bucket of the word ``w`` among ``m`` buckets.

    The hash is the sum of ``ord(w[i]) * 37**i`` over every character of
    ``w``, reduced modulo ``m``.
    """
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m

fileName = input("File name = ").strip()
# Re-prompt until the (trimmed, lower-cased) answer is y or n.
while True:
    fhashMode = input("Use feature hashing ? (y,Y,n,N) ").strip().lower()
    if fhashMode in ('y', 'n'):
        break
    print("Try again.")
if fhashMode == 'y':
    m = int(input("M = "))
print("-------------------")

# Stop words are the whitespace-separated tokens of stopwords.txt.
with open("stopwords.txt") as stopWordsFile:
    stopWords = set(stopWordsFile.read().split())

chCount = 0
alnumCount = 0
lineCount = 0
wordTemp = ""
words = []
with open(fileName) as inputFile:
    for line in inputFile:
        lineCount += 1
        for ch in line:
            chCount += 1  # every character, line terminators included
            if ch.isalnum():
                alnumCount += 1
                wordTemp += ch
            elif wordTemp != "":
                words.append(wordTemp)
                wordTemp = ""
# Flush the last word only when there is one. The original appended
# unconditionally, which added an empty string and inflated the word count.
if wordTemp != "":
    words.append(wordTemp)
wordCount = len(words)

print("char count =", chCount)
print("alphanumeric count =", alnumCount)
print("line count =", lineCount)
print("word count =", wordCount)

# Count lower-cased non-stop words in order of first occurrence. In hashing
# mode the key is the bucket, so words that collide are merged into a single
# entry instead of producing duplicate keys.
tally = {}
for e in words:
    lowered = e.lower()
    if lowered in stopWords:
        continue
    key = fhash(lowered, m) if fhashMode == 'y' else lowered
    tally[key] = tally.get(key, 0) + 1
BoW = [[key, count] for key, count in tally.items()]
print("BoW =", BoW)