ALL (408)

# 6030048821 (0.00) 1 (2021-03-21 15:46)
import string

# ASCII letters and digits are the only characters treated as alphanumeric.
_ALNUM = frozenset(string.ascii_letters + string.digits)


def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` reduced modulo ``M``.

    The value is ``sum(ord(w[i]) * 37**i) % M``; an empty word hashes to 0.
    """
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def fileProcess(filename):
    """Collect text statistics and the lower-cased word list of ``filename``.

    Returns ``[(char_count, alnum_count, line_count, word_count), words]``.
    Only the line terminator is left out of the character count (leading and
    trailing blanks are real characters), and a word is a maximal run of
    ASCII letters/digits.
    """
    words = []
    char_count = alnum_count = line_count = 0
    with open(filename) as f:
        for line in f:
            line_count += 1
            text = line.rstrip('\n')
            char_count += len(text)
            alnum_count += sum(1 for c in text if c in _ALNUM)
            # Every non-alphanumeric character acts as a word separator.
            spaced = ''.join(c if c in _ALNUM else ' ' for c in text)
            words += spaced.lower().split()
    return [(char_count, alnum_count, line_count, len(words)), words]


def listProcess(words):
    """Return ``words`` without the entries listed in ``stopwords.txt``."""
    stopwords = set(fileProcess('stopwords.txt')[1])
    return [c for c in words if c not in stopwords]


def bow(words, isFeature=False, M=0):
    """Build a sorted bag of words as ``[[key, count], ...]``.

    When ``isFeature`` is true each word is replaced by ``fhash(word, M)``
    before counting, so colliding words share a single entry.
    """
    counts = {}
    for word in words:
        key = fhash(word, M) if isFeature else word
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing mode, then print the report."""
    filename = input('File name = ').strip()
    while (prompt := input('Use feature hashing ? (y,Y,n,N) ').strip().lower()) not in ['y', 'n']:
        print('Try again.')
    use_hash = prompt == 'y'
    M = int(input('M = ')) if use_hash else 0
    stats, words = fileProcess(filename)
    Bow = bow(listProcess(words), use_hash, M)
    print('-------------------')
    print('char count = ' + str(stats[0]))
    print('alphanumeric count = ' + str(stats[1]))
    print('line count = ' + str(stats[2]))
    print('word count = ' + str(stats[3]))
    print('BoW = ' + str(Bow))


if __name__ == '__main__':
    main()
# 6030380021 (30.00) 2 (2021-03-22 22:36)
# ASCII letters and digits; every other character separates words.
ALNUM = frozenset(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
)


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    val = 0
    for i, alpha in enumerate(w):
        val += ord(alpha) * pow(37, i)
    return val % M


def load_stopwords(path="stopwords.txt"):
    """Return the whitespace-separated tokens of ``path`` as a set."""
    with open(path) as f0:
        return set(f0.read().split())


def analyze(lines, stopword, M=None):
    """Scan ``lines`` and return the statistics plus the bag of words.

    Returns ``(charcount, alphanumcount, linecount, wordcount, bow)``.
    Newline characters are not counted as characters. Stop words count as
    words but are left out of ``bow``. When ``M`` is given the bag is keyed
    by ``fhash(word, M)`` instead of by the word itself.
    """
    charcount = alphanumcount = linecount = wordcount = 0
    counts = {}
    for line in lines:
        linecount += 1
        pieces = []
        for alpha in line:
            if alpha == "\n":
                continue
            charcount += 1
            if alpha in ALNUM:
                alphanumcount += 1
                pieces.append(alpha.lower())
            else:
                pieces.append(" ")
        for word in "".join(pieces).split():
            wordcount += 1
            if word in stopword:
                continue
            key = word if M is None else fhash(word, M)
            counts[key] = counts.get(key, 0) + 1
    bow = sorted([key, n] for key, n in counts.items())
    return charcount, alphanumcount, linecount, wordcount, bow


def main():
    """Run the interactive prompts and print the report."""
    file_name = input('File name = ').strip()
    h = input('Use feature hashing ? (y,Y,n,N) ').strip()
    while h not in ("n", "N", "y", "Y"):
        print("Try again.")
        h = input('Use feature hashing ? (y,Y,n,N) ').strip()
    M = None
    if h in ("y", "Y"):
        # Report a bad modulus instead of exiting silently.
        try:
            M = int(input('M = '))
        except ValueError:
            raise SystemExit("M must be an integer") from None
        if M <= 0:
            raise SystemExit("M must be a positive integer")
    stopword = load_stopwords()
    with open(file_name) as f1:
        charcount, alphanumcount, linecount, wordcount, bow = analyze(f1, stopword, M)
    print('-------------------')
    print('char count =', charcount)
    print('alphanumeric count =', alphanumcount)
    print('line count =', linecount)
    print('word count =', wordcount)
    print('BoW =', bow)


if __name__ == "__main__":
    main()
# 6030924521 (20.15) 3 (2021-03-21 12:43) def read_file(filename): def check_alp_num(sentences_): for i in range(len(sentences_)): sentences_[i] = sentences[i].lower() sent = '' for w in sentences_[i]: if w.isalnum() or w == ' ': sent += w sentences_[i] = sent return sentences_ line_count = 0 sentences = [] with open(filename) as f: for i in f: line_count += 1 sentences.append(i.replace('\n', '')) char_count = sum(list(map(len, sentences))) sentences = check_alp_num(sentences) sent2 = [sentences[i].replace(' ', '') for i in range(len(sentences)) ] alnum_count = sum(list(map(len, sent2))) for j in range(len(sentences)): sentences[j] = sentences[j].split(' ') word_count = sum(list(map(len, sentences))) bow = [word for sent in sentences for word in sent] return line_count, char_count, alnum_count , word_count, bow def remove_stop_words(stop_file, bagofwords): stop_list = [] with open(stop_file) as f: stop_list += f.read().split() finalbow = [] for word in bagofwords: if word not in stop_list: finalbow.append(word) return finalbow def select_hashing(): c = True fhash = '' while c: i = input('Use feature hashing ? 
(y,Y,n,N) ') if i in ['y', 'Y', 'n', 'N']: fhash = i c = False else: print('Try again') return fhash def fhashing(fhash_select, bow): if fhash_select == 'y' or fhash_select == 'Y': m = int(input('M = ')) bow_list = [] for word in bow: fhash_score = 0 for i in range(len(word)): fhash_score+= ord(word[i])*(37**i) fhash_score = fhash_score%m bow_list.append(fhash_score) bow_list = sorted(Counter_(bow_list)) return bow_list else: return sorted(Counter_(bow)) def Counter_(listt): key = [] count = [] for i in listt: if i in key: count[key.index(i)] += 1 else: key.append(i) count.append(1) counter = [[key[i], count[i]] for i in range(len(key))] return counter def test1(): filename = input('File name = ') line_count, char_count, alnum_count, word_count, bow = read_file(filename) final_bow = remove_stop_words('stopwords.txt', bow) fhash_select = select_hashing() result = fhashing(fhash_select, final_bow) print('-------------------') print('char count =', char_count) print('alphanumeric count =', alnum_count) print('line count =', line_count) print('word count =', word_count) print('BoW =',result) #========== Run Test ================================================ test1() #========== End Test ================================================
# 6130097621 (30.00) 4 (2021-03-22 22:09)
# ASCII letters and digits; every other character separates words.
ALNUM = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
)


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    a = 0
    for i, ch in enumerate(w):
        a += ord(ch) * (37 ** i)
    return a % M


def charcount(file_name):
    """Count the characters in an iterable of lines, excluding newlines.

    Newlines are skipped individually, so the result is correct whether or
    not the final line is newline-terminated.
    """
    return sum(1 for line in file_name for i in line if i != "\n")


def alphanumericcount(file_name):
    """Count the ASCII letters and digits in an iterable of lines."""
    return sum(1 for line in file_name for i in line if i in ALNUM)


def linecount(file_name):
    """Count the lines yielded by ``file_name``."""
    return sum(1 for _ in file_name)


def wordcount(file_name):
    """Count the maximal alphanumeric runs in an iterable of lines."""
    return len(split1(file_name))


def split1(stop1):
    """Return the lower-cased words found in an iterable of lines."""
    words = []
    for line in stop1:
        spaced = "".join(i if i in ALNUM else " " for i in line)
        words += spaced.lower().split()
    return words


def _tally(keys):
    """Return the sorted ``[[key, count], ...]`` list for ``keys``."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def bown(file_name, stop):
    """Return the sorted word bag of ``file_name`` without stop words.

    ``file_name`` and ``stop`` are iterables of lines; the stop-word lines
    used to be read from a module-level global.
    """
    stop_words = set(split1(stop))
    return _tally(w for w in split1(file_name) if w not in stop_words)


def bowy(file_name, stop, M):
    """Like :func:`bown`, but keyed by ``fhash(word, M)``."""
    stop_words = set(split1(stop))
    return _tally(fhash(w, M) for w in split1(file_name) if w not in stop_words)


def main():
    """Prompt the user, read both files once and print the report."""
    file_name = str(input("File name = "))
    while True:
        fh = input("Use feature hashing ? (y,Y,n,N) ")
        if fh in ("Y", "y", "N", "n"):
            break
        print("Try again.")
    M = int(input("M = ")) if fh in ("Y", "y") else None
    with open(file_name) as text:
        lines = text.readlines()
    with open("stopwords.txt") as stop_file:
        stop = stop_file.readlines()
    print("-------------------")
    print("char count =", charcount(lines))
    print("alphanumeric count =", alphanumericcount(lines))
    print("line count =", linecount(lines))
    print("word count =", wordcount(lines))
    if M is None:
        print("BoW =", bown(lines, stop))
    else:
        print("BoW =", bowy(lines, stop, M))


if __name__ == "__main__":
    main()
# 6130917221 (22.99) 5 (2021-03-22 23:53)
# ASCII letters and digits; every other character separates words.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
)


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    G = 37
    x = 0
    for i, ch in enumerate(w):
        x += ord(ch) * (G ** i)
    return x % M


def split_words(line):
    """Return the lower-cased alphanumeric runs of ``line``.

    The last word is returned even when no delimiter follows it.
    """
    spaced = ''.join(e if e in ALNUM else ' ' for e in line)
    return spaced.lower().split()


def ask_hashing():
    """Return the modulus M when hashing is requested, otherwise ``None``."""
    while True:
        fh = input('Use feature hashing ? (y,Y,n,N) ')
        if fh in ('y', 'Y'):
            return int(input('M = '))
        if fh in ('n', 'N'):
            return None
        print('Try again.')


def main():
    """Prompt the user and print the statistics and the sorted BoW."""
    file_name = input('File name = ')
    M = ask_hashing()
    print('-------------------')
    with open('stopwords.txt', 'r') as stop_words:
        sw = set(stop_words.read().lower().split())
    cc = al = lc = 0
    words = []
    with open(file_name, 'r') as text:
        for line in text:
            lc += 1
            body = line.rstrip('\n')  # the newline is not a character
            cc += len(body)
            found = split_words(body)
            al += sum(len(w) for w in found)
            words += found
    print('char count =', cc)
    print('alphanumeric count =', al)
    print('line count =', lc)
    print('word count =', len(words))
    counts = {}
    for w in words:
        if w in sw:
            continue
        key = w if M is None else fhash(w, M)
        counts[key] = counts.get(key, 0) + 1
    BoW = sorted([key, n] for key, n in counts.items())
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6130924621 (14.05) 6 (2021-03-22 23:51)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    G = 37
    count = 0
    for i, ch in enumerate(w):
        count += ord(ch) * (G ** i)
    return count % M


def clean_words(line):
    """Return the lower-cased words of ``line``.

    A word is a maximal run of ASCII letters or digits; every other
    character, punctuation included, is a separator.
    """
    spaced = ''
    for charecter in line:
        lowered = charecter.lower()
        if 'a' <= lowered <= 'z' or '0' <= charecter <= '9':
            spaced += lowered
        else:
            spaced += ' '
    return spaced.split()


def main():
    """Prompt the user and print the statistics and the sorted BoW."""
    filename = input('File name = ')
    while True:
        isHashing = input('Use feature hashing ? (y,Y,n,N) ')
        # A tuple test rejects the empty answer, which `in 'YyNn'` accepted.
        if isHashing in ('Y', 'y', 'N', 'n'):
            break
        print('Try again.')
    m = int(input('M = ')) if isHashing in ('y', 'Y') else None
    print('-------------------')
    with open('stopwords.txt') as stopFile:
        stopWords = set(stopFile.read().lower().split())
    charCount = textAndNumCount = wordCount = lineCount = 0
    counts = {}
    with open(filename) as textFile:
        for line in textFile:
            lineCount += 1
            charCount += len(line.rstrip('\n'))
            for cleanWord in clean_words(line):
                wordCount += 1
                textAndNumCount += len(cleanWord)
                if cleanWord in stopWords:
                    continue
                key = cleanWord if m is None else fhash(cleanWord, m)
                counts[key] = counts.get(key, 0) + 1
    BoW = sorted([key, n] for key, n in counts.items())
    print('char count = ', charCount)
    print('alphanumeric count = ', textAndNumCount)
    print('line count = ', lineCount)
    print('word count = ', wordCount)
    print('BoW = ', BoW)


if __name__ == '__main__':
    main()
# 6230041021 (23.90) 7 (2021-03-21 16:09)
# Lower-case ASCII letters and digits (input is lower-cased before testing).
_LOWER_ALNUM = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')


def Cr_sent(text):
    """Split an iterable of lines into whitespace-separated tokens.

    Returns ``(tokens, line_count, total_length)`` where ``total_length`` is
    the summed length of the lines, newline characters included.
    """
    text_word = []
    line_c = 0
    Total = 0
    for i in text:
        text_word += i.split()
        Total += len(i)
        line_c += 1
    return (text_word, line_c, Total)


def alpha(text):
    """Return all letters/digits of the tokens in ``text``, lower-cased."""
    return ''.join(c for i in text for c in i.lower() if c in _LOWER_ALNUM)


def findco(wordf, txt):
    """Count whole-word occurrences of ``wordf`` in space-separated ``txt``.

    Returns 0 when the word is absent; partial matches inside longer words
    are not counted.
    """
    return txt.split().count(wordf)


def bow(Bow_find):
    """Return the sorted ``[[word, count], ...]`` bag for ``Bow_find``."""
    counts = {}
    for e in Bow_find:
        counts[e] = counts.get(e, 0) + 1
    return sorted([word, n] for word, n in counts.items())


def lowewr(File_sen_low):
    """Return the lower-cased words contained in the given tokens.

    Each token is split on every non-alphanumeric character, so the result
    may be longer than the input (``"a,b"`` gives ``"a"`` and ``"b"``).
    """
    file_low = []
    for i in File_sen_low:
        spaced = ''.join(c if c in _LOWER_ALNUM else ' ' for c in i.lower())
        file_low += spaced.split()
    return file_low


def fash(File_c, m):
    """Return the hashed bag ``[[hash, count], ...]`` sorted by hash.

    Each word is hashed with ``sum(ord(c) * 37**i) % m``. Only the buckets
    actually used are stored, so a large ``m`` costs no extra memory.
    """
    G = 37
    counts = {}
    for i in File_c:
        w = sum(ord(ch) * (G ** e) for e, ch in enumerate(i)) % m
        counts[w] = counts.get(w, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt the user and print the statistics and the BoW."""
    File_name = input('File name = ')
    hashimg = input('Use feature hashing ? (y,Y,n,N) ')
    while hashimg not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        hashimg = input('Use feature hashing ? (y,Y,n,N) ')
    m = int(input('M = ')) if hashimg in ('y', 'Y') else None
    with open(File_name, 'r') as File:
        lines = File.readlines()
    with open('stopwords.txt', 'r') as stop:
        stop_sen = Cr_sent(stop)[0]
    File_sen, line_File, Total_File = Cr_sent(lines)
    # Subtract the newlines actually present rather than assuming that only
    # the last line lacks one.
    newlines = sum(1 for line in lines if line.endswith('\n'))
    words = lowewr(File_sen)
    stop_words = set(lowewr(stop_sen))
    flie_cantstop02 = [i for i in words if i not in stop_words]
    if m is None:
        qwe = bow(flie_cantstop02)
    else:
        qwe = fash(flie_cantstop02, m)
    print('-------------------')
    print('char count =', Total_File - newlines)
    print('alphanumeric count =', len(alpha(File_sen)))
    print('line count =', line_File)
    print('word count =', len(words))
    print('BoW =', qwe)


if __name__ == '__main__':
    main()
# 6230092021 (24.00) 8 (2021-03-21 22:31)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``; ``M`` may be an int or a str."""
    f = 0
    for c, i in enumerate(w):
        f += ord(str(i)) * 37 ** c
    return f % int(M)


def line_words(line):
    """Return the lower-cased ASCII alphanumeric runs of ``line``."""
    a = ''
    for i in line.lower():
        if 'a' <= i <= 'z' or '0' <= i <= '9':
            a += i
        else:
            a += ' '
    return a.split()


def count_bow(keys):
    """Return the sorted ``[[key, count], ...]`` list for ``keys``."""
    tally = {}
    for key in keys:
        tally[key] = tally.get(key, 0) + 1
    return sorted([key, n] for key, n in tally.items())


def main():
    """Prompt the user and print the statistics and the BoW."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    # Keep asking until the answer is valid; M is read after any retry too.
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if fh in ('y', 'Y') else None
    char_count = alph_count = line_count = 0
    words = []
    with open(file_name, 'r') as infile:
        for line in infile:
            line_count += 1
            char_count += len(line.rstrip('\n'))
            w1 = line_words(line)
            alph_count += sum(len(w) for w in w1)
            words += w1
    with open('stopwords.txt', 'r') as st:
        stopw = set(st.read().split())
    kept = [i for i in words if i not in stopw]
    if M is None:
        BoW = count_bow(kept)
    else:
        BoW = count_bow(fhash(i, M) for i in kept)
    print('-------------------')
    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alph_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(len(words)))
    print('BoW = ' + str(BoW))


if __name__ == '__main__':
    main()
# 6230131921 (20.55) 9 (2021-03-22 22:13)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    a = 0
    for i, ch in enumerate(w):
        a += ord(ch) * (37 ** i)
    return a % M


def countword(text4):
    """Return the words (ASCII alphanumeric runs) of an iterable of lines.

    Case is preserved. A word that ends a line or the input without a
    trailing delimiter is still returned.
    """
    c = []
    for line in text4:
        b = ""
        for e1 in line:
            if "0" <= e1 <= "9" or "a" <= e1 <= "z" or "A" <= e1 <= "Z":
                b += e1
            elif b != "":
                c.append(b)
                b = ""
        if b != "":
            c.append(b)  # flush the word left open at the end of the line
    return c


def BoW(text5, stplist, M=None):
    """Return the sorted ``[[key, count], ...]`` bag of ``text5``.

    ``text5`` is an iterable of lines and ``stplist`` a collection of stop
    words. Words are lower-cased before filtering. When ``M`` is given the
    keys are ``fhash(word, M)`` values instead of words.
    """
    stops = {s.lower() for s in stplist}
    counts = {}
    for e2 in countword(text5):
        e2 = e2.lower()
        if e2 in stops:
            continue
        key = e2 if M is None else fhash(e2, M)
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt the user, read each file once and print the report."""
    file_name = input("File name = ")
    with open(file_name, "r") as text:
        lines = text.readlines()
    with open("stopwords.txt", "r") as stp:
        stplist = countword(stp)
    while True:
        fhinput = input("Use feature hashing ? (y,Y,n,N) ")
        if fhinput in ("y", "Y", "n", "N"):
            break
        print("Try again.")
    mInput = int(input("M = ")) if fhinput in ("y", "Y") else None
    x = sum(1 for line in lines for e in line if e != "\n")
    words = countword(lines)
    y = sum(len(w) for w in words)
    print("-------------------")
    print("char count =", x)
    print("alphanumeric count =", y)
    print("line count =", len(lines))
    print("word count =", len(words))
    print("BoW =", BoW(lines, stplist, mInput))


if __name__ == "__main__":
    main()
# 6230133121 (30.00) 10 (2021-03-20 03:19)
def char_count(file):
    """Return the number of characters in the file named ``file``.

    Only line terminators are excluded; leading and trailing blanks count.
    """
    with open(file, 'r') as F:
        return sum(len(line.rstrip('\n')) for line in F)


def stop_words(stop_file):
    """Return the lower-cased tokens of ``stop_file`` as a list."""
    with open(stop_file, 'r') as ST:
        return ST.read().lower().split()


def BoW(word):
    """Return the sorted ``[[item, count], ...]`` bag for the list ``word``."""
    tally = {}
    for item in word:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, n] for item, n in tally.items())


def fhash(a, M):
    """Return ``sum(ord(a[i]) * 37**i) % M``."""
    f = 0
    for i, s in enumerate(a):
        f += ord(s) * 37 ** i
    return f % M


def main():
    """Prompt the user and print the statistics and the BoW."""
    file_name = input('File name = ')
    yn = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while yn not in ['y', 'n']:
        print('Try again.')
        yn = input('Use feature hashing ? (y,Y,n,N) ').lower()
    M = int(input('M = ')) if yn == 'y' else None
    stop = set(stop_words('stopwords.txt'))
    word = []
    alphanumeric = linc = wordc = 0
    with open(file_name, 'r') as fn:
        for line in fn:
            linc += 1
            S = ''
            for s in line:
                if 'a' <= s <= 'z' or 'A' <= s <= 'Z' or '0' <= s <= '9':
                    S += s
                else:
                    S += ' '
            WORD = S.lower().split()
            wordc += len(WORD)
            for e in WORD:
                alphanumeric += len(e)
                if e not in stop:
                    word.append(e)
    if M is not None:
        word = [fhash(e, M) for e in word]
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alphanumeric)
    print('line count =', linc)
    print('word count =', wordc)
    print('BoW =', BoW(word))


if __name__ == '__main__':
    main()
# 6230153721 (0.00) 11 (2021-03-22 21:21)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    r = 0
    for i, ch in enumerate(w):
        r += ord(ch) * (37 ** i)
    return r % M


def is_alnum(ch):
    """Tell whether ``ch`` is an ASCII letter or digit."""
    return 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9'


def words_of(line):
    """Return the lower-cased alphanumeric runs of ``line``."""
    g = ''.join(i if is_alnum(i) else ' ' for i in line)
    return g.lower().split()


def read_stopwords(path='stopwords.txt'):
    """Return the words of ``path`` as a set."""
    s = set()
    with open(path) as fin:
        for line in fin:
            s.update(words_of(line))
    return s


def main():
    """Prompt the user and print the statistics and the sorted BoW.

    Both hashing modes share one code path, so the non-hashing mode no
    longer refers to an undefined ``M``.
    """
    s = read_stopwords()
    file_name = input('File name = ')
    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a != 'y' and a != 'Y' and a != 'n' and a != 'N':
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if a in ('y', 'Y') else None
    # The separator is printed after every prompt has been answered.
    print('-------------------')
    character = alpha = lines = word = 0
    counts = {}
    with open(file_name) as fine:
        for line in fine:
            lines += 1
            character += sum(1 for i in line if i != '\n')
            alpha += sum(1 for i in line if is_alnum(i))
            for g in words_of(line):
                word += 1
                if g in s:
                    continue
                key = g if M is None else fhash(g, M)
                counts[key] = counts.get(key, 0) + 1
    BoW = sorted([key, n] for key, n in counts.items())
    print('char count =', character)
    print('alphanumeric count =', alpha)
    print('line count =', lines)
    print('word count =', word)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6230154321 (18.10) 12 (2021-03-22 20:23)
ch = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
num = '0123456789'
# ASCII letters and digits; every other character separates words.
_ALNUM = frozenset(ch + ch.lower() + num)


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    o = 0
    for i, c in enumerate(w):
        o += ord(c) * (37 ** i)
    return o % M


def split_line(line):
    """Return the lower-cased words of a single line.

    Words are collected per line, so a word ending one line can never be
    glued to the first word of the next line.
    """
    f = ''.join(n if n in _ALNUM else ' ' for n in line)
    return f.lower().split()


def main():
    """Prompt the user and print the statistics and the sorted BoW."""
    x = input('File name = ')
    ccount = acount = lcount = 0
    k = []
    with open(x, "r") as infile:
        for line in infile:
            lcount += 1
            ccount += len(line.rstrip('\n'))
            acount += sum(1 for a in line if a in _ALNUM)
            k += split_line(line)
    wcount = len(k)
    with open('stopwords.txt', 'r') as file:
        d = set(file.read().lower().split())
    p = [u for u in k if u not in d]
    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a != 'Y' and a != 'y' and a != 'n' and a != 'N':
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    if a == 'Y' or a == 'y':
        m = int(input('M = '))
        p = [fhash(i, m) for i in p]
    counts = {}
    for i in p:
        counts[i] = counts.get(i, 0) + 1
    BOW = sorted([key, n] for key, n in counts.items())
    print('-------------------')
    print('char count = ', ccount)
    print('alphanumeric count = ', acount)
    print('line count = ', lcount)
    print('word count = ', wcount)
    print('BoW = ', BOW)


if __name__ == '__main__':
    main()
# 6230444321 (20.00) 13 (2021-03-22 23:15)
# The ASCII digits and letters that make up words.
arabic = list(
    "1234567890"
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
)
_ARABIC = frozenset(arabic)


def _load_stopwords(path="stopwords.txt"):
    """Return the lower-cased tokens of ``path`` as a set."""
    with open(path, "r") as f:
        return set(f.read().lower().split())


def _tally(keys):
    """Return the sorted ``[[key, count], ...]`` list for ``keys``."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def _all_words(n):
    """Return the lower-cased alphanumeric runs of the text ``n``."""
    copy_n = "".join(c if c in _ARABIC else " " for c in n)
    return copy_n.lower().split()


def onlyword(n, stop_words=None):
    """Return the sorted non-stop words of the text ``n``.

    ``stop_words`` defaults to the contents of ``stopwords.txt``. Short
    words are kept; only listed stop words are removed.
    """
    if stop_words is None:
        stop_words = _load_stopwords()
    stops = set(stop_words)
    return sorted(w for w in _all_words(n) if w not in stops)


def Bow(n, stop_words=None):
    """Return the sorted ``[[word, count], ...]`` bag of the text ``n``.

    Returns an empty list when no word survives the stop-word filter.
    """
    return _tally(onlyword(n, stop_words))


def fhash(w, M, stop_words=None):
    """Return the hashed bag ``[[hash, count], ...]`` of the text ``w``.

    Each non-stop word is hashed with ``sum(ord(c) * 37**i) % int(M)``.
    """
    modulus = int(M)
    hashes = [
        sum(ord(c) * (37 ** x) for x, c in enumerate(word)) % modulus
        for word in onlyword(w, stop_words)
    ]
    return _tally(hashes)


def charcount(n):
    """Return the number of characters in ``n``, newlines excluded."""
    return len(n) - n.count("\n")


def alphacount(n):
    """Return the number of ASCII letters and digits in ``n``."""
    return sum(1 for c in n if c in _ARABIC)


def wordcount(n):
    """Return the number of alphanumeric runs in ``n``."""
    return len(_all_words(n))


def linecount(n):
    """Return the number of lines in the text ``n``.

    A trailing newline ends the last line rather than starting a new one.
    """
    c = n.count("\n")
    if n and not n.endswith("\n"):
        c += 1
    return c


def main():
    """Prompt the user and print the statistics and the BoW."""
    n = input("File name = ")
    n1 = input("Use feature hashing ? (y,Y,n,N) ")
    while n1 not in ["y", "Y", "n", "N"]:
        print("Try again.")
        n1 = input("Use feature hashing ? (y,Y,n,N) ")
    m = int(input("M = ")) if n1 in ("y", "Y") else None
    with open(n, "r") as a:
        k = a.read()
    stops = _load_stopwords()
    print("-------------------")
    print("char count = " + str(charcount(k)))
    print("alphanumeric count = " + str(alphacount(k)))
    print("line count = " + str(linecount(k)))
    print("word count = " + str(wordcount(k)))
    if m is None:
        print("BoW = " + str(Bow(k, stops)))
    else:
        print("BoW = " + str(fhash(k, m, stops)))


if __name__ == "__main__":
    main()
# 6230585121 (29.00) 14 (2021-03-22 22:06)
# ASCII letters and digits; every other character separates words.
_ALNUM = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
)


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % int(M)``."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch) * (37 ** i)
    return total % int(M)


def alphanum_count(a):
    """Return the number of ASCII letters and digits in ``a``."""
    return sum(1 for ch in a if ch in _ALNUM)


def word_list(a):
    """Return the alphanumeric runs of ``a`` with their case preserved."""
    b = ''.join(ch if ch in _ALNUM else ' ' for ch in a)
    return b.split()


def _tally(keys):
    """Return the sorted ``[[key, count], ...]`` list for ``keys``."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt the user and print the statistics and the BoW."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    # A tuple test rejects the empty answer, which `in 'yYnN'` accepted.
    while fh not in ('y', 'Y', 'n', 'N'):
        print("Try again.")
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    M = input('M = ') if fh in ('y', 'Y') else None
    print('-------------------')
    with open("stopwords.txt", "r") as file:
        stop_w = set(file.read().split())
    char_c = alpha_c = line_c = word_c = 0
    list_w = []
    with open(file_name, "r") as infile:
        for line in infile:
            x = line.rstrip('\n')
            char_c += len(x)
            alpha_c += alphanum_count(x)
            line_c += 1
            b = word_list(x)
            word_c += len(b)
            list_w += [d.lower() for d in b]
    print('char count = ', char_c)
    print('alphanumeric count = ', alpha_c)
    print('line count = ', line_c)
    print('word count = ', word_c)
    new_list = [w for w in list_w if w not in stop_w]
    if M is None:
        print("BoW = ", _tally(new_list))
    else:
        print("BoW = ", _tally(fhash(w, M) for w in new_list))


if __name__ == '__main__':
    main()
# 6231004021 (24.00) 15 (2021-03-21 22:24)
def make_words(s):
    """Yield the pieces of ``s`` between non-alphanumeric characters.

    Adjacent separators produce empty strings; callers filter those out.
    """
    pos = 0
    for i, c in enumerate(s):
        if not c.isalnum():
            yield s[pos:i]
            pos = i + 1
    yield s[pos:]


def fhash(w, m):
    """Return ``sum(ord(w[i]) * 37**i) % m``, reducing at every step."""
    g = 37
    res = 0
    for i, ch in enumerate(w):
        res = (res + ord(ch) * (g ** i)) % m
    return res


def count_bow(bow, word):
    """Increment ``word`` in ``bow`` (a list of ``[key, count]``) in place.

    A missing word is appended with a count of 1. Returns ``bow``.
    """
    for entry in bow:
        if entry[0] == word:
            entry[1] += 1
            return bow
    bow.append([word, 1])
    return bow


def main():
    """Prompt the user and print the statistics and the sorted BoW."""
    file_name = input('File name = ')
    with open(file_name, 'r') as text_file:
        txt = text_file.read()
    with open('stopwords.txt', 'r') as stop_file:
        stop_words = set(stop_file.read().split())
    while True:
        answer = input('Use feature hashing ? (y,Y,n,N) ')
        if answer in ['y', 'Y', 'n', 'N']:
            break
        print('Try again.')
    use_hash = answer in ['y', 'Y']
    if use_hash:
        m = int(input('M = '))
    print('-------------------')
    # Line breaks are non-alphanumeric, so they already separate words.
    words = [word.lower() for word in make_words(txt) if len(word) > 0]
    counts = {}
    for word in words:
        if word in stop_words:
            continue
        key = fhash(word, m) if use_hash else word
        counts[key] = counts.get(key, 0) + 1
    # A dict tally also copes with an empty bag, where indexing the first
    # hashed entry used to raise IndexError.
    bow = sorted([key, n] for key, n in counts.items())
    print('char count =', sum(1 for c in txt if c != '\n'))
    print('alphanumeric count =', sum(1 for c in txt if c.isalnum()))
    print('line count =', len(txt.splitlines()))
    print('word count =', len(words))
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6231008621 (30.00) 16 (2021-03-22 09:49)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` using modular arithmetic.

    Every intermediate value is reduced modulo ``M`` so that no large
    powers of 37 are ever built.
    """
    G = 37 % M
    s = 0
    g = 1
    for ch in w:
        s = (s + ((ord(ch) % M) * g)) % M
        g = (g * G) % M
    return s


def splitWords(line):
    """Return the lower-cased alphanumeric runs of ``line``."""
    out = []
    buffer = ""
    for c in line.lower():
        if c.isalnum():
            buffer += c
        elif buffer != "":
            out.append(buffer)
            buffer = ""
    if buffer != "":
        out.append(buffer)
    return out


def getStopWords():
    """Return the distinct words of ``stopwords.txt`` in file order."""
    words = []
    seen = set()
    with open("stopwords.txt", "r") as fp:
        for line in fp:
            for w in splitWords(line):
                if w not in seen:
                    seen.add(w)
                    words.append(w)
    return words


def _scan(fp, stopWords, keyOf):
    """Print the four counts for ``fp`` and return a ``{key: count}`` tally.

    ``keyOf`` maps each non-stop word to its bag key. This is the shared
    counting loop behind :func:`readFile` and :func:`readFileHash`.
    """
    stops = set(stopWords)
    charCount = alnumCount = lineCount = wordCount = 0
    counts = {}
    for line in fp:
        charCount += (len(line) - 1) if line[-1:] == "\n" else len(line)
        lineCount += 1
        for w in splitWords(line):
            alnumCount += len(w)
            wordCount += 1
            if w in stops:
                continue
            key = keyOf(w)
            counts[key] = counts.get(key, 0) + 1
    print("char count =", charCount)
    print("alphanumeric count =", alnumCount)
    print("line count =", lineCount)
    print("word count =", wordCount)
    return counts


def readFile(fp, stopWords):
    """Print the statistics of ``fp`` and return its sorted word bag.

    ``fp`` is an iterable of lines. The result is ``[[word, count], ...]``
    sorted by word, matching the ordering of the hashed variant.
    """
    counts = _scan(fp, stopWords, lambda w: w)
    return sorted([w, n] for w, n in counts.items())


def readFileHash(fp, stopWords, M):
    """Print the statistics of ``fp`` and return its hashed bag.

    The result is ``[[hash, count], ...]`` in increasing hash order.
    """
    counts = _scan(fp, stopWords, lambda w: fhash(w, M))
    return sorted([h, n] for h, n in counts.items())


def main():
    """Prompt the user and print the report."""
    filename = input("File name = ")
    temp = input("Use feature hashing ? (y,Y,n,N) ")
    while temp not in ("y", "Y", "n", "N"):
        print("Try again.")
        temp = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if temp in ("y", "Y") else None
    stopWords = getStopWords()
    with open(filename, "r") as fp:
        if M is None:
            bow = readFile(fp, stopWords)
        else:
            bow = readFileHash(fp, stopWords, M)
    print("BoW =", bow)


if __name__ == "__main__":
    main()
# 6231012021 (30.00) 17 (2021-03-22 22:51)
import re

_KEEP = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def BoW_N(x):
    """Return [word, count] pairs for the words in x, in first-seen order.

    Characters other than ASCII letters and digits are dropped from each word
    before it is counted.
    """
    counts = {}
    for raw in x:
        word = ''.join(c for c in raw if c in _KEEP)
        counts[word] = counts.get(word, 0) + 1
    return [[word, n] for word, n in counts.items()]


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(c) * 37 ** i for i, c in enumerate(w)) % M


def all_fhash(lst, M):
    """Return [hash, count] pairs for the words in lst, ordered by hash."""
    counts = {}
    for word in lst:
        h = fhash(word, M)
        counts[h] = counts.get(h, 0) + 1
    return [[h, counts[h]] for h in sorted(counts)]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    use_hash = input('Use feature hashing ? (y,Y,n,N) ')
    while use_hash not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        use_hash = input('Use feature hashing ? (y,Y,n,N) ')

    with open(file_name) as file:
        lines = file.readlines()
    # Only the line terminator is excluded; leading/trailing blanks count.
    char_count = sum(len(line.rstrip('\n')) for line in lines)
    word = re.findall('[a-z0-9]+', ''.join(lines).lower())
    alnum_count = sum(len(w) for w in word)

    with open('stopwords.txt') as stopfile:
        stopword = set(stopfile.read().lower().split())
    no_stopword = sorted(w for w in word if w not in stopword)

    if use_hash in ('y', 'Y'):
        M = int(input('M = '))
        bow = all_fhash(no_stopword, M)
    else:
        bow = BoW_N(no_stopword)
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alnum_count)
    print('line count =', len(lines))
    print('word count =', len(word))
    print('BoW =', bow)


if __name__ == "__main__":
    main()
# 6231019521 (19.00) 18 (2021-03-22 15:34)
Alpha = list('abcdefghijklmnopqrstuvwxyz')
Number = list('0123456789')  # the digit '0' was missing from this list
stopwo = []  # stop words; filled by main() before BoWN() is used


def count(infile):
    """Collect the text statistics of the lines yielded by infile.

    Returns (charcount, alphanum, linecount, wordcount, word_use) where
    word_use lists every lower-cased word in reading order.  Any character
    outside Alpha/Number separates words.
    """
    charcount = linecount = alphanum = 0
    word_use = []
    for line in infile:
        charcount += len(line.rstrip('\n'))  # only the newline is excluded
        linecount += 1
        spaced = ''
        for c in line.lower():
            if c in Alpha or c in Number:
                alphanum += 1
                spaced += c
            else:
                spaced += ' '
        word_use += spaced.split()
    return charcount, alphanum, linecount, len(word_use), word_use


def BowY(e):
    """Return [item, count] pairs for the items of e, in first-seen order."""
    tally = {}
    for item in e:
        tally[item] = tally.get(item, 0) + 1
    return [[item, n] for item, n in tally.items()]


def BoWN(worduse):
    """Drop stop words from worduse and count what is left.

    Returns (bown, use): use is the cleaned lower-case word list and bown the
    [word, count] pairs built from it.
    """
    use = []
    for raw in worduse:
        lowered = raw.lower()
        if lowered in stopwo:
            continue
        w = ''.join(c for c in lowered if c in Alpha or c in Number)
        if w:
            use.append(w)
    return BowY(use), use


def flash(w, M):
    """Return sum(ord(w[i].lower()) * 37**i) modulo int(M)."""
    G = 37
    ans = 0
    for i, c in enumerate(w):
        ans += ord(c.lower()) * (G ** i)
    return ans % int(M)


def forbowy(use, M):
    """Return the hash of every word in use."""
    return [flash(c, M) for c in use]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    with open(file_name) as infile, open('stopwords.txt') as stop:
        stopwo[:] = stop.read().lower().split()
        feature = input('Use feature hashing ? (y,Y,n,N) ')
        while feature.upper() != 'Y' and feature.upper() != 'N':
            print('Try again.')
            feature = input('Use feature hashing ? (y,Y,n,N) ')
        hashed = feature.upper() == 'Y'
        if hashed:
            M = input('M = ')
        print('-------------------')
        x = count(infile)
    bown, use = BoWN(x[4])
    print('char count =', x[0])
    print('alphanumeric count =', x[1])
    print('line count =', x[2])
    print('word count =', x[3])
    if hashed:
        print('BoW =', BowY(forbowy(use, M)))
    else:
        print('BoW =', bown)


if __name__ == "__main__":
    main()
# 6231205921 (24.85) 19 (2021-03-22 11:42)
import re


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(c) * 37 ** i for i, c in enumerate(w)) % m


def split_words(text):
    """Return the lower-cased runs of ASCII letters and digits in text."""
    return [w.lower() for w in re.findall('[A-Za-z0-9]+', text)]


def tally(items):
    """Return [value, count] pairs for items, ordered by value."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[value, counts[value]] for value in sorted(counts)]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    while b not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = b in ('y', 'Y')
    if use_hash:
        m = int(input('M = '))

    # A set gives whole-word membership; the original substring test on one
    # long string wrongly removed words such as "he" because of "the".
    with open("stopwords.txt", 'r') as fin:
        stop = set(fin.read().lower().split())
    with open(file_name, 'r') as file:
        text = file.read()

    words = split_words(text)
    line_count = text.count('\n')
    if text and not text.endswith('\n'):
        line_count += 1  # last line without a terminating newline

    print('-' * 19)
    print('char count =', len(text) - text.count('\n'))
    print('alphanumeric count =', sum(len(w) for w in words))
    print('line count =', line_count)
    print('word count =', len(words))

    kept = [w for w in words if w not in stop]
    if use_hash:
        kept = [fhash(w, m) for w in kept]
    print('BoW =', tally(kept))


if __name__ == "__main__":
    main()
# 6231207121 (30.00) 20 (2021-03-22 03:12)
file_name = None  # path of the input file; set by main()
FH = False        # True when feature hashing was requested; set by main()
M_ = None         # number of hash buckets when FH is True; set by main()


def FHashOrNot():
    """Ask until the user answers y/Y/n/N; return True for yes."""
    while True:
        p1 = input("Use feature hashing ? (y,Y,n,N) ")
        if p1 in ("y", "Y"):
            return True
        if p1 in ("n", "N"):
            return False
        print("Try again.")


def split_words(text):
    """Return the lower-cased runs of ASCII letters and digits in text."""
    spaced = ""
    for e in text:
        if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9":
            spaced += e.lower()
        else:
            spaced += " "
    return spaced.split()


def STWRead():
    """Return the words listed in stopwords.txt."""
    with open("stopwords.txt", "r") as fin:
        return split_words(fin.read())


def FILRead(p):
    """Read the input file and return the view selected by p.

    "PURE" is the raw text, "LST" the list of words, "STR" the words joined
    without separators.  Any other selector yields None.
    """
    with open(file_name, "r") as fin:
        PURE = fin.read()
    if p == "PURE":
        return PURE
    FIL_lst = split_words(PURE)
    if p == "LST":
        return FIL_lst
    if p == "STR":
        return "".join(FIL_lst)
    return None


def CharCount():
    """Print the number of characters, newlines excluded."""
    text = FILRead("PURE")
    print("char count =", len(text) - text.count("\n"))


def AlphCount():
    """Print the number of ASCII letters and digits."""
    print("alphanumeric count =", len(FILRead("STR")))


def LineCount():
    """Print the number of lines; an empty file has zero lines."""
    text = FILRead("PURE")
    n_line = text.count("\n")
    # A final line that lacks its newline still counts.  Checking the text
    # itself avoids the NameError the loop variable caused on an empty file.
    if text and not text.endswith("\n"):
        n_line += 1
    print("line count =", n_line)


def WordCount():
    """Print the number of words, stop words included."""
    print("word count =", len(FILRead("LST")))


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    r = 0
    for i, c in enumerate(w):
        r += ord(c) * G ** i
    return r % M


def BOW():
    """Print the bag of words (or of hashes when FH is set), ordered by key."""
    STW = set(STWRead())
    kept = [e for e in FILRead("LST") if e not in STW]
    if FH:
        kept = [fhash(e, M_) for e in kept]
    counts = {}
    for e in sorted(kept):
        counts[e] = counts.get(e, 0) + 1
    print("BoW =", [[key, n] for key, n in counts.items()])


def main():
    """Read the inputs and print the full report."""
    global file_name, FH, M_
    file_name = str(input("File name = "))
    FH = FHashOrNot()
    if FH:
        M_ = int(input("M = "))
    print("-------------------")
    CharCount()
    AlphCount()
    LineCount()
    WordCount()
    BOW()


if __name__ == "__main__":
    main()
# 6231214521 (24.95) 21 (2021-03-22 18:30)
import re


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    ans = 0
    for i, c in enumerate(w):
        ans += ord(c) * (G ** i)
    return ans % M


def split_words(text):
    """Return the lower-cased runs of ASCII letters and digits in text."""
    return re.findall("[a-z0-9]+", text.lower())


def count_in_order(items):
    """Return [item, count] pairs for items, in first-seen order."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input("File name = ")
    feature = input("Use feature hashing ? (y,Y,n,N) ").upper()
    while feature not in ("Y", "N"):
        print("Try again.")
        feature = input("Use feature hashing ? (y,Y,n,N) ").upper()
    if feature == "Y":
        M = int(input("M = "))
    print("-" * 19)

    with open(file_name, "r") as fn:
        lines = [line.rstrip("\n") for line in fn]
    print("char count =", sum(len(line) for line in lines))

    # Splitting on every non-alphanumeric character; the original fixed
    # punctuation list left symbols such as '!' or '?' glued to the words.
    words = []
    for line in lines:
        words += split_words(line)
    print("alphanumeric count =", sum(len(w) for w in words))
    print("line count =", len(lines))
    print("word count =", len(words))

    with open("stopwords.txt", "r") as stop_file:
        stword = set(stop_file.read().lower().split())
    out_stword = [w for w in words if w not in stword]

    if feature == "Y":
        out_stword = [fhash(w, M) for w in out_stword]
    print("BoW =", count_in_order(out_stword))


if __name__ == "__main__":
    main()
# 6231220221 (26.85) 22 (2021-03-21 22:35)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    a = 0
    for i, c in enumerate(w):
        a += ord(c) * (37 ** i)
    return a % int(M)


def split_words(text):
    """Return the runs of ASCII letters and digits in text, case preserved."""
    spaced = ''
    for c in text:
        if 'a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9':
            spaced += c
        else:
            spaced += ' '
    return spaced.split()


def _read(file):
    """Return the whole content of the file at path `file`."""
    with open(file) as a:
        return a.read()


def char_count(file):
    """Return the number of characters in the file, newlines excluded."""
    text = _read(file)
    return len(text) - text.count('\n')


def alphanumeric_count(file):
    """Return the number of ASCII letters and digits in the file."""
    return sum(len(w) for w in split_words(_read(file)))


def line_count(file):
    """Return the number of lines in the file."""
    with open(file) as a:
        return sum(1 for _ in a)


def word_count(file):
    """Return the number of words in the file, stop words included."""
    return len(split_words(_read(file)))


def BoW(file, YN, m=None):
    """Return the bag of words of the file as sorted [key, count] pairs.

    With YN == 'N' the keys are lower-case words, otherwise they are hash
    values.  m is the bucket count for hashing; when it is omitted the
    module-level name M is used, as the original code did.
    """
    stop = set(_read('stopwords.txt').lower().split())
    kept = [w.lower() for w in split_words(_read(file))]
    kept = [w for w in kept if w not in stop]
    if YN != 'N':
        if m is None:
            m = M  # noqa: F821 - legacy global fallback
        kept = [fhash(w, m) for w in kept]
    counts = {}
    for key in kept:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Read the inputs and print the full report."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
    while fh not in ('Y', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
    m = int(input('M = ')) if fh == 'Y' else None
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alphanumeric_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', word_count(file_name))
    print('BoW =', BoW(file_name, fh, m))


if __name__ == "__main__":
    main()
# 6231222521 (20.10) 23 (2021-03-22 16:35)
def fhash(s, M):
    """Return sum(ord(s[i]) * 37**i) modulo M."""
    q = 0
    for i, c in enumerate(s):
        q += ord(c) * (37 ** i)
    return q % M


an = "abcdefghijklmnopqrstuvwxyz"
an += an.upper()
an += "0123456789"


def split_words(line):
    """Return the lower-cased words of line; any char outside `an` separates."""
    spaced = "".join(c if c in an else " " for c in line)
    return spaced.lower().split()


def hash_bow(bow, M):
    """Merge [word, count] pairs into [hash, count] pairs ordered by hash.

    The bucket list has M entries; the original fixed size of 37 raised
    IndexError for any larger M.
    """
    buckets = [0] * M
    for word, n in bow:
        buckets[fhash(word, M)] += n
    return [[i, n] for i, n in enumerate(buckets) if n != 0]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input("File name = ").strip()
    while True:
        yn = input("Use feature hashing ? (y,Y,n,N) ").strip()
        if yn.lower() in ("y", "n"):
            break
        print("Try again.")

    with open("stopwords.txt", "r") as f2:
        stopwords = set(f2.read().lower().split())
    if yn.lower() == "y":
        M = int(input("M = "))

    char_count = alphanum_count = line_count = word_count = 0
    counts = {}
    with open(file_name, "r") as file:
        for line in file:
            line = line.rstrip("\n")
            line_count += 1
            char_count += len(line)
            for w in split_words(line):
                alphanum_count += len(w)
                word_count += 1  # counted while reading, before printing
                counts[w] = counts.get(w, 0) + 1

    print("char count =", char_count)
    print("alphanumeric count =", alphanum_count)
    print("line count =", line_count)
    print("word count =", word_count)

    bow = sorted([w, n] for w, n in counts.items() if w not in stopwords)
    if yn.lower() == "y":
        bow = hash_bow(bow, M)
    print("BoW =", bow)


if __name__ == "__main__":
    main()
# 6231223121 (30.00) 24 (2021-03-22 12:01)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    a = 0
    for i, c in enumerate(w):
        a += ord(c) * (37 ** i)
    return a % int(M)


def split_words(text):
    """Return the lower-cased runs of a-z / 0-9 characters in text."""
    spaced = ''
    for x in text.lower():
        if 'a' <= x <= 'z' or '0' <= x <= '9':
            spaced += x
        else:
            spaced += ' '
    return spaced.split()


def tally(items):
    """Return [item, count] pairs for items, in first-seen order."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    bow = input('Use feature hashing ? (y,Y,n,N) ')
    # Exact membership: the old substring test against 'y,Y,n,N' accepted
    # answers such as '' or ',' and then printed nothing at all.
    while bow not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        bow = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = bow in ('y', 'Y')
    if use_hash:
        M = input('M = ')
    print('-' * 19)

    with open('stopwords.txt') as f:
        stop = set(f.read().lower().split())
    with open(file_name) as f1:
        text = f1.read()

    sam = split_words(text)
    line_count = text.count('\n')
    if text and not text.endswith('\n'):
        line_count += 1
    print('char count = ' + str(len(text) - text.count('\n')))
    print('alphanumeric count = ' + str(sum(len(t) for t in sam)))
    print('line count = ' + str(line_count))
    print('word count = ' + str(len(sam)))

    out = [w for w in sam if w not in stop]
    if use_hash:
        # Hashes stay integers; turning them into strings split multi-digit
        # values into single digits.
        BoW = tally(sorted(fhash(w, M) for w in out))
    else:
        BoW = tally(out)
    print('BoW = ' + str(BoW))


if __name__ == "__main__":
    main()
# 6231510221 (21.40) 25 (2021-03-22 11:23)
#Prog-08: Bag-of-words
#6231510221 (21.40) Pleumpiti Pholphakwaen
alpha = 'abcdefghijklmnopqrstuvwxyz0123456789'
stop_word = []  # stop words; filled by main() before convert() is used


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum([ord(c) * 37 ** i for i, c in enumerate(w)]) % m


def convert(words, m, hash):
    """Drop stop words from words; hash the rest with m buckets when hash is set."""
    result = []
    for word in words:
        if word in stop_word:
            continue
        if hash:
            word = fhash(word, m)
        result.append(word)
    return result


def get_unique(words):
    """Return [word, count] pairs for words, in first-seen order."""
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return [[word, n] for word, n in counts.items()]


def split_lines(lines):
    """Return (char count, alphanumeric count, words) for lower-cased lines."""
    count_char = 0
    count_alpha = 0
    words = []
    for line in lines:
        word = ''
        for c in line:
            if c in alpha:
                count_alpha += 1
                word += c
            elif word != '':
                words.append(word)
                word = ''
            count_char += 1
        # A line break ends a word too; without this flush the last word of
        # a line was merged with the first word of the next one.
        if word != '':
            words.append(word)
    return count_char, count_alpha, words


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    with open('stopwords.txt') as stop_file:
        stop_word[:] = stop_file.read().lower().split()

    m = 0
    file_name = input('File name = ')
    with open(file_name) as source:
        file = source.read().lower().splitlines()
    use_hash = False
    while True:
        command = input('Use feature hashing ? (y,Y,n,N) ')
        if command.lower() == 'y':
            use_hash = True
            m = int(input('M = '))
            break
        elif command.lower() == 'n':
            break
        else:
            print('Try again.')
    print('-' * 19)

    count_char, count_alpha, words = split_lines(file)
    print('char count =', count_char)
    print('alphanumeric count =', count_alpha)
    print('line count =', len(file))
    print('word count =', len(words))
    print('BoW =', get_unique(convert(words, m, use_hash)))


if __name__ == "__main__":
    main()
# 6231511921 (26.55) 26 (2021-03-22 19:24)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    output = 0
    for i, c in enumerate(w):
        output += ord(c) * (37 ** i)
    return output % int(M)


def to_word(w):
    """Return every lower-cased run of ASCII letters/digits in w, in order.

    Repeated words are all kept; the earlier version dropped the final word
    whenever it had already appeared in the same text.
    """
    output = []
    word = ""
    for i in w:
        p = i.lower()
        if len(p) == 1 and p in "abcdefghijklmnopqrstuvwxyz0123456789":
            word += p
        elif word != "":
            output.append(word)
            word = ""
    if word != "":
        output.append(word)
    return output


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input("File name = ")
    while True:
        x = input("Use feature hashing ? (y,Y,n,N) ")
        # Tuple membership: with a plain string, '' and 'yY' were accepted.
        if x in ("y", "Y", "n", "N"):
            break
        print("Try again.")
    use_hash = x in ("y", "Y")
    if use_hash:
        M = input("M = ")

    with open("stopwords.txt", "r") as f1:
        stop_word = set(f1.read().lower().split())

    char_c = alpha_c = line_c = word_c = 0
    counts = {}
    with open(file_name, "r") as f2:
        for line in f2:
            char_c += len(line.rstrip("\n"))  # only the newline is excluded
            line_c += 1
            for j in to_word(line):
                alpha_c += len(j)
                word_c += 1
                if j not in stop_word:
                    counts[j] = counts.get(j, 0) + 1

    print("-------------------")
    print("char count =", char_c)
    print("alphanumeric count =", alpha_c)
    print("line count =", line_c)
    print("word count =", word_c)
    if use_hash:
        hashed = {}
        for word, n in counts.items():
            q = fhash(word, M)
            hashed[q] = hashed.get(q, 0) + n
        print("BoW =", sorted([q, n] for q, n in hashed.items()))
    else:
        print("BoW =", sorted([word, n] for word, n in counts.items()))


if __name__ == "__main__":
    main()
# 6231707621 (24.45) 27 (2021-03-20 16:55)
_ALNUM = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def save(x):
    """Return the lines of x with every symbol replaced by a space.

    Letters, digits and whitespace (including the newline) are kept, so each
    line keeps its original length.
    """
    data = []
    for line in x:
        data.append(''.join(e if e in _ALNUM or e.isspace() else ' ' for e in line))
    return data


def char(x):
    """Return the number of characters in the lines of x, newlines excluded."""
    return sum(len(e) - e.count('\n') for e in x)


def alphanumeric(x):
    """Return the number of ASCII letters and digits in the lines of x."""
    return sum(1 for e in x for c in e if c in _ALNUM)


def word(x):
    """Return the sorted list of whitespace-separated words in the lines of x."""
    s = []
    for e in x:
        s.extend(e.split())
    s.sort()
    return s


def count(z):
    """Return [value, run length] for each run of equal neighbours in z.

    z is left untouched; on a sorted list the result is the frequency table.
    """
    runs = []
    for item in z:
        if runs and runs[-1][0] == item:
            runs[-1][1] += 1
        else:
            runs.append([item, 1])
    return runs


def bow(x, m):
    """Return the bag of words for the words in x, ordered by key.

    Stop words from stopwords.txt are removed.  With m None the keys are
    lower-case words, otherwise hash values in range(int(m)).
    """
    with open('stopwords.txt') as f:
        stopwords = {e.lower() for e in word(save(f))}
    # Lower-case before sorting so that 'The' and 'the' land in the same run.
    z = sorted(e.lower() for e in x if e.lower() not in stopwords)
    if m is None:
        return count(z)
    buckets = int(m)
    hashed = [sum(ord(c) * 37 ** i for i, c in enumerate(e)) % buckets for e in z]
    return count(sorted(hashed))


def main():
    """Read the inputs and print the full report."""
    a = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    while b not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    m = None if b in ['n', 'N'] else input('M = ')
    print('-------------------')
    with open(a, 'r') as source:
        data = save(source)
    words = word(data)
    print('char count =', char(data))
    print('alphanumeric count =', alphanumeric(data))
    print('line count =', len(data))
    print('word count =', len(words))
    print('BoW =', bow(words, m))


if __name__ == "__main__":
    main()
# 6231709921 (30.00) 28 (2021-03-21 19:38)
def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    f = 0
    for i, c in enumerate(w):
        f += ord(c) * (37 ** i)
    return f % m


def unique(list1):
    """Return the distinct elements of list1 in first-seen order."""
    ul = []
    for e in list1:
        if e not in ul:
            ul.append(e)
    return ul


alp = 'abcdefghijklmnopqrstuvwxyz0123456789'


def split_words(line):
    """Return the lower-cased words of line; any char outside alp separates."""
    sent = ''
    for e in line:
        e = e.lower()
        sent += e if len(e) == 1 and e in alp else ' '
    return sent.split()


def tally_sorted(items):
    """Return [value, count] pairs for items, ordered by value."""
    counts = {}
    for e in items:
        counts[e] = counts.get(e, 0) + 1
    return [[e, counts[e]] for e in sorted(counts)]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    feat = input('Use feature hashing ? (y,Y,n,N) ')
    while feat not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        feat = input('Use feature hashing ? (y,Y,n,N) ')
    if feat in ['y', 'Y']:
        m = int(input('M = '))
    print('-------------------')

    words = []
    lc = cc = 0
    with open(file_name, 'r') as infile:
        for line in infile:
            lc += 1
            # Only the newline is excluded; strip() also discarded leading
            # and trailing blanks from the character count.
            cc += len(line.rstrip('\n'))
            words += split_words(line)
    print('char count = ' + str(cc))
    print('alphanumeric count = ' + str(sum(len(e) for e in words)))
    print('line count = ' + str(lc))
    print('word count = ' + str(len(words)))

    stw = set()
    with open('stopwords.txt', 'r') as instop:
        for line in instop:
            stw.update(split_words(line))
    cutw = [e for e in words if e not in stw]
    if feat in ['y', 'Y']:
        cutw = [fhash(e, m) for e in cutw]
    print('BoW = ' + str(tally_sorted(cutw)))


if __name__ == "__main__":
    main()
# 6231718521 (30.00) 29 (2021-03-22 21:21)
def Bagofword(w):
    """Return [item, count] pairs for the hashable items of w, first-seen order."""
    counts = {}
    for item in w:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def fhash(z, M):
    """Return sum(ord(z[i]) * 37**i) modulo M."""
    f = 0
    for i, s in enumerate(z):
        f += ord(s) * 37 ** i
    return f % M


def split_words(line):
    """Return the lower-cased runs of a-z / 0-9 characters in line."""
    A = ''
    for s in line.lower():
        if 'a' <= s <= 'z' or '0' <= s <= '9':
            A += s
        else:
            A += ' '
    return A.split()


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    t = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while t not in ['y', 'n']:
        print('Try again.')
        t = input('Use feature hashing ? (y,Y,n,N) ').lower()

    with open('stopwords.txt', 'r') as stop:
        S = set(stop.read().lower().split())

    W = []
    al = ch = lc = wc = 0
    with open(file_name, 'r') as file:
        for line in file:
            ch += len(line.rstrip('\n'))  # only the newline is excluded
            lc += 1
            w = split_words(line)
            wc += len(w)
            for e in w:
                al += len(e)
                if e not in S:
                    W.append(e)

    if t == 'y':
        M = int(input('M = '))
        W = [fhash(e, M) for e in W]
    print('-------------------')
    print('char count = ' + str(ch))
    print('alphanumeric count = ' + str(al))
    print('line count = ' + str(lc))
    print('word count = ' + str(wc))
    print('BoW = ' + str(Bagofword(W)))


if __name__ == "__main__":
    main()
# 6330170421 (22.95) 30 (2021-03-21 17:50)
nsw = []  # lower-cased non-stop words of the input file; filled by main()
b2 = []   # hash value of every entry of nsw; filled by main()


def remove_punc(t):
    """Return t with every symbol replaced by a space.

    ASCII letters, digits and whitespace are kept.  Replacing (rather than
    deleting) the symbol keeps "a,b" as two words instead of gluing them.
    """
    out = ""
    for i in t:
        if 'a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9' or i.isspace():
            out += i
        else:
            out += " "
    return out


def bowf(c):
    """Return how many times c occurs in nsw."""
    return nsw.count(c)


def fhashf(c):
    """Return how many times c occurs in b2."""
    return b2.count(c)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    nfh = 0
    for i, c in enumerate(w):
        nfh += ord(c) * (37 ** i)
    return nfh % int(M)


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')

    with open("stopwords.txt", "r") as fn:
        sw = set(fn.read().lower().split())

    char_count = 0
    line_count = 0
    y = []
    with open(file_name, "r") as fs:
        for line in fs:
            line = line.rstrip("\n")
            char_count += len(line)
            line_count += 1
            y += remove_punc(line).split()
    word_count = len(y)
    alpha_count = sum(len(w) for w in y)
    nsw[:] = [w.lower() for w in y if w.lower() not in sw]

    # Exact membership; the substring test accepted '' and ','.
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')

    if fh in ('y', 'Y'):
        M = input('M = ')
        b2[:] = [fhash(w, M) for w in nsw]
        BoW = [[h, fhashf(h)] for h in dict.fromkeys(b2)]
    else:
        BoW = [[w, bowf(w)] for w in dict.fromkeys(nsw)]
    print('-------------------')
    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alpha_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(word_count))
    print('BoW = ' + str(BoW))


if __name__ == "__main__":
    main()
# 6330171021 (15.00) 31 (2021-03-21 13:00)
import re

readFilename = ''  # lower-cased text of the input file; set by main()
readfh = ''        # lower-cased text of stopwords.txt; set by main()
M = None           # number of hash buckets; set by main() when hashing


def _tokens(text):
    """Return the runs of a-z / 0-9 characters in the (lower-cased) text."""
    return re.findall('[a-z0-9]+', text)


#-------------------------------------------
def wordAlpha_count(file_name, text=None):
    """Return (words, alphanumeric count) of text (default: readFilename).

    file_name is accepted for compatibility and not used.  The last word is
    kept even when the text ends on an alphanumeric character.
    """
    words = _tokens(readFilename if text is None else text)
    return words, sum(len(w) for w in words)


#----------------------------------------------
def line_count(file_name, text=None):
    """Return the number of lines of text (default: readFilename)."""
    text = readFilename if text is None else text
    return text.count('\n') + (1 if text and not text.endswith('\n') else 0)


#----------------------------------------------
def char_count(file_name, text=None):
    """Return the number of characters of text, newlines excluded."""
    text = readFilename if text is None else text
    return len(text) - text.count('\n')


#----------------------------------------------
def list_stopword(file_name, text=None):
    """Return the words of text (default: readfh, the stop-word file)."""
    return _tokens(readfh if text is None else text)


#----------------------------------------------
def bow_N(new_word):
    """Return [word, count] pairs for new_word, ordered by word."""
    counts = {}
    for w in new_word:
        counts[w] = counts.get(w, 0) + 1
    return [[w, counts[w]] for w in sorted(counts)]


#----------------------------------------------
def bow_Y(new_word, m=None):
    """Return [hash, count] pairs for new_word in numeric hash order.

    m is the bucket count; when omitted the module-level M is used.
    """
    buckets = M if m is None else m
    counts = {}
    for w in new_word:
        f = fhash(w, buckets)
        counts[f] = counts.get(f, 0) + 1
    # Integers are sorted as integers; sorting their string form put 10
    # before 9.
    return [[f, counts[f]] for f in sorted(counts)]


#-----------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    f = 0
    for i, c in enumerate(w):
        f += ord(c) * (37 ** i)
    return f % M


#----------------main----------------
def main():
    """Read the inputs and print the full report."""
    global readFilename, readfh, M
    with open(input('File name = '), "r") as file_name, \
            open('stopwords.txt', 'r') as fh:
        ufh = input('Use feature hashing ? (y,Y,n,N) ')
        while ufh not in ('y', 'Y', 'n', 'N'):
            print('Try again.')
            ufh = input('Use feature hashing ? (y,Y,n,N) ')
        readFilename = file_name.read().lower()
        readfh = fh.read().lower()

    word, alpha = wordAlpha_count(None)
    stopword = set(list_stopword(None))
    new_word = [r for r in word if r not in stopword]
    if ufh in ('n', 'N'):
        BoW = bow_N(new_word)
    else:
        M = int(input('M = '))
        BoW = bow_Y(new_word)
    #---------------print----------------
    print('-------------------')
    print('char count =', char_count(None))
    print('alphanumeric count =', alpha)
    print('line count =', line_count(None))
    print('word count =', len(word))
    print('BoW =', BoW)


if __name__ == "__main__":
    main()
# 6330172721 (26.00) 32 (2021-03-20 18:42)
def fhash(word, m):
    """Return sum(ord(word[i]) * 37**i) modulo m."""
    ttl = 0
    for i, ch in enumerate(word):
        ttl += ord(ch) * (37 ** i)
    return ttl % m


def bow(_lst, m):  # have fhash
    """Return [hash, count] pairs for the words of _lst, ordered by hash."""
    bow_lst = [0] * m
    for ele in _lst:
        bow_lst[fhash(ele, m)] += 1
    return [[i, n] for i, n in enumerate(bow_lst) if n != 0]


def bow_wrd(_lst):  # no fhash
    """Return [word, count] pairs for the words of _lst, ordered by word."""
    counts = {}
    for ele in _lst:
        counts[ele] = counts.get(ele, 0) + 1
    return sorted([ele, n] for ele, n in counts.items())


def split_words(line):
    """Return the lower-cased runs of a-z / 0-9 characters in line."""
    lne = ''
    for ch in line.lower():
        if len(ch) == 1 and ch in 'abcdefghijklmnopqrstuvwxyz0123456789':
            lne += ch
        else:
            lne += ' '
    return lne.split()


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    wrd_lst = []
    ch_cnt = 0
    lne_cnt = 0
    with open(file_name, "r") as f:
        for line in f:
            # Subtract the newline only when it is there; the last line of a
            # file often has none.
            ch_cnt += len(line) - 1 if line.endswith('\n') else len(line)
            lne_cnt += 1
            wrd_lst += split_words(line)
    wrd_cnt = len(wrd_lst)
    alpnm_cnt = sum(len(w) for w in wrd_lst)

    # filter stopwords
    with open("stopwords.txt", "r") as stp_wrd:
        stp_lst = set(stp_wrd.read().lower().split())
    wrd_lst = [w for w in wrd_lst if w not in stp_lst]

    while True:
        alter = input('Use feature hashing ? (y,Y,n,N) ')
        if alter in ('n', 'N'):
            alter_tmp = 0
            break
        elif alter in ('y', 'Y'):
            alter_tmp = 1
            M = int(input('M = '))
            break
        print("Try again.")

    print('-------------------')
    print("char count =", ch_cnt)
    print("alphanumeric count =", alpnm_cnt)
    print("line count =", lne_cnt)
    print("word count =", wrd_cnt)
    if alter_tmp:
        print("BoW =", bow(wrd_lst, M))
    else:
        print("BoW =", bow_wrd(wrd_lst))


if __name__ == "__main__":
    main()
# 6330173321 (22.20) 33 (2021-03-21 18:57)
def remove(t):
    """Return t with every symbol replaced by a space.

    ASCII letters, digits and whitespace are kept, so a later split() yields
    purely alphanumeric words.
    """
    out = ""
    for e in t:
        if 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9' or e.isspace():
            out += e
        else:
            out += " "
    return out


def fhash(t, m):
    """Return sum(ord(t[i]) * 37**i) modulo int(m).

    The position weight 37**i was missing, which made all anagrams collide
    and disagreed with the hash used by the other solutions.
    """
    re = 0
    for i, c in enumerate(t):
        re += ord(c) * (37 ** i)
    return re % int(m)


def tally(items):
    """Return [value, count] pairs for items, ordered by value."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[value, counts[value]] for value in sorted(counts)]


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    file_name = input("File name = ")
    use_hash = input("Use feature hashing ? (y,Y,n,N) ")
    k = ['y', 'Y', 'n', 'N']
    r = ['y', 'Y']
    while use_hash not in k:
        print("Try again.")
        use_hash = input("Use feature hashing ? (y,Y,n,N) ")
    if use_hash in r:
        m = int(input("M = "))

    with open("stopwords.txt", "r") as file1:
        stopword = set(remove(file1.read()).lower().split())
    with open(file_name, "r") as file2:
        get = [line.rstrip("\n") for line in file2]

    print("-------------------")
    print("char count =", sum(len(i) for i in get))
    word = []
    for i in get:
        word += remove(i).lower().split()
    print("alphanumeric count =", sum(len(i) for i in word))
    print("line count =", len(get))
    print("word count =", len(word))

    remain = [i for i in word if i not in stopword]
    if use_hash in r:
        remain = [fhash(i, m) for i in remain]
    print("BoW =", tally(remain))


if __name__ == "__main__":
    main()
# 6330174021 (18.43) 34 (2021-03-18 14:47)
def is_alpha(n):
    """Return True when the single character n is an ASCII letter or digit."""
    return 'a' <= n <= 'z' or 'A' <= n <= 'Z' or '0' <= n <= '9'


def stopword():
    """Return the list of words in stopwords.txt."""
    with open("stopwords.txt", 'r') as f:
        return f.read().split()


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum([ord(c) * 37 ** i for i, c in enumerate(w)]) % M


def display(c, a, l, w, BoW):
    """Print the report: char, alphanumeric, line and word counts, then BoW."""
    print('-------------------')
    print('char count = ' + str(c))
    print('alphanumeric count = ' + str(a))
    print('line count = ' + str(l))
    print('word count = ' + str(w))
    print('BoW = ' + str(BoW))


def split_words(line):
    """Return the runs of ASCII letters/digits in line, case preserved."""
    words = []
    word = ""
    for c in line:
        if is_alpha(c):
            word += c
        elif word != "":
            words.append(word)
            word = ""
    # The end of the line ends a word too; dropping this flush lost the last
    # word of every line that did not end with punctuation.
    if word != "":
        words.append(word)
    return words


def main():
    """Read the inputs, then print the statistics and the bag of words."""
    stp = set(stopword())
    file_name = input("File name = ")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
    while fh not in ('y', 'Y', 'n', 'N'):
        print("Try again.")
        fh = input("Use feature hashing ? (y,Y,n,N) ")
    if fh in ('Y', 'y'):
        M = int(input("M = "))

    n_cha = 0
    n_line = 0
    words = []
    with open(file_name.strip(), 'r') as f:
        for line in f:
            n_cha += len(line) - 1 if line.endswith('\n') else len(line)
            n_line += 1
            words += split_words(line)
    n_words = len(words)
    n_alnum = sum(len(word) for word in words)

    words = [word.lower() for word in words if word.lower() not in stp]
    if fh in ('Y', 'y'):
        words = [fhash(word, M) for word in words]
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    BoW = sorted([word, n] for word, n in counts.items())
    display(n_cha, n_alnum, n_line, n_words, BoW)


if __name__ == "__main__":
    main()
# 6330176221 (19.05) 35 (2021-03-21 17:46)
#Prog_08: Bag-of-words
#6330176221 (19.05) Natthawut Sapwatthanaphaisan
def flas_h(w, M):
    """Return the feature hash of ``w``: sum(ord(w[i]) * 37**i) mod ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def strr_1(line):
    """Return ``line`` with every non-alphanumeric character replaced by a space."""
    return ''.join(ch if ch.isascii() and ch.isalnum() else ' ' for ch in line)


def stop_words():
    """Return the lower-cased words of the input file that are not stop words.

    Reads the module-level ``inn1`` (input file name) and ``stopwords.txt``.
    """
    with open('stopwords.txt', 'r') as stop:
        s = set(stop.read().split())
    with open(inn1, 'r') as source:
        words = strr_1(source.read().lower()).split()
    return [w for w in words if w not in s]


def BO_W(Bow):
    """Return ``[[item, count], ...]`` for the items of ``Bow``.

    Items occurring once come first (in input order), followed by repeated
    items in order of first appearance.  An empty input yields ``[]``.
    """
    counts = {}
    for item in Bow:
        counts[item] = counts.get(item, 0) + 1
    singles = [[item, 1] for item in Bow if counts[item] == 1]
    repeated = [[item, total] for item, total in counts.items() if total > 1]
    return singles + repeated


def prin_t():
    """Print the four counters held in module-level variables."""
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)


#-------------------------------------
if __name__ == '__main__':
    inn1 = input('File name = ')
    word = []
    char_count = 0
    line_count = 0
    with open(inn1, 'r') as file_name:
        for line in file_name:
            # Count everything except the line terminator, whether or not the
            # final line has one.
            char_count += len(line.rstrip('\n'))
            word += strr_1(line).split()
            line_count += 1
    # strr_1 leaves only letters/digits inside words.
    alphanumeric_count = len(''.join(word))
    word_count = len(word)

    inn2 = input('Use feature hashing ? (y,Y,n,N) ')
    while inn2 not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        inn2 = input('Use feature hashing ? (y,Y,n,N) ')

    if inn2 in ('y', 'Y'):
        M = int(input('M = '))
        prin_t()
        BoW = BO_W([flas_h(i, M) for i in stop_words()])
    else:
        prin_t()
        BoW = BO_W(stop_words())
    print('BoW =', BoW)
# 6330177921 (30.00) 36 (2021-03-21 21:22)
_ALNUM_36 = 'abcdefghijklmnopqrstuvwxyz0123456789'


def fhash(a, m):
    """Return sum(ord(a[i]) * 37**i) mod ``m``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(a)) % m


def _scan(lines):
    """Return ``(char_count, alnum_count, words)`` for an iterable of lines.

    Newlines are not counted as characters.  Characters are lower-cased;
    anything outside a-z/0-9 and every line end acts as a word separator.
    """
    n_char = 0
    n_alnum = 0
    pieces = []
    for line in lines:
        for e in line:
            if e == '\n':
                continue
            low = e.lower()
            n_char += 1
            if low in _ALNUM_36:
                n_alnum += 1
                pieces.append(low)
            else:
                pieces.append(' ')
        pieces.append(' ')
    return n_char, n_alnum, ''.join(pieces).split()


def _sorted_counts(items):
    """Return ``[[item, count], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


if __name__ == '__main__':
    file_name = input('File name = ')
    use = input('Use feature hashing ? (y,Y,n,N) ')
    with open(file_name, 'r') as fn:
        lines = fn.readlines()
    with open('stopwords.txt', 'r') as stw:
        word_stw = set(_scan(stw.readlines())[2])

    while use not in ['n', 'N', 'y', 'Y']:
        print('Try again.')
        use = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = use in ('y', 'Y')
    if hashing:
        m = int(input('M = '))

    char_total, alnum_total, word = _scan(lines)
    print('-------------------')
    print('char count =', char_total)
    print('alphanumeric count =', alnum_total)
    print('line count =', len(lines))
    print('word count =', len(word))

    same = [e for e in word if e not in word_stw]
    if hashing:
        bow = _sorted_counts(fhash(e, m) for e in same)
    else:
        bow = _sorted_counts(same)
    print('BoW =', bow)
# 6330178521 (24.35) 37 (2021-03-21 17:43)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) mod ``M`` (``M`` may be an int or numeric string)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def _is_alnum(ch):
    """True for ASCII letters and digits only.

    A single range ``'A' <= ch <= 'z'`` would also admit ``[ \\ ] ^ _ ` ``,
    which sit between 'Z' and 'a' in ASCII.
    """
    return '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'


def _counts_in_order(items):
    """Return ``[[item, count], ...]`` in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


if __name__ == '__main__':
    file_name = input('File name = ')
    fha = input('Use feature hashing ? (y,Y,n,N) ')
    while fha not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        fha = input('Use feature hashing ? (y,Y,n,N) ')

    linecount = 0
    char = 0
    alpha = 0
    pieces = []
    with open(file_name, 'r') as file:
        for line in file:
            linecount += 1
            for ch in line:
                if ch != '\n':
                    char += 1
                if _is_alnum(ch):
                    alpha += 1
                    pieces.append(ch)
                else:
                    pieces.append(' ')
    words = ''.join(pieces).lower().split()
    wordcount = len(words)

    with open('stopwords.txt', 'r') as file:
        r = set(file.read().lower().split())
    word = [w for w in words if w not in r]

    if fha in ('y', 'Y'):
        M = int(input('M = '))
        word = [fhash(w, M) for w in word]

    print('-------------------')
    print('char count =', char)
    print('alphanumeric count =', alpha)
    print('line count =', linecount)
    print('word count =', wordcount)
    print('BoW =', _counts_in_order(word))
# 6330179121 (17.85) 38 (2021-03-21 17:08)
# Every letter and digit; the previous hand-typed alphabet was missing 'n'.
_ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def _words(file_name):
    """Return the lower-cased alphanumeric runs of each line, in order.

    ``file_name`` is the list of text lines (without newlines).  Line ends
    are word boundaries and a run reaching the end of a line is kept.
    """
    words = []
    for line in file_name:
        current = ''
        for ch in line.lower():
            if ch in _ALNUM:
                current += ch
            elif current:
                words.append(current)
                current = ''
        if current:
            words.append(current)
    return words


def _sorted_counts(items):
    """Return ``[[item, count], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


def _report(bag):
    """Print the counters for the module-level ``file_name`` lines, then ``bag``."""
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alphanumeric_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', word_count(file_name))
    print('BoW =', bag)


def yes():
    """Ask for M and print the report with a hashed bag-of-words."""
    m = int(input('M = '))
    _report(byes(file_name, m))
    return None


def no():
    """Print the report with a plain (word-keyed) bag-of-words."""
    _report(bno(file_name))
    return None


def open_stop():
    """Return the list of tokens in ``stopwords.txt``."""
    with open('stopwords.txt', 'r') as file:
        return file.read().split()


def word_count(file_name):
    """Return the number of alphanumeric words in the list of lines."""
    return len(_words(file_name))


def char_count(file_name):
    """Return the total number of characters in the list of lines."""
    return sum(len(line) for line in file_name)


def line_count(file_name):
    """Return the number of lines."""
    return len(file_name)


def alphanumeric_count(file_name):
    """Return the number of ASCII letters and digits in the list of lines."""
    return sum(len(word) for word in _words(file_name))


def byes(file_name, m):
    """Return the sorted ``[hash, count]`` bag of the non-stop-words."""
    stop = set(open_stop())
    return _sorted_counts(fhash(w, m) for w in _words(file_name) if w not in stop)


def bno(file_name):
    """Return the sorted ``[word, count]`` bag of the non-stop-words."""
    stop = set(open_stop())
    return _sorted_counts(w for w in _words(file_name) if w not in stop)


def fhash(word, m):
    """Return sum(ord(word[i]) * 37**i) mod ``m``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % m


if __name__ == '__main__':
    x = str(input('File name = '))
    with open(x, 'r') as file:
        file_name = file.read().split('\n')
    if file_name and file_name[-1] == '':
        # A text ending in '\n' (or an empty file) leaves a phantom last
        # element after split(); it is not a real line.
        file_name.pop()

    while True:
        has = str(input('Use feature hashing ? (y,Y,n,N) '))
        if has == 'Y' or has == 'y':
            ans = yes()
            break
        elif has == 'N' or has == 'n':
            ans = no()
            break
        print('Try again.')
# 6330180721 (30.00) 39 (2021-03-18 17:39)
#Prog-08: Bag-of-words
#6330180721 (30.00) Nichakul Pichitwutikorn
def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) mod ``m``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def num(lis, word):
    """Return how many elements of ``lis`` equal ``word``."""
    return sum(1 for t in lis if t == word)


def cut_repeat(listt):
    """Return the elements of ``listt`` without duplicates, keeping first-seen order."""
    qr = []
    for e in listt:
        if e not in qr:
            qr.append(e)
    return qr


def _sorted_counts(items):
    """Return ``[[item, count], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


if __name__ == '__main__':
    file_name = input('File name = ')
    h = input('Use feature hashing ? (y,Y,n,N) ')
    while h not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        h = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = h in ('y', 'Y')
    if hashing:
        m = int(input('M = '))

    chars = 0
    al = 0
    l = 0
    pieces = []
    with open(file_name, 'r') as book:
        for line in book:
            l += 1
            for i in line:
                if i != '\n':
                    # Count the characters directly; deriving the total from
                    # the line count is off by one when the file ends in '\n'.
                    chars += 1
                if 'a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9':
                    al += 1
                    pieces.append(i)
                else:
                    pieces.append(' ')
    sen = ''.join(pieces).lower().split()

    with open('stopwords.txt', 'r') as stop:
        st = set(stop.read().lower().split())
    bow = [p for p in sen if p not in st]

    print('-------------------')
    print('char count =', chars)
    print('alphanumeric count =', al)
    print('line count =', l)
    print('word count =', len(sen))
    if hashing:
        print('BoW =', _sorted_counts(fhash(j, m) for j in bow))
    else:
        print('BoW =', _sorted_counts(bow))
# 6330181321 (21.05) 40 (2021-03-22 23:49)
def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) mod ``m``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def _counts_in_order(items):
    """Return ``[[item, count], ...]`` in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


if __name__ == '__main__':
    file_name = input('File name = ')
    x = input('Use feature hashing ? (y,Y,n,N) ')
    while x not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = x in ('y', 'Y')
    # feature hashing
    if hashing:
        m = int(input('M = '))

    # Step 3: load the stop words.
    with open('stopwords.txt', 'r') as stop:
        s1 = set(stop.read().split())

    # Step 4: scan the text once, collecting counters and word material.
    n = 0
    char_total = 0
    alnum_total = 0
    pieces = []
    with open(file_name, 'r') as f:
        for line in f:
            n += 1
            for i in line:
                if i != '\n':
                    char_total += 1
                if i.isascii() and i.isalnum():
                    alnum_total += 1
                    pieces.append(i)
                else:
                    # Any symbol or whitespace separates words, not only the
                    # handful of characters that used to be listed here.
                    pieces.append(' ')
    lww = ''.join(pieces).lower().split()
    wlww = [word for word in lww if word not in s1]

    print('-------------------')
    print('char count =', char_total)
    print('alphanumeric count =', alnum_total)
    print('line count =', n)
    print('word count =', len(lww))
    if hashing:
        # Sorted by hash value; an empty bag simply prints [].
        print('BoW =', sorted(_counts_in_order(fhash(word, m) for word in wlww)))
    else:
        print('BoW =', _counts_in_order(wlww))
# 6330182021 (14.55) 41 (2021-03-22 00:09)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) mod ``M`` (``M`` may be an int or numeric string)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def _counts_in_order(items):
    """Return ``[[item, count], ...]`` in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


if __name__ == '__main__':
    file_name = input('File name = ',)
    x = input('Use feature hashing ? (y,Y,n,N) ',)
    with open(file_name, 'r') as read_file:
        lines = [l[:-1] if l.endswith('\n') else l for l in read_file]
    with open('stopwords.txt', 'r') as st:
        sw = set(st.read().split())

    # Keep asking until the answer is valid instead of giving up after a
    # single "Try again." message.
    while x not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ',)
    hashing = x in ('y', 'Y')
    if hashing:
        M = int(input('M = ',))

    c = len(lines)
    char_total = sum(len(l) for l in lines)
    # Lines are joined with a space so the last word of one line cannot merge
    # with the first word of the next.
    cleaned = ''.join(
        e if e.isascii() and e.isalnum() else ' ' for e in ' '.join(lines)
    )
    wc = cleaned.lower().split()
    t = [e for e in wc if e not in sw]

    print('-------------------')
    print('char count =', char_total)
    print('alphanumeric count =', sum(len(e) for e in wc))
    print('line count =', c)
    print('word count =', len(wc))
    if hashing:
        print('BoW =', _counts_in_order(fhash(e, M) for e in t))
    else:
        print('BoW =', _counts_in_order(t))
# 6330183621 (13.00) 42 (2021-03-21 15:07)
#-------------------------------------------------
def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) mod ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M
#-------------------------------------------------
def remove_stopword(list_of_data_lower):
    """Return the given lower-cased words minus those listed in ``stopwords.txt``."""
    with open('stopwords.txt') as a:
        b = set(a.read().lower().split())
    return [w for w in list_of_data_lower if w not in b]
#-------------------------------------------------
def check(a):
    """Return ``a`` with every non-alphanumeric character replaced by a space."""
    return ''.join(ch if ch.isascii() and ch.isalnum() else ' ' for ch in a)
#-------------------------------------------------
def _read_lines(file_name):
    """Return the lines of ``file_name`` without their trailing newline."""
    with open(file_name) as data:
        return [line[:-1] if line.endswith('\n') else line for line in data]
#-------------------------------------------------
def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    return sum(len(line) for line in _read_lines(file_name))
#-------------------------------------------------
def alphanum_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    return sum(
        1
        for line in _read_lines(file_name)
        for ch in line
        if ch.isascii() and ch.isalnum()
    )
#-------------------------------------------------
def line_count(file_name):
    """Return the number of lines in the file."""
    return len(_read_lines(file_name))
#-------------------------------------------------
def word_count(file_name):
    """Return the number of alphanumeric words in ``file_name``."""
    # Lines are joined with a space so words on adjacent lines stay separate.
    return len(check(' '.join(_read_lines(file_name))).split())
#-------------------------------------------------
def bow(file_name, M):
    """Return the sorted bag-of-words of ``file_name``.

    Words are lower-cased, split on non-alphanumerics and filtered against
    the stop list.  When ``M`` is not None each word is replaced by
    ``fhash(word, M)`` before counting.
    """
    words = check(' '.join(_read_lines(file_name)).lower()).split()
    c = remove_stopword(words)
    if M is not None:
        c = [fhash(w, M) for w in c]
    counts = {}
    for item in c:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]
#-------------------------------------------------
def run():
    """Print the separator and the four counters for the global ``file_name``."""
    print('-------------------')
    print('char count = '+str(char_count(file_name)))
    print('alphanumeric count = '+str(alphanum_count(file_name)))
    print('line count = '+str(line_count(file_name)))
    print('word count = '+str(word_count(file_name)))
#-------------------------------------------------
if __name__ == '__main__':
    file_name = input( 'File name = ')
    while True:
        yes_or_no = input( 'Use feature hashing ? (y,Y,n,N) ' )
        if yes_or_no.lower() == 'y':
            M = int(input('M = '))
            break
        if yes_or_no.lower() == 'n':
            M = None
            break
        print('Try again.')
    run()
    print('BoW =', bow(file_name, M))
# 6330184221 (17.45) 43 (2021-03-22 19:29)
def _sorted_counts(items):
    """Return ``[[item, count], ...]`` sorted by item (``[]`` for no items)."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


def fhash(w, M):
    """Return the hashed bag-of-words of the word list ``w``.

    Each word is mapped to sum(ord(c_i) * 37**i) mod ``M``; the result is a
    list of ``[hash, count]`` pairs sorted by hash.  Works for lists of any
    length, including empty and single-element lists.
    """
    return _sorted_counts(
        sum(ord(ch) * 37 ** i for i, ch in enumerate(e)) % M for e in w
    )


def nhash(a):
    """Return the bag-of-words of word list ``a`` as sorted ``[word, count]`` pairs."""
    return _sorted_counts(a)


if __name__ == "__main__":
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = fh in ('y', 'Y')
    if hashing:
        M = int(input('M = '))

    print("-------------------")
    with open(file_name, 'r') as fn:
        lines = [line[:-1] if line.endswith('\n') else line for line in fn]
    linecount = len(lines)
    print('char count = '+str(sum(len(line) for line in lines)))

    # Non-alphanumerics become spaces (not nothing) so words never fuse.
    word = ''.join(
        e if e.isascii() and e.isalnum() else ' '
        for e in ' '.join(lines).lower()
    )
    data = word.split()
    print('alphanumeric count = '+str(sum(len(e) for e in data)))
    print('line count = '+str(linecount))
    print('word count = '+str(len(data)))

    with open('stopwords.txt', 'r') as sw:
        # A set of whole words: membership must not be a substring test.
        stopwords = set(sw.read().split())
    BoW = [e for e in data if e not in stopwords]

    if hashing:
        print('BoW = '+str(fhash(BoW, M)))
    else:
        print('BoW = '+str(nhash(BoW)))
# 6330185921 (8.35) 44 (2021-03-22 23:08)
def removelist(the_list, val):
    """Return a copy of ``the_list`` without the elements equal to ``val``."""
    return [value for value in the_list if value != val]


def flash(a, M):
    """Return the feature hash of ``a``: sum(ord(a[i]) * 37**i) mod ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(a)) % M


def _sorted_counts(items):
    """Return ``[[item, count], ...]`` sorted by item (``[]`` for no items)."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


if __name__ == "__main__":
    Filename = input("File name = ")
    # Read the file once; every statistic below is derived from these lines.
    with open(Filename, "r") as file_name:
        lines = file_name.readlines()

    use = input("Use feature hashing ? (y,Y,n,N) ")
    while use not in ('Y', 'y', 'N', 'n'):
        print('Try again.')
        use = input("Use feature hashing ? (y,Y,n,N) ")
    if use in ('Y', 'y'):
        use = 1
        M = int(input("M = "))
    else:
        use = 2

    with open('stopwords.txt', "r") as stops:
        b = set(stops.read().split())

    c = ''.join(lines)

    #1 char count
    print('-------------------')
    print('char count =', len(c.replace('\n', '')))

    #2 alphanumeric count ('0' is a digit too)
    print('alphanumeric count =',
          sum(1 for e in c if e.isascii() and e.isalnum()))

    #3 line count
    print('line count =', len(lines))

    #4 word count
    word = ''.join(e if e.isascii() and e.isalnum() else ' ' for e in c).split()
    print('word count =', len(word))

    #bow
    t = [w.lower() for w in word]
    t = [w for w in t if w not in b]
    if use == 2:
        bow = _sorted_counts(t)
    else:
        #bow fh
        bow = _sorted_counts(flash(w, M) for w in t)
    print('BoW =', bow)
# 6330186521 (22.99) 45 (2021-03-22 23:30)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) mod ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _split_alnum(line):
    """Return the maximal alphanumeric runs of ``line``, in order."""
    found = []
    w = ''
    for ch in line:
        if ch.isalnum():
            w += ch
        elif w != '':
            found.append(w)
            w = ''
    if w != '':
        # The last line of a file may not end with '\n'; its final word still
        # counts.
        found.append(w)
    return found


if __name__ == '__main__':
    file_name = input('File name = ')
    yesno = input('Use feature hashing ? (y,Y,n,N) ')
    while yesno not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        yesno = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = yesno in ('y', 'Y')
    if hashing:
        M = int(input('M = '))
    print('-------------------')

    chcount = 0
    alnum = 0
    lncount = 0
    wlist = []
    with open(file_name, 'r') as fn:
        for line in fn:
            lncount += 1
            for ch in line:
                if ch != '\n':
                    chcount += 1
                if ch.isalnum():
                    alnum += 1
            wlist += _split_alnum(line)
    wcount = len(wlist)

    print('char count = ' + str(chcount))
    print('alphanumeric count = ' + str(alnum))
    print('line count = ' + str(lncount))
    print('word count = ' + str(wcount))

    with open('stopwords.txt', 'r') as fstop:
        stoplist = set(fstop.read().lower().split())

    # Only the bag-of-words is hashed; the counters above use the raw words.
    counts = {}
    for w in wlist:
        w = w.lower()
        if w in stoplist:
            continue
        key = fhash(w, M) if hashing else w
        counts[key] = counts.get(key, 0) + 1
    K = [[key, counts[key]] for key in sorted(counts)]
    print('BoW = ' + str(K))
# 6330187121 (29.00) 46 (2021-03-21 21:20)
#list_of_words------------------------------------------------------
def listword(file_name):
    """Return the lower-cased alphanumeric words of ``file_name``, in order."""
    pieces = []
    with open(file_name) as fn:
        for line in fn:
            for e in line:
                if 'A' <= e <= 'Z' or 'a' <= e <= 'z':
                    pieces.append(e.lower())
                elif '0' <= e <= '9':
                    pieces.append(e)
                else:
                    pieces.append(' ')
    return ''.join(pieces).split()
#stopword-----------------------------------------------------------
def stopword(file_name):
    """Return the whitespace-separated tokens of the stop-word file."""
    with open(file_name) as fn:
        return fn.read().split()
#removestopword-----------------------------------------------------
def removestopword(file_name):
    """Return the sorted words of ``file_name`` that are not stop words.

    Membership is tested against whole stop words; testing against the
    concatenation of all stop words would also discard any word that merely
    happens to be a substring of it.
    """
    stoplist = set(stopword('stopwords.txt'))
    return sorted(x for x in listword(file_name) if x.lower() not in stoplist)
#char_count---------------------------------------------------------
def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name) as fn:
        return sum(1 for line in fn for e in line if e != '\n')
#alphabet_count-----------------------------------------------------
def alphabet_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    with open(file_name) as fn:
        return sum(
            1
            for line in fn
            for e in line
            if 'a' <= e.lower() <= 'z' and len(e.lower()) == 1 and e.isascii()
            or '0' <= e <= '9'
        )
#line_count---------------------------------------------------------
def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name) as fn:
        return sum(1 for _ in fn)
#word_count---------------------------------------------------------
def word_count(file_name):
    """Return the number of words in the file (stop words included)."""
    return len(listword(file_name))
#-------------------------------------------------------------------
def _sorted_counts(items):
    """Return ``[[item, count], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]
#BoW----------------------------------------------------------------
def BoWnofh(file_name):
    """Return the sorted ``[word, count]`` bag of the non-stop-words."""
    return _sorted_counts(removestopword(file_name))
#fh-----------------------------------------------------------------
def fh(word):
    """Return sum(ord(word[i]) * 37**i) mod the module-level ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M
#BoWwithfh----------------------------------------------------------
def BoWwithfh(file_name):
    """Return the sorted ``[hash, count]`` bag of the non-stop-words."""
    return _sorted_counts(fh(m) for m in removestopword(file_name))
#-------------------------------------------------------------------
if __name__ == '__main__':
    file_name = input('File name = ')
    fhornot = input('Use feature hashing ? (y,Y,n,N) ')
    # Compare against the four valid answers; a substring test on 'yYnN'
    # would also accept '' or 'yY'.
    while fhornot not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fhornot = input('Use feature hashing ? (y,Y,n,N) ')
    if fhornot.lower() == 'y':
        M = int(input('M = '))
    print('-------------------')
    print('char count =' ,char_count(file_name))
    print('alphanumeric count =',alphabet_count(file_name))
    print('line count =',line_count(file_name))
    print('word count =',word_count(file_name))
    if fhornot.lower() == 'n':
        print('BoW =',BoWnofh(file_name))
    else :
        print('BoW =',BoWwithfh(file_name))
# 6330188821 (14.00) 47 (2021-03-22 23:18)
def get(words, stopWords, isBoW, M):
    """Return the bag-of-words of ``words`` as ``[key, count]`` pairs.

    Words are lower-cased and dropped when present in ``stopWords``.  When
    ``isBoW`` is true the key is the feature hash
    sum(ord(c_i) * 37**i) mod ``M`` of the word, otherwise the word itself.
    Pairs appear in order of first occurrence.
    """
    G = 37
    counts = {}
    for p in words:
        p = p.lower()
        if p in stopWords:
            continue
        if isBoW:
            key = sum(ord(ch) * G ** i for i, ch in enumerate(p)) % M
        else:
            key = p
        counts[key] = counts.get(key, 0) + 1
    return [[key, total] for key, total in counts.items()]


def _is_alnum(b):
    """True for ASCII letters and digits."""
    return ("A" <= b <= "Z") or ("a" <= b <= "z") or ("0" <= b <= "9")


if __name__ == "__main__":
    file_name = input("File name = ")
    BoW = input("Use feature hashing ? (y,Y,n,N) ")
    M = -1
    while BoW not in ("n", "N", "y", "Y"):
        print("Try again.")
        BoW = input("Use feature hashing ? (y,Y,n,N) ")
    if BoW in ("Y", "y"):
        M = int(input("M = "))
        BoW = True
    else:
        BoW = False
    print("-------------------")

    with open("stopwords.txt", "r") as stop:
        a = set(stop.read().lower().split())

    len1 = 0
    len2 = 0
    linecount = 0
    words = []
    with open(file_name, "r") as file:
        for line in file:
            linecount += 1
            word = ""
            for b in line:
                if b != "\n":
                    len1 += 1
                if _is_alnum(b):
                    len2 += 1
                    word += b
                elif word:
                    words.append(word)
                    word = ""
            if word:
                # Final word of a line that has no trailing newline.
                words.append(word)

    print("char count =", len1)
    print("alphanumeric count =", len2)
    print("line count =", linecount)
    print("word count =", len(words))
    print("BoW =", get(words, a, BoW, M))
# 6330189421 (30.00) 48 (2021-03-22 22:34)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) mod ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _counts_in_order(items):
    """Return ``[[item, count], ...]`` in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


if __name__ == "__main__":
    n = input("File name = ")
    # Read everything up front inside ``with`` so the handle is always closed.
    with open(n, "r") as file_name:
        lines = file_name.readlines()

    m = input("Use feature hashing ? (y,Y,n,N) ")
    while m not in ("y", "Y", "n", "N"):
        print("Try again.")
        m = input("Use feature hashing ? (y,Y,n,N) ")

    with open("stopwords.txt", "r") as stops:
        st = set(stops.read().split())

    alnum_chars = "abcdefghijklmnopqrstuvwxyz0123456789"
    count = 0
    alpha = 0
    lcount = 0
    pieces = []
    for line in lines:
        for i in line.lower():
            if i != "\n":
                count += 1
            if i in alnum_chars:
                alpha += 1
                pieces.append(i)
            else:
                pieces.append(" ")
        lcount += 1
    all_words = "".join(pieces).split()
    wcount = len(all_words)
    word = [i for i in all_words if i not in st]

    if m in ("y", "Y"):
        M = int(input("M = "))
        word = [fhash(i, M) for i in word]

    print("-------------------")
    print("char count =", count)
    print("alphanumeric count =", alpha)
    print("line count =", lcount)
    print("word count =", wcount)
    print("BoW =", _counts_in_order(word))
# 6330190021 (19.20) 49 (2021-03-21 14:24)
#-------------------------------------------
def char_count( s ) :
    """Return the number of ASCII letters in ``s``.

    Upper and lower case are tested separately: the single range 'A'..'z'
    also contains the six symbols between 'Z' and 'a'.
    """
    return sum(1 for e in s if 'A' <= e <= 'Z' or 'a' <= e <= 'z')

def num_count( s ) :
    """Return the number of ASCII digits in ``s``."""
    return sum(1 for e in s if '0' <= e <= '9')

def BoW( l, stopwords, condi ) :
    """Return the sorted bag-of-words of word list ``l``.

    Words are lower-cased and filtered against ``stopwords``.  When
    ``condi`` is 'y' or 'Y' each word is replaced by ``fhash(word, M)``
    using the module-level ``M``.  The result is a list of
    ``[key, count]`` pairs sorted by key.
    """
    kept = [e.lower() for e in l]
    kept = [e for e in kept if e not in stopwords]
    if condi in ('y', 'Y') :
        kept = [fhash(e, M) for e in kept]
    counts = {}
    for e in kept :
        counts[e] = counts.get(e, 0) + 1
    return [[e, counts[e]] for e in sorted(counts)]

def fhash( w, M ) :
    """Return sum(ord(w[i]) * 37**i) mod ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M
#-------------------------------------------
if __name__ == '__main__' :
    file_name = input('File name = ')
    hashing = input('Use feature hashing ? (y,Y,n,N) ').strip()
    # Exact match only: '' and 'yY' are substrings of 'yYnN' but not answers.
    while hashing not in ('y', 'Y', 'n', 'N') :
        print('Try again.')
        hashing = input('Use feature hashing ? (y,Y,n,N) ').strip()
    if hashing in ('y', 'Y') :
        M = int(input('M = '))

    with open('stopwords.txt', 'r') as fl :
        stopwords = set(fl.read().lower().split())

    with open(file_name, 'r') as fl :
        lines = [line[:-1] if line.endswith('\n') else line for line in fl]
    line_count = len(lines)
    words_info = ''.join(lines)

    print('-------------------')
    print('char count =',len(words_info))
    alp_count = char_count(words_info)+num_count(words_info)
    print('alphanumeric count =',alp_count)
    print('line count =',line_count)

    # Split per line (joined by a space) so a word ending one line never
    # merges with the word starting the next; every non-alphanumeric
    # character is a separator.
    new_words = ''.join(
        e if e.isascii() and e.isalnum() else ' ' for e in ' '.join(lines)
    ).split()
    print('word count =',len(new_words))
    print('BoW =',BoW(new_words,stopwords,hashing))
# 6330191621 (18.32) 50 (2021-03-22 12:59)
def remove_n(text):
    """Return ``text`` with every newline replaced by a space."""
    return text.replace('\n', ' ')


def info(text):
    """Print char, alphanumeric, line and word counts for ``text``.

    Newlines are not counted as characters.  A final line without a
    terminating newline still counts as a line, and words are the maximal
    alphanumeric runs of the text.
    """
    newline_total = text.count('\n')
    line_total = newline_total
    if text and not text.endswith('\n'):
        line_total += 1
    alnum_total = sum(1 for item in text if item.isascii() and item.isalnum())
    word_total = len(remove_punction(text).split())
    print('char count = ' + str(len(text) - newline_total))
    print('alphanumeric count = ' + str(alnum_total))
    print('line count = ' + str(line_total))
    print('word count = ' + str(word_total))


def frq(item, list):
    """Return how many elements of ``list`` equal ``item``."""
    return sum(1 for element in list if element == item)


def remove_punction(text):
    """Return ``text`` with every non-alphanumeric character replaced by a space."""
    return ''.join(e if e.isascii() and e.isalnum() else ' ' for e in text)


def get_unique(list):
    """Return the elements of ``list`` without duplicates, in first-seen order."""
    ulst = []
    for item in list:
        if item not in ulst:
            ulst.append(item)
    return ulst


def _counts_in_order(items):
    """Return ``[[item, count], ...]`` in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


def bow(text, stopword):
    """Return the ``[word, count]`` bag of ``text`` without the stop words.

    ``stopword`` is the raw content of the stop-word file.  Pairs appear in
    order of first occurrence.
    """
    stopwordlst = set(stopword.split())
    wordlst = remove_punction(text).casefold().split()
    return _counts_in_order(word for word in wordlst if word not in stopwordlst)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) mod ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def bowf(text, stopword, M):
    """Return the ``[hash, count]`` bag of ``text`` without the stop words."""
    stopwordlst = set(stopword.split())
    wordlst = remove_punction(text).casefold().split()
    return _counts_in_order(
        fhash(word, M) for word in wordlst if word not in stopwordlst
    )


def main():
    """Prompt for the file and hashing choice, then print the report."""
    with open(input('File name = '), 'r') as fn:
        t = fn.read()
    with open('stopwords.txt', 'r') as fn2:
        s = fn2.read()
    n = ''
    while n not in ['y', 'Y', 'n', 'N']:
        n = input('Use feature hashing ? (y,Y,n,N) ')
        if n not in ['y', 'Y', 'n', 'N']:
            print('Try again.')
    if n in ('Y', 'y'):
        M = int(input('M = '))
        print('-------------------')
        info(t)
        print('BoW =', bowf(t, s, M))
    else:
        print('-------------------')
        info(t)
        print('BoW =', bow(t, s))


#Program
if __name__ == '__main__':
    main()
# 6330192221 (22.99) 51 (2021-03-22 23:09)
logic = ['y', 'Y', 'n', 'N']


def _is_alnum(t):
    """Return True when t is an ASCII letter or digit."""
    return 'a' <= t <= 'z' or 'A' <= t <= 'Z' or '0' <= t <= '9'


def char_count(line):
    """Return the number of characters in line, ignoring a trailing newline."""
    # Blindly subtracting 1 is wrong for a last line without a newline.
    return len(line.rstrip('\n'))
#-------------------------------
def alphanumeric_count(line):
    """Return the number of ASCII letters and digits in line."""
    return sum(1 for t in line if _is_alnum(t))
#-------------------------------
def line_count(line):
    """Return 1 for a non-empty line string, 0 otherwise."""
    return 1 if len(line) != 0 else 0
#-------------------------------
def word_count(line):
    """Return the lower-cased words of line (maximal alphanumeric runs)."""
    # Splitting after replacing separators also keeps a word that ends the
    # line without any trailing delimiter.
    spaced = ''.join(t if _is_alnum(t) else ' ' for t in line)
    return spaced.lower().split()
#-------------------------------
def BoW(lis):
    """Return a sorted list of [item, frequency] pairs for the items of lis."""
    counts = {}
    for item in lis:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())
#-----------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    s = 0
    for i, ch in enumerate(w):
        s += ord(ch) * (G ** i)
    return s % M
#-----------------------------
def hashedBoW(wordlist, M):
    """Return the sorted bag of fhash(word, M) values of wordlist."""
    return BoW([fhash(word, M) for word in wordlist])
#-----------------------------
def main():
    """Run the interactive bag-of-words report."""
    file_name = input("File name = ")
    c = alpha = l = 0
    w = []
    with open(file_name, "r") as fin:
        for line in fin:
            c += char_count(line)
            alpha += alphanumeric_count(line)
            l += line_count(line)
            w += word_count(line)
    with open("stopwords.txt", "r") as fin2:
        x = [word for line in fin2 for word in line.split()]

    a = input("Use feature hashing ? (y,Y,n,N) ")
    while a not in logic:
        print("Try again.")
        a = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if a in ('y', 'Y') else None

    b = [word for word in w if word not in x]
    print("-------------------")
    print("char count =", c)
    print("alphanumeric count =", alpha)
    print("line count =", l)
    print("word count =", len(w))
    if M is None:
        print("BoW =", BoW(b))
    else:
        print("BoW =", hashedBoW(b, M))


if __name__ == '__main__':
    main()
# 6330193921 (30.00) 52 (2021-03-20 22:26)
#--------------------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    weighted = [ord(ch) * (37 ** pos) for pos, ch in enumerate(w)]
    return sum(weighted) % int(M)


def _is_word_char(ch):
    """Return True when ch is treated as a letter or a digit by this script."""
    return 'A' <= ch.upper() <= 'Z' or '0' <= ch <= '9'


def ch_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name) as handle:
        return sum(1 for row in handle for ch in row if ch != '\n')


def ch_num(file_name):
    """Return the number of letters and digits in the file."""
    total = 0
    with open(file_name) as handle:
        for row in handle:
            total += sum(1 for ch in row if _is_word_char(ch))
    return total


def w_count(file_name):
    """Return the number of words in the file, counted line by line."""
    total = 0
    with open(file_name) as handle:
        for row in handle:
            spaced = ''.join(ch if _is_word_char(ch) else ' ' for ch in row)
            total += len(spaced.split())
    return total


def wl_count(words, word):
    """Return how many entries of words equal word."""
    return sum(1 for entry in words if entry == word)


def l_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name) as handle:
        return sum(1 for _ in handle)


def BoW(file_name):
    """Return the sorted bag of words of the file.

    Reads the module-level YON (the y/Y/n/N answer) and, when hashing is
    selected, the module-level M.
    """
    tokens = only_words(file_name)
    banned = only_words('stopwords.txt')
    kept = [tok for tok in tokens if tok not in banned]
    bag = []
    seen = []
    if YON in ['y', 'Y']:
        hashes = []
        for tok in kept:
            value = fhash(tok, M)
            hashes.append(value)
            if value not in seen:
                seen.append(value)
        bag = [[value, hashes.count(value)] for value in seen]
        bag.sort()
    if YON in ['n', 'N']:
        for tok in kept:
            if tok in seen:
                continue
            bag.append([tok, wl_count(kept, tok)])
            seen.append(tok)
        bag.sort()
    return bag


def only_words(file_name):
    """Return the lower-cased words of the file as a list."""
    pieces = []
    with open(file_name) as handle:
        for row in handle:
            pieces.extend(ch.lower() if _is_word_char(ch) else ' ' for ch in row)
    return ''.join(pieces).split()
#------------------------------------------------
file_name = input('File name = ')
while True:
    YON = input('Use feature hashing ? (y,Y,n,N) ')
    if YON in ['y', 'Y']:
        M = input('M = ')
    if YON in ['y', 'Y', 'n', 'N']:
        break
    print('Try again.')
print('-------------------')
print(f'char count = {ch_count(file_name)}')
print(f'alphanumeric count = {ch_num(file_name)}')
print(f'line count = {l_count(file_name)}')
print(f'word count = {w_count(file_name)}')
print(f'BoW = {BoW(file_name)}')
# 6330194521 (13.16) 53 (2021-03-22 00:22)
#Prog-08: Bag-of-words
#6330194521 (13.16) Name Taechit Pornsukasem
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    summ = 0
    for i, ch in enumerate(w):
        summ += ord(ch) * (37 ** i)
    return summ % M


def split_words(text):
    """Return the lower-cased words of text.

    Every non-alphanumeric character acts as a separator, so punctuation
    never glues two words together.
    """
    spaced = ''.join(j if j in ALNUM else ' ' for j in text)
    return spaced.lower().split()


def count_items(items):
    """Return [item, frequency] pairs in order of first appearance."""
    order = []
    counts = {}
    for item in items:
        if item not in counts:
            order.append(item)
            counts[item] = 0
        counts[item] += 1
    return [[item, counts[item]] for item in order]


def bow1(want):
    """Return the sorted bag of words of the word list want."""
    return sorted(count_items(want))


def bow2(want, M=None):
    """Return the sorted bag of fhash(word, M) values of want.

    Raises ValueError when M is not supplied.
    """
    if M is None:
        raise ValueError('M is required for feature hashing')
    return sorted(count_items([fhash(i, M) for i in want]))


def main():
    """Prompt for the file and mode, then print the counts and the BoW."""
    fn = input("File name: ")
    with open(fn, 'r') as file_name:
        word = file_name.readlines()

    M = None
    while True:
        ufh = input("Use feature hashing? (y,Y,n,N) ")
        if ufh in ('y', 'Y'):
            M = int(input("M = "))
            break
        if ufh in ('n', 'N'):
            break
        print("Try again.")

    # Count per line without its newline, so a last line lacking a trailing
    # newline is not under-counted.
    print('char count =', sum(len(line.rstrip('\n')) for line in word))
    print('alphanumeric count =',
          sum(1 for line in word for j in line if j in ALNUM))
    print('line count =', len(word))
    res = [w for line in word for w in split_words(line)]
    print('word count =', len(res))

    # Every sibling program for this assignment reads 'stopwords.txt'; the
    # previous 'stopword.txt' name is treated as a typo.
    with open('stopwords.txt', "r") as stop:
        stopword = set(stop.read().split())
    want = [i for i in res if i not in stopword]

    if M is None:
        print('BoW =', bow1(want))
    else:
        print('BoW =', bow2(want, M))


if __name__ == '__main__':
    main()
# 6330197421 (22.90) 54 (2021-03-21 13:13)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    answer = 0
    for i, ch in enumerate(w):
        answer += ord(ch) * 37 ** i
    return answer % M


def is_alnum(e):
    """Return True when e is an ASCII letter or digit."""
    # "A" <= e <= "z" alone would also accept [ \ ] ^ _ and `.
    return "0" <= e <= "9" or "A" <= e <= "Z" or "a" <= e <= "z"


def split_words(line):
    """Return the lower-cased words of line.

    Non-alphanumeric characters separate words; they are not deleted, so
    "a,b" yields two words and a bare "--" yields none.
    """
    return "".join(e if is_alnum(e) else " " for e in line).lower().split()


def count_sorted(items):
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for the inputs and print the bag-of-words report."""
    file_name = input("File name = ")
    feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
    M = None
    while True:
        if feature_hashing in ("y", "Y"):
            feature_hashing = True
            M = int(input("M = "))
            break
        if feature_hashing in ("n", "N"):
            feature_hashing = False
            break
        print("Try again.")
        feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
    print("-------------------")

    with open("stopwords.txt") as filehandler:
        # Lower-case while building the set; rebinding the loop variable
        # afterwards would leave the stored words untouched.
        stopwords = {word.lower() for line in filehandler for word in line.split()}

    character_count = 0
    alphanumeric_count = 0
    line_count = 0
    words = []
    with open(file_name) as filehandler:
        for line in filehandler:
            character_count += len(line.rstrip("\n"))
            alphanumeric_count += sum(1 for e in line if is_alnum(e))
            line_count += 1
            words += split_words(line)
    file_name_no_stopwords = [word for word in words if word not in stopwords]

    if feature_hashing:
        BoW = count_sorted([fhash(w, M) for w in file_name_no_stopwords])
    else:
        BoW = count_sorted(file_name_no_stopwords)

    # No trailing space in the label: print() already inserts one.
    print("char count =", character_count)
    print("alphanumeric count =", alphanumeric_count)
    print("line count =", line_count)
    print("word count =", len(words))
    print("BoW =", BoW)


if __name__ == "__main__":
    main()
# 6330198021 (21.80) 55 (2021-03-22 13:36)
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    result = 0
    for i, ch in enumerate(w):
        result += ord(ch) * 37 ** i
    return result % M


def remove_punc(t):
    """Return t with every non-alphanumeric, non-whitespace char made a space."""
    return ''.join(e if e.lower() in ALNUM or e.isspace() else ' ' for e in t)


def remove(u):
    """Return u with newlines deleted (used for the character count)."""
    return u.replace('\n', '')


def Remove(v):
    """Return v with newlines turned into spaces (keeps words apart)."""
    return v.replace('\n', ' ')


def count_pairs(items):
    """Return a sorted list of [item, frequency] pairs.

    Counting in a dict avoids comparing neighbours after a sort, which
    loses the only entry when every item is the same.
    """
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, k] for item, k in counts.items())


def main():
    """Read the answers from stdin, echo them, and print the report."""
    valid = ['y', 'Y', 'n', 'N']
    file = input()
    print('File name = ' + file)
    with open(file, 'r') as file_name:
        LINE = file_name.read()
    with open('stopwords.txt', 'r') as file_name2:
        LINE2 = file_name2.read()

    x1 = input()
    while x1 not in valid:
        print('Use feature hashing ? (y,Y,n,N)', x1)
        print('Try again.')
        x1 = input()
    print('Use feature hashing ? (y,Y,n,N)', x1)
    m = None
    if x1 in ['y', 'Y']:
        m = int(input())
        print('M =', m)
    print('-' * 19)

    Line = remove(LINE)
    print('char count =', len(Line))
    print('alphanumeric count =', sum(1 for e in Line.lower() if e in ALNUM))
    # A trailing newline terminates the last line; it does not start a new one.
    line_count = LINE.count('\n')
    if LINE and not LINE.endswith('\n'):
        line_count += 1
    print('line count =', line_count)

    # Tokenise the text with newlines turned into spaces so that words on
    # adjacent lines stay separate.
    x2 = remove_punc(Remove(LINE)).lower().split()  # words in the input file
    x3 = set(remove_punc(Remove(LINE2)).lower().split())  # words in stopwords.txt
    print('word count =', len(x2))

    f = [word for word in x2 if word not in x3]  # surviving words
    if m is None:
        print('BoW =', count_pairs(f))
    else:
        print('BoW =', count_pairs([fhash(x4, m) for x4 in f]))


if __name__ == '__main__':
    main()
# 6330199721 (0.00) 56 (2021-03-22 17:35)
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890'


def fhash(x, M):
    """Return sum(ord(x[i]) * 37**i) modulo int(M), as a decimal string."""
    a = 0
    for i, ch in enumerate(x):
        a += ord(ch) * pow(37, i)
    return str(a % int(M))


def split_words(text):
    """Return the lower-cased words of text; non-alphanumerics separate."""
    return ''.join(e if e in ALNUM else ' ' for e in text).lower().split()


def count_sorted(items):
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for the inputs and print the bag-of-words report."""
    file_name = input('File name = ')
    with open(file_name.strip(), 'r') as fn:
        lines = fn.readlines()

    a = input('Use feature hashing ? (y,Y,n,N) ')
    # Exact comparison: a substring test on 'nN' also accepts ''.
    while a not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    M = input('M = ') if a in ('y', 'Y') else None
    print('-------------------')

    w = [word for line in lines for word in split_words(line)]
    print('char count =', sum(len(line.rstrip('\n')) for line in lines))
    print('alphanumeric count =',
          sum(1 for line in lines for ch in line if ch in ALNUM))
    print('line count =', len(lines))
    print('word count =', len(w))

    with open("stopwords.txt", 'r') as st:
        stop = set(st.read().split())
    w = [word for word in w if word not in stop]

    if M is None:
        print('BoW =', count_sorted(w))
    else:
        # Collect whole hash values; extending a list with the returned
        # string would split each hash into single digit characters.
        print('BoW =', count_sorted([int(fhash(word, M)) for word in w]))


if __name__ == '__main__':
    main()
# 6330200621 (19.80) 57 (2021-03-22 00:15)
AC = 'abcdefghijklmnopqrstuvwxyz1234567890'


def split_words(line):
    """Return the lower-cased words of line; non-alphanumerics separate."""
    lowered = line.lower()
    return ''.join(e if e in AC else ' ' for e in lowered).split()
#------------------------------------------------------------
def everything(fn):
    """Return (alphanumeric count, char count, line count, word count).

    fn is any iterable of lines (an open file works).  Words are counted
    per line so the last word of one line cannot merge with the first word
    of the next.
    """
    anc = cc = lc = wc = 0
    for line in fn:
        line = line.strip('\n')
        lc += 1
        cc += len(line)
        anc += sum(1 for e in line.lower() if e in AC)
        wc += len(split_words(line))
    return anc, cc, lc, wc
#-------------------------------------------------------------
def count_sorted(items):
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def kept_words(path):
    """Return the words of the file at path that are not in stopwords.txt."""
    with open(path, 'r') as fn:
        words = [word for line in fn for word in split_words(line)]
    with open("stopwords.txt", 'r') as st:
        stop = set(st.read().split())
    return [word for word in words if word not in stop]
#---------------------------------------------------
def bow1():
    """Return the sorted bag of words of the file named by the global file_name."""
    return count_sorted(kept_words(file_name.strip()))
#---------------------------------------------------
def fhash(a, M):
    """Return sum(ord(a[i]) * 37**i) modulo int(M), as a decimal string."""
    summ = 0
    for i, ch in enumerate(a):
        summ += ord(ch) * 37 ** i
    return str(summ % int(M))
#----------------------------------------
def bow2():
    """Return the sorted hashed bag of words.

    Uses the module-level file_name and M set by the script section.
    """
    words = kept_words(file_name.strip())
    return count_sorted([int(fhash(f, M)) for f in words])
#-------------------------------------------
if __name__ == '__main__':
    file_name = input('File name = ')
    with open(file_name.strip(), 'r') as fn:
        anc, cc, lc, wc = everything(fn)
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    # Exact comparison: a substring test on 'yYnN' also accepts ''.
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    if fh in ('y', 'Y'):
        M = input('M = ')
    print('-' * len('Use feature hashing'))
    print('char count =', cc)
    print('alphanumeric count =', anc)
    print('line count =', lc)
    print('word count =', wc)
    if fh in ('n', 'N'):
        print('BoW =', bow1())
    else:
        print('BoW =', bow2())
# 6330201221 (24.90) 58 (2021-03-22 19:50)
#Prog-08: Bag-of-words
#6330201221 (24.90) Thatphong Hengchun
num = ['0','1','2','3','4','5','6','7','8','9']
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']


def nFhash(w):
    """Return [item, frequency] pairs for the list w, in first-seen order."""
    checker = []
    freq = {}
    for word in w:
        if word not in freq:
            checker.append(word)
            freq[word] = 0
        freq[word] += 1
    return [[word, freq[word]] for word in checker]


def fhash(word, m):
    """Return sum(ord(word[i]) * 37**i) modulo m."""
    f = 0
    for i, ch in enumerate(word):
        f += ord(ch) * (37 ** i)
    return f % m


def yFhash(w, n):
    """Return [hash, frequency] pairs of fhash(word, n) for the words in w.

    The list w is left unchanged.
    """
    return nFhash([fhash(word, n) for word in w])


def charInLine(s):
    """Return the number of characters in the line s, newline excluded."""
    # Only the line terminator is dropped; leading and trailing spaces count.
    return len(s.rstrip('\n'))


def splitWords(line):
    """Return the lower-cased words of line.

    Characters that are neither letters nor digits become separators, so
    "well-known" gives two words rather than "wellknown".
    """
    lowered = line.lower()
    return ''.join(c if c in alphabet or c in num else ' ' for c in lowered).split()


def main():
    """Prompt for the inputs and print the bag-of-words report."""
    file_name = input('File name = ').strip()
    check = ''
    M = 0
    while check == '':
        useHash = input('Use feature hashing ? (y,Y,n,N) ').lower()
        if useHash == 'y':
            M += int(input('M = '))
            check += 'y'
        elif useHash == 'n':
            check += 'n'
        else:
            print('Try again')

    with open('stopwords.txt', 'r') as stFile:
        stop_words = set(stFile.read().lower().split())

    charCount = 0
    lineCount = 0
    alpnumCount = 0
    cWords = []
    with open(file_name, 'r') as txFile:
        for line in txFile:
            charCount += charInLine(line)
            lineCount += 1
            alpnumCount += sum(1 for c in line.lower() if c in alphabet or c in num)
            cWords += splitWords(line)
    dWords = [f for f in cWords if f not in stop_words]

    print('-------------------')
    print('char count =', charCount)
    print('alphanumeric count =', alpnumCount)
    print('line count =', lineCount)
    print('word count =', len(cWords))
    if check == 'y':
        print('BoW =', sorted(yFhash(dWords, M)))
    else:
        print('BoW =', sorted(nFhash(dWords)))


if __name__ == '__main__':
    main()
# 6330202921 (21.05) 59 (2021-03-22 23:21)
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def listOfStopWord():
    """Return the whitespace-separated words of stopwords.txt as a list."""
    with open('stopwords.txt', 'r') as f:
        return f.read().split()


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    val = 0
    for i, ch in enumerate(w):
        val += ord(ch) * (37 ** i)
    return val % M


def check(word, l, noHash, M, stop=None):
    """Add one occurrence of word to the [key, count] list l.

    The word is lower-cased first and ignored when it is a stop word.
    Unless noHash is true the key is fhash(word, M).  stop may carry a
    pre-loaded stop-word collection; when omitted the file is read.
    """
    if stop is None:
        stop = listOfStopWord()
    word = word.lower()
    if word in stop:
        return
    if not noHash:
        word = fhash(word, M)
    for p in l:
        if p[0] == word:
            p[1] += 1
            return
    l.append([word, 1])


def cal(filename, noHash, M):
    """Print the four counts of the file and return its sorted bag of words."""
    l = []
    stop = set(listOfStopWord())  # loaded once, not once per word
    charcount = 0
    alphanumericcount = 0
    linecount = 0
    wordcount = 0
    with open(filename, "r") as f:
        for line in f:
            line = line.rstrip('\n')
            charcount += len(line)
            alphanumericcount += sum(1 for ch in line if ch in ALNUM)
            # Replace-and-split keeps the final character of the last word
            # and uses the full alphabet (including 'p').
            words = ''.join(ch if ch in ALNUM else ' ' for ch in line).split()
            wordcount += len(words)
            for word in words:
                check(word, l, noHash, M, stop)
            linecount += 1
    print('char count =', charcount)
    print('alphanumeric count =', alphanumericcount)
    print('line count =', linecount)
    print('word count =', wordcount)
    l.sort()
    return l


def main():
    """Prompt for the inputs and print the report."""
    filename = input('File name = ')
    answer = input('Use feature hashing ? (y,Y,n,N) ')
    while answer not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        answer = input('Use feature hashing ? (y,Y,n,N) ')
    noHash = answer in ('n', 'N')
    M = 0 if noHash else int(input('M = '))
    # The separator belongs after every prompt, right before the report.
    print('-------------------')
    BoW = cal(filename, noHash, M)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330203521 (30.00) 60 (2021-03-22 13:03)
def _is_alnum(c):
    """Return True when c is an ASCII letter or one of the ten digits."""
    return ("A" <= c <= "Z") or ("a" <= c <= "z") or (c in "0123456789")


def _run_lengths(values):
    """Return [value, run length] pairs for consecutive equal values."""
    runs = []
    for value in values:
        if runs and runs[-1][0] == value:
            runs[-1][1] += 1
        else:
            runs.append([value, 1])
    return runs


def char_count(file_name):
    """Count the characters of the open file file_name, newlines excluded."""
    # Each line break carries a hidden "\n" that must not be counted.
    return sum(len(line) - line.count("\n") for line in file_name)


def alpha_count(file_name):
    """Count the letters and digits of the open file file_name."""
    return sum(1 for line in file_name for c in line if _is_alnum(c))


def line_count(file_name):
    """Count the lines of the open file file_name."""
    return sum(1 for _ in file_name)


def word_count(file_name):
    """Return the lower-cased words of the open file file_name as a list."""
    chunks = []
    for line in file_name:
        chunks.append("".join(c if _is_alnum(c) else " " for c in line))
    return "".join(chunks).lower().strip().split()


def show_result(a):
    """Print the four counts for the file named a."""
    reports = (
        ("char count =", char_count),
        ("alphanumeric count =", alpha_count),
        ("line count =", line_count),
    )
    # The file is reopened for each count: a handle can only be read once.
    for label, counter in reports:
        with open(a, "r") as handle:
            print(label, counter(handle))
    with open(a, "r") as handle:
        print("word count =", len(word_count(handle)))


def BoW(a):
    """Return the words of file a that are not listed in stopwords.txt."""
    with open(a, "r") as text_file, open("stopwords.txt", "r") as stop_file:
        banned = []
        for line in stop_file:
            banned += line.lower().strip().split()
        tokens = word_count(text_file)
    return [token for token in tokens if token not in banned]


def fhash(w, M):
    """Return sorted [hash, count] pairs for the word list w.

    Each word hashes to sum(ord(ch) * 37**i) modulo int(M).  An empty word
    list yields an empty result.
    """
    hashed = []
    for token in w:
        total = 0
        for power, ch in enumerate(token):
            total += ord(ch) * (37 ** power)
        hashed.append(total % int(M))
    if not hashed:
        return []
    hashed.sort()
    return _run_lengths(hashed)


def nofhash(data):
    """Return sorted [word, count] pairs for the word list data.

    data is sorted in place and ends up with one extra " " element.
    """
    data.sort()
    data.append(" ")
    runs = _run_lengths(data)
    # The run still open when the data ends is never reported.
    runs.pop()
    return runs
#-----------------------------------------------------------------------------------
a = input("File name = ")
while True:
    b = input("Use feature hashing ? (y,Y,n,N) ")
    if b not in ("y", "Y", "n", "N"):
        print("Try again.")
        continue
    if b in ("y", "Y"):
        M = input("M = ")
    print("-------------------")
    show_result(a)
    word = BoW(a)
    if b in ("y", "Y"):
        ansf = fhash(word, M)
        print("BoW =", ansf)
    else:
        ansnf = nofhash(word)
        print("BoW =", ansnf)
    break
# 6330205821 (30.00) 61 (2021-03-22 14:48)
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def alc(line):
    """Return the number of ASCII letters and digits in line (any case)."""
    return sum(1 for ch in line.lower() if ch in ALNUM)
#--------------------------------
def charcount(line):
    """Return the number of characters in line."""
    return len(line)
#--------------------------------
def wc(line):
    """Return (word count, lower-cased word list) for line."""
    lowered = line.lower()
    c = ''.join(e if e in ALNUM else ' ' for e in lowered).split()
    return len(c), c
#-----------------------------------
def read_stopwords():
    """Return the words of stopwords.txt as a set."""
    with open('stopwords.txt', 'r') as cut:
        return set(cut.read().split())


def _count_sorted(items):
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def BoW(word_all, stopwords=None):
    """Return the sorted bag of words of word_all without stop words.

    stopwords defaults to the contents of stopwords.txt.
    """
    if stopwords is None:
        stopwords = read_stopwords()
    return _count_sorted([e for e in word_all if e not in stopwords])
#-----------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    su = 0
    for i, ch in enumerate(w):
        su += ord(ch) * 37 ** i
    return su % int(M)
#--------------------------------------------
def bowfh(word_all, M, stopwords=None):
    """Return the sorted bag of fhash(word, M) values, stop words excluded.

    stopwords defaults to the contents of stopwords.txt.
    """
    if stopwords is None:
        stopwords = read_stopwords()
    return _count_sorted([fhash(e, M) for e in word_all if e not in stopwords])


def main():
    """Prompt for the inputs and print the bag-of-words report."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    m = input('M = ') if fh in ('y', 'Y') else None
    print('-------------------')

    a = 0
    cha = 0
    l = 0
    word = 0
    word_all = []
    with open(file_name, 'r') as f:
        for line in f:
            # Drop only the newline: spaces at either end of a line are
            # still characters of the file.
            line = line.rstrip('\n')
            a += alc(line)
            cha += charcount(line)
            l += 1
            n, words = wc(line)
            word += n
            word_all += words
    print('char count =', cha)
    print('alphanumeric count =', a)
    print('line count =', l)
    print('word count =', word)
    if m is None:
        print('BoW =', BoW(word_all))
    else:
        print('BoW =', bowfh(word_all, m))


if __name__ == '__main__':
    main()
# 6330206421 (30.00) 62 (2021-03-21 22:44)
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
num = ['0','1','2','3','4','5','6','7','8','9']


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    sumx = 0
    for i, ch in enumerate(w):
        sumx += ord(ch) * (37 ** i)
    return sumx % M


def split_words(text):
    """Return the words of text; anything but a-z and 0-9 separates words.

    The caller is expected to pass lower-cased text.
    """
    return ''.join(i if i in alphabet or i in num else ' ' for i in text).split()


def count_sorted(items):
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for the inputs and print the bag-of-words report."""
    check = 0
    M = None
    f1 = input("File name = ").strip()
    while check == 0:
        sel = input("Use feature hashing ? (y,Y,n,N) ").strip()
        if sel == "y" or sel == "Y":
            M = int(input("M = "))
            check += 1
        elif sel == "n" or sel == "N":
            check += 2
        else:
            print("Try again")

    with open("stopwords.txt", "r") as stop:
        stopword = set(stop.read().lower().split())

    k = []
    with open(f1, "r") as infile:
        for line in infile:
            # Drop only the newline so spaces at the line edges are counted.
            k.append(line.rstrip('\n').lower())
    char = sum(len(point) for point in k)
    alc = sum(1 for point in k for i in point if i in alphabet or i in num)
    wordlist = [word for point in k for word in split_words(point)]

    print("-" * 19)
    print("char count = " + str(char))
    print("alphanumeric count = " + str(alc))
    print("line count = " + str(len(k)))
    print("word count = " + str(len(wordlist)))

    new = [i for i in wordlist if i not in stopword]
    if check == 2:
        # Sorted like the hashed output below.
        print("BoW = " + str(count_sorted(new)))
    if check == 1:
        print("BoW = " + str(count_sorted([fhash(i, M) for i in new])))


if __name__ == '__main__':
    main()
# 6330208721 (26.50) 63 (2021-03-21 11:12)
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

# Raw text of stopwords.txt; filled in by main() and used by remove_stop()
# when no explicit stop-word collection is passed.
stopword = ''


def remove_nonal(a):
    """Return a with non-alphanumeric, non-whitespace chars made spaces.

    The result is stripped of leading and trailing whitespace.
    """
    return ''.join(ch if ch in ALNUM or ch.isspace() else ' ' for ch in a).strip()


def remove_stop(a, stopwords=None):
    """Return the lower-cased words of a, minus stop words, space-joined.

    stopwords is a collection of whole words; it defaults to the words of
    the module-level stopword text.  Membership is tested per word, never
    as a substring of the stop-word file.
    """
    if stopwords is None:
        stopwords = stopword.lower().split()
    banned = set(stopwords)
    kept = [i for i in remove_nonal(a).lower().split() if i not in banned]
    return ' '.join(kept)


def _count_sorted(items):
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def bow(a):
    """Return the sorted bag of words of the space-separated text a."""
    return _count_sorted(a.lower().split())


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch) * (37 ** i)
    return total % M


def bow_hash(a, M):
    """Return the sorted bag of fhash(word, M) values of the text a."""
    return _count_sorted([fhash(i, M) for i in a.split()])


def main():
    """Prompt for the inputs and print the bag-of-words report."""
    global stopword
    with open(input("File name = "), "r") as sample:
        lines = [line.strip('\n') for line in sample]
    line_count = len(lines)
    text = ' '.join(lines)
    with open('stopwords.txt', "r") as stop:
        stopword = stop.read()

    yn = input("Use feature hashing ? (y,Y,n,N) ")
    # Exact comparison: a substring test on "YyNn" also accepts "".
    while yn not in ("Y", "y", "N", "n"):
        print("Try again.")
        yn = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if yn in ("Y", "y") else None

    refine_text = remove_nonal(text).split()
    print("-------------------")
    print("char count = " + str(sum(len(li) for li in lines)))
    print('alphanumeric count =', len("".join(refine_text)))
    print("line count =", line_count)
    print("word count =", len(refine_text))
    if M is None:
        print("BoW =", bow(remove_stop(text)))
    else:
        print("BoW =", bow_hash(remove_stop(text), M))


if __name__ == '__main__':
    main()
# 6330209321 (22.65) 64 (2021-03-22 21:03)
word = "qwertyuiopasdfghjklzxcvbnm0123456789"


def fhash(w, M):
    """Return sum(ord(lower(w[i])) * 37**i) modulo M."""
    res = 0
    for i, ch in enumerate(w):
        res += ord(ch.lower()) * (37 ** i)
    return res % M


def is_word_char(ch):
    """Return True when ch is an ASCII letter (any case) or a digit."""
    return len(ch) == 1 and ch.lower() in word


def split_words(line):
    """Return the lower-cased words of line; other characters separate."""
    return "".join(ch if is_word_char(ch) else " " for ch in line).lower().split()


def count_sorted(items):
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for the inputs and print the bag-of-words report."""
    with open('stopwords.txt') as f:
        stopword = set(f.read().lower().split())
    with open(input('File name = '), 'r', encoding='utf8') as f:
        rawData = [line.strip('\n') for line in f]
    lineCount = len(rawData)
    # ---------------------------------
    charCount = sum(len(line) for line in rawData)
    alphaNum = sum(1 for line in rawData for ch in line if is_word_char(ch))
    # Words are split first and filtered afterwards, so "the," is
    # recognised as the stop word "the" and upper-case word endings count.
    allWords = [w for line in rawData for w in split_words(line)]
    wordCount = len(allWords)
    betterNoStopWord = [w for w in allWords if w not in stopword]
    # -------------------------------------------------
    useFhash = True
    M = 0
    while True:
        x = input("Use feature hashing ? (y,Y,n,N) ")
        if x in ['y', 'Y']:
            M = int(input("M = "))
            break
        elif x in ['n', 'N']:
            useFhash = False
            break
        else:
            print("Try again.")
    # -------------------------------------------------
    if useFhash:
        bow = count_sorted([fhash(w, M) for w in betterNoStopWord])
    else:
        bow = count_sorted(betterNoStopWord)
    # -------------------------------------------------
    print("-------------------")
    print("char count =", charCount)
    print("alphanumeric count =", alphaNum)
    print("line count =", lineCount)
    print("word count =", wordCount)
    print("BoW =", bow)


if __name__ == '__main__':
    main()
# 6330210921 (22.35) 65 (2021-03-19 00:14)
###################################
def count_alpnum(s) :
    """Return the number of ASCII letters and digits in s."""
    c = 0
    for e in s :
        if "0" <= e <= "9" or "a" <= e <= "z" or "A" <= e <= "Z" :
            c += 1
    return c

def file_list(file_name) :
    """Return the lower-cased words of the file as a list.

    Every character that is not an ASCII letter or digit separates words.
    """
    with open(file_name) as fn :
        text = fn.read()
    spaced = "".join(e if count_alpnum(e) else " " for e in text)
    return spaced.lower().split()

def check_str_in_list(str_list,not_in) :
    """Return the items of str_list that do not occur in not_in."""
    return [e for e in str_list if e not in not_in]

def count_words_in_list(words,lis) :
    """Return how many items of lis equal words."""
    return sum(1 for e in lis if e == words)

def find_num_flash(s,M) :
    """Return sum(ord(s[i]) * 37**i) modulo M."""
    c = 0
    for i, ch in enumerate(s) :
        c += ord(ch)*37**i
    return c % M

def count_num(n,l) :
    """Return how many items of l equal n."""
    return sum(1 for e in l if e == n)

def _count_sorted(items) :
    """Return a sorted list of [item, frequency] pairs."""
    counts = {}
    for item in items :
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())
###################################
def char_count(file_name) :
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name) as fn :
        return sum(len(e.rstrip("\n")) for e in fn)

def alphanumeric(file_name) :
    """Return the number of letters and digits in the file."""
    with open(file_name) as fn :
        return sum(count_alpnum(e) for e in fn)

def line_count(file_name) :
    """Return the number of lines in the file."""
    with open(file_name) as fn :
        return sum(1 for _ in fn)

def word_count(file_name) :
    """Return the number of words in the file.

    Uses the same tokenisation as the bag of words, so "a,b" counts as
    two words and a lone "--" as none.
    """
    return len(file_list(file_name))

def n_bow(file_sample,file_stop) :
    """Return the sorted bag of words of file_sample without stop words."""
    kept = check_str_in_list(file_list(file_sample), file_list(file_stop))
    return _count_sorted(kept)

def feture_hashing(sample,M) :
    """Return the sorted bag of hashed words of the file named sample."""
    # Read the requested file; the name must not be fixed to one sample.
    a = check_str_in_list(file_list(sample), file_list("stopwords.txt"))
    return _count_sorted([find_num_flash(e, M) for e in a])
###################################
def main() :
    """Prompt for the inputs and print the bag-of-words report."""
    print("File name = ",end = "")
    file_name = input()
    while True :
        print("Use feature hashing ? (y,Y,n,N) ",end = "")
        func = input()
        if func in ("y", "Y", "n", "N") :
            break
        print("Try again.")
    if func == "y" or func == "Y" :
        print("M = ",end = "" )
        M = int(input())
    print("-------------------")
    print("char count = ",end = "" )
    print(char_count(file_name))
    print("alphanumeric count = ",end = "" )
    print(alphanumeric(file_name))
    print("line count = ",end = "" )
    print(line_count(file_name))
    print("word count = ",end = "" )
    print(word_count(file_name))
    print("BoW = ",end = "" )
    if func == "y" or func == "Y" :
        print(feture_hashing(file_name,M))
    if func == "n" or func == "N" :
        print(n_bow(file_name,"stopwords.txt"))


if __name__ == "__main__" :
    main()
# 6330211521 (7.63) 66 (2021-03-22 16:51)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def tokenize(text):
    """Return the lower-cased words of text.

    A word is a maximal run of a-z / 0-9 after lower-casing; every other
    character (not only a handful of punctuation marks) separates words.
    """
    cleaned = ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in text.lower()
    )
    return cleaned.split()


def fhash(word, m):
    """Return sum(ord(word[i]) * 37**i) modulo m."""
    total = 0  # reset for every word; the old accumulator leaked across words
    for position, ch in enumerate(word):
        total += ord(ch) * (37 ** position)
    return total % m


def tally(keys):
    """Return [[key, occurrences], ...] sorted by key."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input('File name = ').strip()
    choice = input('Use feature hashing ? (y,Y,n,N) ')
    while choice not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        choice = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = choice in ('y', 'Y')
    if use_hash:
        m = int(input('M = '))

    with open(file_name, 'r') as fin:
        lines = fin.readlines()
    with open('stopwords.txt', 'r') as fin:
        stopwords = set(tokenize(fin.read()))

    # Line terminators are not characters of the text.
    char_count = sum(len(line.rstrip('\n')) for line in lines)
    alphanumeric_count = sum(
        1
        for line in lines
        for ch in line
        if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'
    )
    words = [word for line in lines for word in tokenize(line)]
    kept = [word for word in words if word not in stopwords]
    # Hash every occurrence (not every unique word) so frequencies survive.
    keys = [fhash(word, m) for word in kept] if use_hash else kept

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', len(lines))
    print('word count =', len(words))
    print('BoW =', tally(keys))


if __name__ == "__main__":
    main()
# 6330212121 (25.20) 67 (2021-03-22 14:40)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.

k = 'yYnN'
engandmath = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
# Legacy punctuation tables.  Words are now split on every character outside
# `engandmath`, so these are only kept for backward compatibility.  The
# backslash is written as '\\' (same value, no invalid-escape warning).
p = '''!()-[]{};:'"\\,<>./?@#$%^&*_~ '''
p2 = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch) * (37 ** i)
    return total % m


def split_words(text):
    """Split text into maximal runs of ASCII letters/digits."""
    return ''.join(ch if ch in engandmath else ' ' for ch in text).split()


def bow(f_name, m):
    """Print the bag-of-words of file f_name without stop words.

    m == -1 prints [[word, count], ...]; any other m prints
    [[fhash(word, m), count], ...].  Both lists are sorted by key.
    """
    with open('stopwords.txt', 'r') as infile:
        stopwords = set(infile.read().split())

    words = []
    with open(f_name, 'r') as infile:
        for line in infile:
            words.extend(split_words(line.lower()))

    kept = [word for word in words if word not in stopwords]
    keys = kept if m == -1 else [fhash(word, m) for word in kept]
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    print('BoW =', sorted([key, total] for key, total in counts.items()))


def easy(f_name):
    """Print the separator line and the char/alphanumeric/line/word counts."""
    char_count = 0
    alnum_count = 0
    line_count = 0
    word_count = 0
    with open(f_name, 'r') as infile:
        for line in infile:
            # Only the line terminator is excluded; strip() also discarded
            # leading and trailing blanks, which are real characters.
            text = line.rstrip('\n')
            char_count += len(text)
            line_count += 1
            word_count += len(split_words(text))
            alnum_count += sum(1 for ch in text if ch in engandmath)
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alnum_count)
    print('line count =', line_count)
    print('word count =', word_count)


def main():
    """Prompt for the file and the mode, then print the report."""
    file_name = input('File name = ')
    method = input('Use feature hashing ? (y,Y,n,N) ')
    # Compare against the individual letters: `method in k` is a substring
    # test and accepted '' or 'yY'.
    while method not in list(k):
        print('Try again.')
        method = input('Use feature hashing ? (y,Y,n,N) ')
    m = int(input('M = ')) if method in ('y', 'Y') else -1
    easy(file_name)
    bow(file_name, m)


if __name__ == "__main__":
    main()
# 6330213821 (15.00) 68 (2021-03-22 23:59)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.

# Name of the file being analysed; set by main() and read by word_count().
file_name = None


def fhash(a, m):
    """Return sum(ord(a[i]) * 37**i) modulo m."""
    d = 0  # the accumulator was never initialised before
    for i in range(len(a)):
        d += ord(a[i]) * 37 ** i
    return d % m


def _split_words(text):
    """Return the lower-cased runs of ASCII letters/digits found in text."""
    cleaned = ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in text.lower()
    )
    return cleaned.split()


def stop_word(file):
    """Return the words listed in 'stopwords.txt'.

    The `file` argument is not used (it never was); it is kept so existing
    calls keep working.
    """
    with open('stopwords.txt', 'r') as sw:
        return sw.read().split()


def char_count(file_name):
    """Count the characters of the file, excluding line terminators."""
    with open(file_name) as sw:
        return sum(len(line.rstrip('\n')) for line in sw)


def rmsw(n):
    """Return the words of list n that are not stop words."""
    stops = set(stop_word('stopwords.txt'))
    return [word for word in n if word not in stops]


def word_count(n):
    """Count the words of the module-level `file_name`.

    `n` is unused and only kept for call compatibility.
    """
    with open(file_name, 'r') as f:
        return len(_split_words(f.read()))


def ac(file_name):
    """Count the ASCII letters and digits of the file."""
    total = 0
    with open(file_name, 'r') as f:
        for line in f:
            for e in line:
                if 'a' <= e.lower() <= 'z' or '0' <= e <= '9':
                    total += 1
    return total


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    global file_name  # word_count() reads the module-level name
    file_name = input('File name = ')
    j = input('Use feature hashing ? (y,Y,n,N) ')
    # Loop until the answer is valid (the old loop gave up after 1000 tries).
    while j not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        j = input('Use feature hashing ? (y,Y,n,N) ')
    # `j == 'n' or 'N'` was always true, so M was never requested.
    use_hash = j in ['y', 'Y']
    if use_hash:
        m = int(input('M = '))

    with open(file_name, 'r') as f:
        n = sum(1 for _ in f)
    with open(file_name, 'r') as f:
        words = rmsw(_split_words(f.read()))
    keys = [fhash(word, m) for word in words] if use_hash else words
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1

    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', ac(file_name))
    print('line count =', n)
    print('word count =', word_count(n))
    print('BoW =', sorted([key, total] for key, total in counts.items()))


if __name__ == "__main__":
    main()
# 6330214421 (26.00) 69 (2021-03-21 17:32)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9"


def split_alnum(text):
    """Split text into maximal runs of ASCII letters/digits."""
    return "".join(ch if _is_alnum(ch) else " " for ch in text).split()


def run_lengths(items):
    """Collapse an already sorted list into [[item, count], ...].

    Returns [] for an empty list (the old inline code raised IndexError when
    every word was a stop word or the file was empty).
    """
    groups = []
    for item in items:
        if groups and groups[-1][0] == item:
            groups[-1][1] += 1
        else:
            groups.append([item, 1])
    return groups


def charcount(a):
    """Count the characters of file a, excluding line terminators."""
    with open(a, "r") as infile:
        # rstrip("\n") rather than strip(): leading/trailing blanks count.
        return sum(len(line.rstrip("\n")) for line in infile)


def alcount(a):
    """Count the ASCII letters and digits of file a."""
    with open(a, "r") as infile:
        return sum(1 for line in infile for ch in line if _is_alnum(ch))


def linecount(a):
    """Count the lines of file a."""
    with open(a, "r") as infile:
        return sum(1 for _ in infile)


def wordcount(a):
    """Count the words (runs of ASCII letters/digits) of file a."""
    with open(a, "r") as infile:
        return sum(len(split_alnum(line)) for line in infile)


def fhash(a, f):
    """Return sum(ord(a[i]) * 37**i) modulo f."""
    x = 0
    for i in range(len(a)):
        x += ord(a[i]) * (37 ** i)
    return x % f


def _kept_words(file_name):
    """Return the lower-cased words of file_name that are not stop words."""
    with open("stopwords.txt", "r") as x1:
        stop_words = set(x1.read().split())
    words = []
    with open(file_name, "r") as x2:
        for line in x2:
            words.extend(split_alnum(line.lower()))
    return [word for word in words if word not in stop_words]


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input("File name = ")
    b = input("Use feature hashing ? (y,Y,n,N) ")
    while b not in ["y", "Y", "n", "N"]:
        print("Try again.")
        b = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = b in ["Y", "y"]
    if use_hash:
        m = int(input("M = "))

    print("-------------------")
    print("char count =", charcount(file_name))
    print("alphanumeric count =", alcount(file_name))
    print("line count =", linecount(file_name))
    print("word count =", wordcount(file_name))

    keys = _kept_words(file_name)
    if use_hash:
        keys = [fhash(word, m) for word in keys]
    print("BoW =", run_lengths(sorted(keys)))


if __name__ == "__main__":
    main()
# 6330215021 (30.00) 70 (2021-03-22 05:06)
def cha_count(x):
    """Return the summed length of every string in x."""
    return sum(len(text) for text in x)


def alpha_count(x):
    """Count the digits and lowercase a-z letters over all strings in x."""
    allowed = "0123456789" + "abcdefghijklmnopqrstuvwxyz"
    return sum(1 for text in x for ch in text if ch in allowed)


def create_y(x):
    """Split every string in x into runs of digits / lowercase letters."""
    allowed = "0123456789" + "abcdefghijklmnopqrstuvwxyz"
    tokens = []
    for text in x:
        cleaned = "".join(ch if ch in allowed else " " for ch in text)
        tokens.extend(cleaned.split())
    return tokens


def Fhash(w, M):
    """Return sum(37**i * ord(w[i])) modulo M."""
    return sum((37 ** idx) * ord(ch) for idx, ch in enumerate(w)) % M


def BOW(x, d, M):
    """Build the bag of words for word list x.

    With d == "y" the result is [[hash, count], ...] for every bucket in
    range(M) that is hit; otherwise [[word, count], ...] in first-seen order.
    """
    result = []
    if d == "y":
        hashed = [Fhash(word, M) for word in x]
        for bucket in range(M):
            hits = hashed.count(bucket)
            if hits != 0:
                result.append([bucket, hits])
        return result
    seen = []
    for word in x:
        if word not in seen:
            seen.append(word)
    for word in seen:
        result.append([word, x.count(word)])
    return result


file_name = input("File name = ")
fhash = input("Use feature hashing ? (y,Y,n,N) ").lower()
while fhash not in ['y', 'n']:
    print("Try again.")
    fhash = input("Use feature hashing ? (y,Y,n,N) ").lower()
M = int(input("M = ")) if fhash == "y" else 1

# Stop words: lower-cased, stripped lines split on whitespace.
fn = open("stopwords.txt", "r")
stopwords = [row.lower().strip() for row in fn]
lsw = [piece for row in stopwords for piece in row.split()]

# Input text: one lower-cased, stripped string per line.
file = open(file_name, "r")
words = [row.lower().strip() for row in file]
lw = [piece for row in words for piece in row.split()]

y = create_y(lw)
lnsw = [token for token in y if token not in lsw]

print("-------------------")
print("char count =", cha_count(words))
print("alphanumeric count =", alpha_count(words))
print("line count =", len(words))
print("word count =", len(create_y(words)))
print("BoW =", sorted(BOW(lnsw, fhash, M)))
fn.close()
file.close()
# 6330216721 (30.00) 71 (2021-03-22 23:58)
debug_assertion = False
run = True


def debug_assert(exp):
    """Assert exp (or exp() when it is callable) if debug_assertion is set.

    :type exp: bool | () -> bool
    """
    if debug_assertion:
        assert exp() if callable(exp) else exp


def is_lower_alnum(w: str, exclude_empty=True):
    """Return True when w consists only of a-z / 0-9 characters.

    An empty w is rejected unless exclude_empty is false.
    """
    if exclude_empty and w == '':
        return False
    for c in w:
        if not ('a' <= c <= 'z' or '0' <= c <= '9'):
            return False
    return True


def fhash(w: str, m: int, g=37):
    """Return sum(ord(w[n]) * g**n) modulo m, using modular powers."""
    debug_assert(lambda: is_lower_alnum(w))
    total = 0
    for n, c in enumerate(w):
        total += ord(c) * pow(g, n, m)
    return total % m


def tests():
    """Self-checks; they only execute when debug_assertion is enabled."""
    hash_cases = [
        ('shane', 3), ('football', 3), ('team', 3), ('likes', 0),
        ('big', 2), ('fan', 1), ('arsenal', 2),
    ]
    for word, expected in hash_cases:
        debug_assert(lambda word=word, expected=expected: fhash(word, 4) == expected)
    split_cases = [
        ("", []),
        ("a", ['a']),
        ("ab c", ['ab', 'c']),
        ("ab cdef1 g2 3 4", ['ab', 'cdef1', 'g2', '3', '4']),
        ('Abc:a18 ("Okay")', ['Abc', 'a18', 'Okay']),
    ]
    for text, expected in split_cases:
        debug_assert(lambda text=text, expected=expected: word_split(text) == expected)


def handle_input():
    """Ask for the file name and mode; return (file_name, m or None)."""
    file_name = input("File name = ")
    should_feature_hash = None
    while should_feature_hash is None:
        answer = input("Use feature hashing ? (y,Y,n,N) ").lower()
        if answer == "y":
            should_feature_hash = True
        elif answer == "n":
            should_feature_hash = False
        else:
            print("Try again.")
    m = int(input("M = ")) if should_feature_hash else None
    return file_name, m


def read_stopwords():
    """Return the lower-cased whitespace-separated words of stopwords.txt."""
    stopwords = []
    with open("stopwords.txt", "r") as file:
        for line in file:
            stopwords.extend(token.lower() for token in line.split())
    return stopwords


def word_split(string: str):
    """Split string into maximal runs of alphanumeric characters."""
    return "".join(c if c.isalnum() else " " for c in string).split()


def process(file_name: str, m, stopwords):
    """Scan file_name and return (chars, alnums, lines, words, bow).

    bow is a list of [key, count] in first-seen order, where key is the
    lower-cased word, or fhash(word, m) when m is not None.

    :type m: int | None
    :type stopwords: list[str]
    """
    line_count = char_count = alphanum_count = word_count = 0
    counts = {}  # insertion-ordered: key -> occurrences
    with open(file_name, "r") as file:
        for line in file:
            line_count += 1
            char_count += len(line.replace('\n', ''))
            alphanum_count += sum(c.isalnum() for c in line)
            words = word_split(line)
            word_count += len(words)
            for raw in words:
                word = raw.lower()
                if word in stopwords:
                    continue
                key = word if m is None else fhash(word, m)
                counts[key] = counts.get(key, 0) + 1
    bow = [[key, total] for key, total in counts.items()]
    return char_count, alphanum_count, line_count, word_count, bow


def print_info(char_count, alphanum_count, line_count, word_count, bow):
    """Print the four counters and the BoW ordered by descending count."""
    def negative_count(pair):
        return -pair[1]

    bow.sort(key=negative_count)
    template = "\n".join([
        "char count = {}",
        "alphanumeric count = {}",
        "line count = {}",
        "word count = {}",
        "BoW = {}",
    ])
    print(template.format(char_count, alphanum_count, line_count, word_count, bow))


if debug_assertion:
    tests()

if run:
    file_name, m = handle_input()
    stopwords = read_stopwords()
    print("-------------------")
    info = process(file_name, m, stopwords)
    print_info(*info)
# 6330217321 (30.00) 72 (2021-03-22 21:25)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.

# Stop words consulted by bow() and bhash(); filled in by main().
stopword = []


def fhash(w, M):
    """Return sum(ord(w[c]) * 37**c) modulo M."""
    s = 0
    for c in range(len(w)):
        s += ord(w[c]) * 37 ** c
    return s % M


def _tokens(text):
    """Return the lower-cased runs of a-z / 0-9 found in text."""
    cleaned = "".join(
        e if ("0" <= e <= "9" or "a" <= e <= "z") else " "
        for e in text.lower()
    )
    return cleaned.split()


def _tally(keys):
    """Return [[key, occurrences], ...] sorted by key."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


def bow(w):
    """Return the sorted [[word, count], ...] of text w without stop words.

    Occurrences are counted per token.  The earlier version used
    str.count on the whole text, which also counted a word inside longer
    words ("he" inside "the").
    """
    return _tally(token for token in _tokens(w) if token not in stopword)


def bhash(w, M):
    """Return the sorted [[hash, count], ...] of text w without stop words."""
    return _tally(fhash(token, M) for token in _tokens(w) if token not in stopword)


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    print("File name = ", end="")
    file_name = input()

    with open("stopwords.txt", "r") as stop:
        stopword[:] = stop.read().lower().split()
    with open(file_name, "r") as fn:
        lines = fn.readlines()

    char = sum(1 for line in lines for e in line if e != "\n")
    al = sum(
        1
        for line in lines
        for e in line.lower()
        if "0" <= e <= "9" or "a" <= e <= "z"
    )
    word = sum(len(_tokens(line)) for line in lines)
    alll = " ".join(piece for line in lines for piece in line.lower().split())

    print("Use feature hashing ? (y,Y,n,N) ", end="")
    f = input()
    # Exact match only: `f in "y Y n N"` also accepted "", " " and "y Y".
    while f not in ("y", "Y", "n", "N"):
        print("Try again.")
        print("Use feature hashing ? (y,Y,n,N) ", end="")
        f = input()

    if f in ("y", "Y"):
        print("M = ", end="")
        M = int(input())
        result = bhash(alll, M)
    else:
        result = bow(alll)
    print("-------------------")
    print("char count =", char)
    print("alphanumeric count =", al)
    print("line count =", len(lines))
    print("word count =", word)
    print("BoW =", result)


if __name__ == "__main__":
    main()
# 6330219621 (12.00) 73 (2021-03-21 16:48)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def stopwords():
    """Return the whitespace-separated words of 'stopwords.txt'."""
    with open("stopwords.txt", "r") as spfile:
        return spfile.read().split()


def readfile(file_name):
    """Return the lines of file_name without their line terminators."""
    with open(file_name, "r") as file:
        # rstrip("\n") rather than strip(): surrounding blanks are characters.
        return [line.rstrip("\n") for line in file]


def cutsymbol(word_list):
    """Return the lower-cased words contained in the lines of word_list.

    A word is a run of ASCII letters/digits.  Each line is tokenised on its
    own, so the last word of a line no longer merges with the first word of
    the next one.
    """
    words = []
    for line in word_list:
        cleaned = ''.join(
            k if ('a' <= k <= 'z' or '0' <= k <= '9') else ' '
            for k in line.lower()
        )
        words.extend(cleaned.split())
    return words


def count(word_list):
    """Return the summed length of the strings in word_list."""
    return sum(len(w) for w in word_list)


def cutstop(word_list):
    """Return the words of word_list that are not stop words."""
    stops = set(stopwords())  # read the file once, not once per word
    return [w for w in word_list if w not in stops]


def fhash(word_list, M):
    """Return [sum(ord(w[n]) * 37**n) % M for every word w in word_list]."""
    hashes = []
    for word in word_list:
        total = 0
        for n, ch in enumerate(word):
            total += ord(ch) * (37 ** n)
        hashes.append(total % M)
    return hashes


def B_O_W(word_list):
    """Sort word_list in place and return [[item, count], ...].

    Returns [] for an empty list instead of raising IndexError.
    """
    word_list.sort()
    groups = []
    for item in word_list:
        if groups and groups[-1][0] == item:
            groups[-1][1] += 1
        else:
            groups.append([item, 1])
    return groups


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input("File name = ").strip()
    lines = readfile(file_name)
    words = cutsymbol(lines)
    relist = cutstop(words)
    feature = input("Use feature hashing ? (y,Y,n,N) ").strip()
    while feature not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        feature = input("Use feature hashing ? (y,Y,n,N) ").strip()
    if feature in ['y', 'Y']:
        M = int(input("M = "))
        relist = fhash(relist, M)
    print("-------------------")
    print("char count =", count(lines))
    print("alphanumeric count =", count(words))
    print("line count =", len(lines))
    print("word count =", len(words))
    print("BoW =", B_O_W(relist))


#-------------------------------------------
if __name__ == "__main__":
    main()
# 6330221821 (25.00) 74 (2021-03-18 20:46)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def stopw(wordlist):
    """Return the words of wordlist that are not listed in 'stopwords.txt'."""
    with open('stopwords.txt', 'r') as file:
        stops = set(file.read().split())
    return [e for e in wordlist if e not in stops]


def read(name):
    """Return (text, line_count) for file `name`.

    In `text` every character that is not an ASCII letter or digit
    (including the line terminator) is replaced by a blank.
    """
    alp = ('abcdefghijklmnopqrstuvwxyz'
           'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    linec = 0
    pieces = []
    with open(name, 'r') as file:
        for line in file:
            linec += 1
            pieces.append(''.join(e if e in alp else ' ' for e in line))
    return ''.join(pieces), linec


def _char_count(name):
    """Count the characters of file `name`, excluding line terminators."""
    with open(name, 'r') as file:
        return sum(len(line.rstrip('\n')) for line in file)


def counter(string, linec, charc=None):
    """Return (char_count, alphanumeric_count, word_count) for `string`.

    When charc is None the char count is estimated as
    len(string) - linec + 1, which is only right for a non-empty file whose
    last line has no terminator; pass the exact value as charc to override.
    """
    if charc is None:
        charc = len(string) - linec + 1
    a = string.split()
    alpnumc = len(''.join(a))
    wordc = len(a)
    return charc, alpnumc, wordc


def _run_lengths(items):
    """Collapse a sorted list into [[item, count], ...]; [] when empty."""
    groups = []
    for item in items:
        if groups and groups[-1][0] == item:
            groups[-1][1] += 1
        else:
            groups.append([item, 1])
    return groups


def bow(string):
    """Return the sorted [[word, count], ...] of `string` without stop words."""
    return _run_lengths(sorted(stopw(string.lower().split())))


def fhash(word, M):
    """Return sum(ord(word[e]) * 37**e) modulo M."""
    b = 0
    for e in range(len(word)):
        b += ord(word[e]) * (37 ** e)
    return b % M


def bowfhash(string, M):
    """Return the sorted [[hash, count], ...] of `string` without stop words."""
    words = stopw(string.lower().split())
    return _run_lengths(sorted(fhash(e, M) for e in words))


def main(file_name):
    """Ask for the mode, then print the statistics and the BoW of file_name."""
    y = input('Use feature hashing ? (y,Y,n,N) ')
    # Exact match only: `y in 'NnYy'` is a substring test that accepted ''.
    while y not in ('N', 'n', 'Y', 'y'):
        print('Try again.')
        y = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = y in ('Y', 'y')
    if use_hash:
        M = int(input('M = '))
    string, linec = read(file_name)
    charc, alpnumc, wordc = counter(string, linec, _char_count(file_name))
    print('-------------------')
    print('char count =', charc)
    print('alphanumeric count =', alpnumc)
    print('line count =', linec)
    print('word count =', wordc)
    if use_hash:
        print('BoW =', bowfhash(string, M))
    else:
        print('BoW =', bow(string))


if __name__ == "__main__":
    file_name = input('File name = ')
    main(file_name)
# 6330222421 (15.60) 75 (2021-03-21 11:00)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _run_lengths(items):
    """Collapse a sorted list into [[item, count], ...]; [] when empty."""
    groups = []
    for item in items:
        if groups and groups[-1][0] == item:
            groups[-1][1] += 1
        else:
            groups.append([item, 1])
    return groups


def fhash(bow, M):
    """Return sum(ord(bow[i]) * 37**i) modulo M for the word `bow`."""
    a = 0
    for i in range(len(bow)):
        a += ord(bow[i]) * (37 ** i)
    return a % M


def BoW(bow):
    """Return [[word, count], ...] for the already sorted word list `bow`.

    Returns [] for an empty list instead of raising IndexError.
    """
    return _run_lengths(bow)


def ffhash(bow, M):
    """Return the sorted [[hash, count], ...] for the word list `bow`."""
    return _run_lengths(sorted(fhash(word, M) for word in bow))


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input("File name = ")
    feature = input("Use feature hashing ? (y,Y,n,N) ")
    # Exact match only: `feature in 'Yy'` is a substring test ('' matched).
    while feature not in ('Y', 'y', 'N', 'n'):
        print("Try again.")
        feature = input("Use feature hashing ? (y,Y,n,N) ")
    check = feature in ('Y', 'y')
    if check:
        M = int(input("M = "))

    line_count = 0
    char_count = 0
    alpha_count = 0
    word = []
    with open(file_name, "r") as fin:
        for line in fin:
            line_count += 1
            text = line.rstrip('\n')  # keep blanks, drop only the terminator
            char_count += len(text)
            # Every character is inspected and both cases count as letters.
            alpha_count += sum(1 for e in text if _is_alnum(e))
            cleaned = ''.join(e.lower() if _is_alnum(e) else ' ' for e in text)
            word.extend(cleaned.split())
    words_count = len(word)

    with open('stopwords.txt', "r") as fin:
        stp = set(fin.read().split())
    bow = sorted(i for i in word if i not in stp)

    bbbb = ffhash(bow, M) if check else BoW(bow)
    print("-------------------")
    print("char count =", char_count)
    print("alphanumeric count =", alpha_count)
    print("line count =", line_count)
    print("word count =", words_count)
    print("BoW =", bbbb)


if __name__ == "__main__":
    main()
# 6330223021 (30.00) 76 (2021-03-21 18:50)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def read_stopwords():
    """Return the whitespace-separated words of 'stopwords.txt'."""
    with open('stopwords.txt', 'r') as file:
        return file.read().split()


def read_file(file_name):
    """Return the file as one lower-cased string.

    Lines are stripped and joined with blanks, and every non-alphanumeric
    character is replaced by a blank.
    """
    with open(file_name, 'r') as file:
        joined = ''.join(line.strip() + ' ' for line in file)
    return ''.join(ch if ch.isalnum() else ' ' for ch in joined).lower()


def alphanum_count(file_name):
    """Count the alphanumeric characters of the file."""
    return sum(1 for ch in read_file(file_name) if ch.isalnum())


def line_count(file_name):
    """Count the lines of the file."""
    with open(file_name, 'r') as file:
        return len(file.readlines())


def char_count(file_name):
    """Count the characters of the file, excluding line terminators."""
    with open(file_name, 'r') as file:
        return sum(len(line.rstrip('\n')) for line in file)


def main(file_name):
    """Print the separator line and the four counters for file_name."""
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alphanum_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', len(read_file(file_name).split()))


def words_nostop(file_name):
    """Return the file's words, minus stop words, joined by single blanks."""
    stops = set(read_stopwords())  # read once instead of once per word
    return ' '.join(w for w in read_file(file_name).split() if w not in stops)


def _tally(keys):
    """Return [[key, occurrences], ...] sorted by key."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


def BoW(file_name):
    """Return the sorted [[word, count], ...] of the file without stop words."""
    return _tally(words_nostop(file_name).split())


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    c = 0
    for i in range(len(w)):
        c += ord(w[i]) * 37 ** i
    return c % M


def BoW_fhash(file_name, M):
    """Return the sorted [[hash, count], ...] of the file without stop words."""
    # The word list is built once; it used to be rebuilt for every index.
    words = words_nostop(file_name).split()
    return _tally(fhash(word, M) for word in words)


def run():
    """Prompt for the file and mode, then print the report."""
    file_name = input('File name = ')
    use_fh = input('Use feature hashing ? (y,Y,n,N) ')
    # Exact match only: `use_fh in 'nNyY'` is a substring test ('' matched).
    while use_fh not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        use_fh = input('Use feature hashing ? (y,Y,n,N) ')
    if use_fh in ('Y', 'y'):
        M = int(input('M = '))
        main(file_name)
        print('BoW =', BoW_fhash(file_name, M))
    else:
        main(file_name)
        print('BoW =', BoW(file_name))


#-----------------------------------------------------
if __name__ == "__main__":
    run()
# 6330224721 (29.00) 77 (2021-03-21 21:38)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return ('0' <= ch <= '9') or ('a' <= ch <= 'z') or ('A' <= ch <= 'Z')


def split_words(line):
    """Return the lower-cased runs of ASCII letters/digits found in line."""
    return ''.join(e.lower() if _is_alnum(e) else ' ' for e in line).split()


def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo M."""
    number = 0
    for i, e in enumerate(word):
        number += ord(e) * (37 ** i)
    return number % M


def tally(keys):
    """Return [[key, occurrences], ...] sorted by key."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input("File name = ")
    method = input("Use feature hashing ? (y,Y,n,N) ")
    # Exact match only: `method in "yYnN"` is a substring test ('' matched).
    while method not in ('y', 'Y', 'n', 'N'):
        print("Try again.")
        method = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = method in ('y', 'Y')
    if use_hash:
        M = int(input("M = "))
    print("-------------------")

    with open("stopwords.txt", "r") as infile:
        stopWords = set(infile.read().split())

    charCount = 0
    alphaCount = 0
    lineCount = 0
    wordList = []
    with open(file_name, "r") as infile2:
        for line in infile2:
            lineCount += 1
            # Count what is on the line minus its terminator, so a final
            # newline (or its absence) no longer shifts the total by one.
            charCount += len(line.rstrip('\n'))
            alphaCount += sum(1 for e in line if _is_alnum(e))
            wordList.extend(split_words(line))

    kept = [word for word in wordList if word not in stopWords]
    BoW = tally(fhash(word, M) for word in kept) if use_hash else tally(kept)

    print("char count =", charCount)
    print("alphanumeric count =", alphaCount)
    print("line count =", lineCount)
    print("word count =", len(wordList))
    print("BoW =", BoW)


if __name__ == "__main__":
    main()
# 6330225321 (22.90) 78 (2021-03-22 00:28)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit ('0' included)."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def tokens(text):
    """Return the lower-cased runs of ASCII letters/digits found in text."""
    return ''.join(e.lower() if _is_alnum(e) else ' ' for e in text).split()


def tally(keys):
    """Return [[key, occurrences], ...] sorted by key.

    Occurrences are counted per item; the previous code used str.count on a
    joined string (substring matches) and compared index 0 with index -1.
    """
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    B = 0
    for i in range(len(w)):
        B += ord(w[i]) * (37 ** i)
    return B % int(M)


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input('File name = ')
    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = a in ('y', 'Y')

    with open('stopwords.txt', 'r') as x:
        stops = set(x.read().split())
    with open(file_name, 'r') as y:
        lines = y.readlines()

    words = [word for line in lines for word in tokens(line)]
    kept = [word for word in words if word not in stops]
    if use_hash:
        M = input('M = ')
        result = tally(fhash(e, M) for e in kept)
    else:
        result = tally(kept)

    print('-------------------')
    # Every line contributes, including those after a blank line.
    print('char count =', sum(len(line.rstrip('\n')) for line in lines))
    print('alphanumeric count =',
          sum(1 for line in lines for e in line if _is_alnum(e)))
    print('line count =', len(lines))
    print('word count =', len(words))
    print('BoW =', result)


if __name__ == "__main__":
    main()
# 6330226021 (30.00) 79 (2021-03-22 22:44)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo int(M)."""
    M = int(M)
    total = 0
    for i, ch in enumerate(word):
        total += ord(ch) * (37 ** i)
    return total % M


def BoW(clause):
    """Return [[word, count], ...] for the blank-separated words of clause.

    Words are ordered as strings; an empty clause yields [].
    """
    bow = []
    for word in sorted(clause.split()):
        if bow and bow[-1][0] == word:
            bow[-1][1] += 1
        else:
            bow.append([word, 1])
    return bow


def hashed_bow(words, M):
    """Return [[hash, count], ...] for words, ordered by numeric hash.

    The hashes stay integers while sorting; going through BoW() sorted their
    decimal strings, which put 10 before 2 as soon as M exceeded 10.
    """
    counts = {}
    for word in words:
        key = fhash(word, M)
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input('File name = ')
    FH = input('Use feature hashing ? (y,Y,n,N) ')
    while FH not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        FH = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = FH in ['y', 'Y']
    if use_hash:
        M = input('M = ')

    with open('stopwords.txt', 'r') as stopwords:
        list_of_stopwords = stopwords.read().split()

    char_count = 0
    alphanumeric_count = 0
    line_count = 0
    words = []
    with open(file_name, 'r') as open_file:
        for line in open_file:
            line = line.lower()
            line_count += 1
            char_count += sum(1 for e in line if e != '\n')
            alphanumeric_count += sum(
                1 for e in line if 'a' <= e <= 'z' or '0' <= e <= '9'
            )
            cleaned = ''.join(
                e if ('a' <= e <= 'z' or '0' <= e <= '9') else ' '
                for e in line
            )
            words.extend(cleaned.split())

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', len(words))

    no_stop = [e for e in words if e not in list_of_stopwords]
    if use_hash:
        bow = hashed_bow(no_stop, M)
    else:
        bow = BoW(' '.join(no_stop))
    print('BoW =', bow)


if __name__ == "__main__":
    main()
# 6330227621 (30.00) 80 (2021-03-22 17:11)
# Text statistics and a bag-of-words (optionally feature-hashed) for a file.


def tokenize(text):
    """Return the lower-cased runs of a-z / 0-9 found in text."""
    cleaned = ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in text.lower()
    )
    return cleaned.split()


def fhash(word, M):
    """Return sum(ord(word[j]) * 37**j) modulo M."""
    a = 0
    for j, ch in enumerate(word):
        a += ord(ch) * (37 ** j)
    return a % M


def tally(keys):
    """Return [[key, occurrences], ...] sorted by key.

    Only keys that occur are stored, so the cost does not depend on M (the
    hashed branch used to allocate one bucket per value in range(M)).
    """
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    print("File name = ", end='')
    file_name = input()
    print("Use feature hashing ? (y,Y,n,N) ", end='')
    ans = input().lower()
    while ans not in ['y', 'n']:
        print('Try again.')
        print("Use feature hashing ? (y,Y,n,N) ", end='')
        ans = input().lower()
    use_hash = ans == 'y'
    if use_hash:
        print('M = ', end='')
        M = int(input())
    print('-------------------')

    CHAR = 0
    ALPHA = 0
    L = 0
    x1 = []
    with open(file_name, "r") as I:
        for line in I:
            L += 1
            CHAR += len(line.rstrip('\n'))
            ALPHA += sum(
                1 for ch in line.lower()
                if 'a' <= ch <= 'z' or '0' <= ch <= '9'
            )
            x1.extend(tokenize(line))
    print('char count =', CHAR)
    print('alphanumeric count =', ALPHA)
    print('line count =', L)
    print('word count =', len(x1))

    with open('stopwords.txt', 'r') as S:
        s = set(S.read().split())
    x2 = [word for word in x1 if word not in s]
    # Both modes now print the BoW sorted by key.
    if use_hash:
        print('BoW =', tally(fhash(word, M) for word in x2))
    else:
        print('BoW =', tally(x2))


if __name__ == "__main__":
    main()
# 6330228221 (22.80) 81 (2021-03-20 22:24)
from collections import Counter

# Characters this program treats as word separators (besides whitespace).
PUNCTUATION = '.;/\\:;,()!#%"\''
_TO_SPACE = str.maketrans(PUNCTUATION, ' ' * len(PUNCTUATION))
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def word_list(w):
    """Lower-case *w*, blank out PUNCTUATION and return the resulting words."""
    return w.lower().translate(_TO_SPACE).split()


def word_list_no_stopword(w, stopword=()):
    """Return the words of *w* that are not listed in *stopword*."""
    banned = set(stopword)
    return [word for word in word_list(w) if word not in banned]


def BoW(w, stopword=()):
    """Return [[word, count], ...] in first-appearance order, stopwords removed."""
    return [[word, n] for word, n in Counter(word_list_no_stopword(w, stopword)).items()]


def BoW_fhash(w, M, stopword=()):
    """Return [[hash, count], ...] in first-appearance order of each hash."""
    hashes = (fhash(word, M) for word in word_list_no_stopword(w, stopword))
    return [[h, n] for h, n in Counter(hashes).items()]


def main():
    """Read the file, ask for the mode and print the report."""
    file_name = input('File name = ').strip()
    with open(file_name, 'r') as f:
        lines = f.readlines()
    text = "".join(lines)

    x = input('Use feature hashing ? (y,Y,n,N) ')
    # Membership in a tuple, not a string: the original substring test also
    # accepted '' and ',' and then printed nothing at all.
    while x not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ')

    with open('stopwords.txt', 'r') as stop:
        stopword = stop.read().split()

    # Exclude exactly the newline characters, whether or not the last line has one.
    char_count = sum(len(line.rstrip('\n')) for line in lines)
    # Count real alphanumerics only (the original also counted symbols like '?').
    alphanumeric_count = sum(1 for ch in text if ch in ALNUM)
    word_count = len(word_list(text))

    if x in ('y', 'Y'):
        M = int(input('M = '))
        result = BoW_fhash(text, M, stopword)
    else:
        result = BoW(text, stopword)

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', len(lines))
    print('word count =', word_count)
    print('BoW =', result)


if __name__ == "__main__":
    main()
# 6330229921 (30.00) 82 (2021-03-20 23:40)
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def countrepeat(w, words):
    """Return how many elements of *words* are equal to *w*."""
    return sum(1 for e in words if e == w)


def tally(items):
    """Return [[item, count], ...] in order of first appearance (single pass)."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def scan(lines):
    """Return (char_count, alnum_count, line_count, words) for *lines*.

    Newlines are excluded from char_count; words are lower-cased runs of
    characters from ALNUM.
    """
    char_count = alpha = line_f = 0
    words = []
    for fline in lines:
        line_f += 1
        # Strip the real newline instead of assuming "lines - 1" newlines,
        # which was off by one for files ending in '\n' and for empty files.
        char_count += len(fline.rstrip('\n'))
        cleaned = ''.join(ch if ch in ALNUM else ' ' for ch in fline.lower())
        alpha += len(cleaned) - cleaned.count(' ')
        words += cleaned.split()
    return char_count, alpha, line_f, words


def main():
    """Prompt, analyse the file and print the statistics and bag of words."""
    file_name = input('File name = ')
    ans = input('Use feature hashing ? (y,Y,n,N) ')
    while ans not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        ans = input('Use feature hashing ? (y,Y,n,N) ')

    with open('stopwords.txt', 'r') as stopwords:
        sw = set(stopwords.read().lower().split())
    with open(file_name, 'r') as file:
        char_count, alpha, line_f, w = scan(file)
    words = [e for e in w if e not in sw]

    if ans in ('N', 'n'):
        result = tally(words)
    else:
        M = int(input('M = '))
        # Buckets are unique, so sorting the pairs orders them by bucket.
        result = sorted(tally(fhash(e, M) for e in words))

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alpha)
    print('line count =', line_f)
    print('word count =', len(w))
    print('BoW =', result)


if __name__ == "__main__":
    main()
# 6330230421 (21.40) 83 (2021-03-18 21:41)
from collections import Counter

alpha = "abcdefghijklmnopqrstuvwxyz"
num = "0123456789"


def reading(file):
    """Return (text, line_count) for the open file (or iterable of lines).

    Lines are lower-cased, stripped of their newline only, and joined with
    '\\n' so that words on adjacent lines stay separate (the original glued
    lines together and dropped leading/trailing spaces).
    """
    lines = [line.rstrip("\n").lower() for line in file]
    return "\n".join(lines), len(lines)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def alpha_count(text):
    """Count the characters of *text* that are lower-case letters or digits."""
    return sum(1 for c in text if c in alpha or c in num)


def split_words(text):
    """Return the alphanumeric runs of *text*; everything else separates words."""
    return "".join(c if c in alpha or c in num else " " for c in text).split()


def w_count(text):
    """Return the number of words in *text*."""
    return len(split_words(text))


def frequency(word, f, M):
    """Count the items of *word*.

    With f in 'nN' returns [[word, count], ...] in first-appearance order;
    with f in 'yY' returns [[hash, count], ...] sorted by hash.
    """
    if f in ("y", "Y"):
        counts = Counter(fhash(w, M) for w in word)
        return [[h, counts[h]] for h in sorted(counts)]
    return [[w, n] for w, n in Counter(word).items()]


def BoW(text, f, M, stopwords=()):
    """Return the bag of words of *text* with *stopwords* removed."""
    banned = set(stopwords)
    return frequency([w for w in split_words(text) if w not in banned], f, M)


def run():
    """Interactive entry point: prompt, analyse and print the report."""
    with open("stopwords.txt", "r") as file_sw:
        stopwords = file_sw.read().split()
    with open(input("File name = "), "r") as file_name:
        text, n_line = reading(file_name)

    f = input("Use feature hashing ? (y,Y,n,N) ")
    while f not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        f = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if f in ('y', 'Y') else ""

    print('-' * 19)
    # The '\n' separators added by reading() are not characters of the text.
    print("char count =", len(text) - text.count("\n"))
    print("alphanumeric count =", alpha_count(text))
    print("line count =", n_line)
    print("word count =", w_count(text))
    print("BoW =", BoW(text, f, M, stopwords))


if __name__ == "__main__":
    run()
# 6330232721 (30.00) 84 (2021-03-22 21:02)
from collections import Counter

alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
alpha2 = alpha.lower()
alpha3 = '1234567890'
_ALNUM = alpha + alpha2 + alpha3


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def char_count(a):
    """Return the number of characters in file *a*, newlines excluded."""
    with open(a, "r") as inF:
        return sum(len(line.rstrip('\n')) for line in inF)


def alpha_count(a):
    """Return the number of ASCII letters and digits in file *a*."""
    with open(a, "r") as inF:
        return sum(1 for line in inF for ch in line if ch in _ALNUM)


def line_count(a):
    """Return the number of lines in file *a*."""
    with open(a, "r") as inF:
        return sum(1 for _ in inF)


def cutLetM(a):
    """Return the lower-cased alphanumeric words of file *a*."""
    with open(a, "r") as inF:
        text = inF.read().lower()
    return ''.join(ch if ch in _ALNUM else ' ' for ch in text).split()


def cutLetS(e):
    """Return the words of the stopword file *e* (same rules as cutLetM)."""
    return cutLetM(e)


def word_count(a):
    """Return the number of words in file *a*."""
    return len(cutLetM(a))


def _sorted_counts(items):
    """Return [[item, count], ...] sorted by item."""
    counts = Counter(items)
    return [[item, counts[item]] for item in sorted(counts)]


def FinalBoW(c, d):
    """Return the sorted [[word, count], ...] of words in *c* not in *d*.

    Neither argument is modified (the original sorted both in place).
    """
    banned = set(d)
    return _sorted_counts(word for word in c if word not in banned)


def FinalBoWH(c, d, M):
    """Return the sorted [[hash, count], ...] of words in *c* not in *d*."""
    banned = set(d)
    return _sorted_counts(fhash(word, M) for word in c if word not in banned)


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    a = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    c = cutLetM(a)
    d = cutLetS('stopwords.txt')
    while b not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = b in ['y', 'Y']
    if use_hash:
        M = int(input('M = '))
    print('-------------------')
    print('char count =', char_count(a))
    print('alphanumeric count =', alpha_count(a))
    print('line count =', line_count(a))
    print('word count =', len(c))
    print('BoW =', FinalBoWH(c, d, M) if use_hash else FinalBoW(c, d))


if __name__ == "__main__":
    main()
# 6330233321 (21.85) 85 (2021-03-22 17:19)
from collections import Counter
from itertools import groupby


def flash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def split_words(text):
    """Return the maximal alphanumeric runs of *text*.

    The final run is included even when the text ends with an alphanumeric
    character (the original loop never flushed it).
    """
    return [''.join(run) for is_word, run in groupby(text, key=str.isalnum) if is_word]


def count_sorted(items):
    """Return [[item, count], ...] sorted by item; every distinct item is kept."""
    counts = Counter(items)
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input('File name = ')
    with open(file_name, 'r') as file:
        lines = file.readlines()
    with open('stopwords.txt', 'r') as s:
        stopword = set(s.read().split())

    while True:
        usefh = input('Use feature hashing ? (y,Y,n,N) ')
        if usefh in ('y', 'Y', 'n', 'N'):
            break
        print('Try again.')
    fh = usefh in ('y', 'Y')
    if fh:
        M = int(input('M = '))
    print('-------------------')

    # Keep the newlines while tokenising so words on adjacent lines stay apart.
    text = ''.join(lines).lower()
    words = split_words(text)
    print('char count = ' + str(sum(len(e.rstrip('\n')) for e in lines)))
    print('alphanumeric count = ' + str(sum(1 for c in text if c.isalnum())))
    print('line count = ' + str(len(lines)))
    print('word count = ' + str(len(words)))

    kept = [c for c in words if c not in stopword]
    items = [flash(c, M) for c in kept] if fh else kept
    print('BoW = ' + str(count_sorted(items)))


if __name__ == "__main__":
    main()
# 6330234021 (9.87) 86 (2021-03-22 22:03)
from collections import Counter

ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def _words(text):
    """Return the lower-cased alphanumeric runs of *text*."""
    return ''.join(ch if ch in ALNUM else ' ' for ch in text.lower()).split()


def _read(path):
    """Return the full contents of *path*."""
    with open(path) as fn:
        return fn.read()


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    return sum(1 for e in _read(file_name) if e != '\n')


def alphanumeric_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for e in _read(file_name).lower() if e in ALNUM)


def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name) as fn:
        return sum(1 for _ in fn)


def word_count(file_name):
    """Return the number of alphanumeric words in the file."""
    return len(_words(_read(file_name)))


def BoW(file_name, stopwords):
    """Return the sorted [[word, count], ...] of the file, stopwords removed.

    Whole words are counted; the original searched for each word as a
    substring, so 'a' was also counted inside 'banana'.
    """
    banned = set(_read(stopwords).lower().split())
    counts = Counter(w for w in _words(_read(file_name)) if w not in banned)
    return [[word, counts[word]] for word in sorted(counts)]


def feature_harshing(l, M):
    """Fold the bag *l* ([[word, count], ...]) into hash buckets modulo *M*.

    Returns [[bucket, count], ...] for the non-empty buckets in ascending order.
    """
    totals = Counter()
    for word, n in l:
        totals[sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M] += n
    return [[bucket, totals[bucket]] for bucket in sorted(totals)]


def display(file_name, stopwords, x, M=None):
    """Print the report; x is '1' for the hashed BoW (needs M), '0' otherwise."""
    print("-------------------")
    print("char count =", char_count(file_name))
    print("alphanumeric count =", alphanumeric_count(file_name))
    print("line count =", line_count(file_name))
    print("word count =", word_count(file_name))
    if x == '0':
        print('BoW =', BoW(file_name, stopwords))
    if x == '1':
        print('BoW =', feature_harshing(BoW(file_name, stopwords), M))


def main():
    """Prompt for the file and mode and show the report."""
    file_name = input("File name = ")
    x = input("Use feature hashing ? (y,Y,n,N) ")
    while x not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        x = input("Use feature hashing ? (y,Y,n,N) ")
    if x in ('y', 'Y'):
        M = int(input("M = "))
        display(file_name, 'stopwords.txt', '1', M)
    else:
        display(file_name, 'stopwords.txt', '0')


if __name__ == "__main__":
    main()
# 6330235621 (26.00) 87 (2021-03-22 18:15)
from collections import Counter


def flash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def fea_hash():
    """Ask once whether to hash; return True, False, or None on a bad answer."""
    x = input("Use feature hashing ? (y,Y,n,N) ")
    if x in ("Y", "y"):
        return True
    if x in ("N", "n"):
        return False
    print("Try again.")
    return None


def _is_alnum(ch):
    """Tell whether *ch* is an ASCII letter or digit."""
    return "A" <= ch <= "Z" or "a" <= ch <= "z" or "0" <= ch <= "9"


def remove(x):
    """Return *x* with every non-alphanumeric character replaced by a space."""
    return "".join(e if _is_alnum(e) else " " for e in x)


def count_sorted(items):
    """Return [[item, count], ...] sorted by item ([] for no items)."""
    counts = Counter(items)
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input("File name = ")
    xx = fea_hash()
    while xx is None:
        xx = fea_hash()
    if xx:
        M = int(input("M = "))
    print("-" * 19)

    with open("stopwords.txt", "r") as fst:
        stopwords = set(fst.read().lower().split())

    char = alph_num = linecount = word = 0
    wordlist = []
    with open(file_name, "r") as fin:
        for line in fin:
            linecount += 1
            char += len(line) - 1 if "\n" in line else len(line)
            alph_num += sum(1 for ch in line if _is_alnum(ch))
            found = remove(line).split()
            word += len(found)
            wordlist += [w.lower() for w in found if w.lower() not in stopwords]

    print("char count =", char)
    print("alphanumeric count =", alph_num)
    print("line count =", linecount)
    print("word count =", word)
    # count_sorted copes with an empty list; the original indexed
    # wordlist[-1] and crashed when every word was a stopword.
    items = [flash(k, M) for k in wordlist] if xx else wordlist
    print("BoW =", count_sorted(items))


if __name__ == "__main__":
    main()
# 6330236221 (25.15) 88 (2021-03-21 23:32)
import re
from collections import Counter

_WORD = re.compile(r"[A-Za-z0-9]+")


def flash(w, z):
    """Return sum(ord(w[i]) * 37**i) modulo z."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % z


def split_alnum(text):
    """Return the runs of ASCII letters/digits in *text*, case preserved."""
    return _WORD.findall(text)


def bag(words, stopwords, M=None):
    """Return the bag of the lower-cased *words* that are not stopwords.

    Without M: [[word, count], ...] in first-appearance order.
    With M: [[hash, count], ...] sorted by hash.
    Words are lower-cased before counting and hashing, so 'The' and 'the'
    fall in the same entry (the original kept them apart).
    """
    kept = [w.lower() for w in words if w.lower() not in stopwords]
    if M is None:
        return [[w, n] for w, n in Counter(kept).items()]
    counts = Counter(flash(w, M) for w in kept)
    return [[h, counts[h]] for h in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    with open(input("File name = "), "r") as file_name:
        lines = file_name.readlines()

    use_fea = input("Use feature hashing ? (y,Y,n,N) ")
    while use_fea not in ["y", "Y", "n", "N"]:
        print("Try again.")
        use_fea = input("Use feature hashing ? (y,Y,n,N) ")
    # A dedicated None marks "no hashing"; the original reused M == 0 as the flag.
    M = int(input("M = ")) if use_fea in ["y", "Y"] else None

    with open("stopwords.txt", "r") as stop_word:
        G1 = {w.lower() for w in split_alnum(stop_word.read())}

    character_count = alpha_count = word_count = 0
    C = []
    for line in lines:
        character_count += len(line) - 1 if "\n" in line else len(line)
        found = split_alnum(line)
        alpha_count += sum(len(w) for w in found)
        word_count += len(found)
        C += found

    print("-------------------")
    print("char count = " + str(character_count))
    print("alphanumeric count = " + str(alpha_count))
    print("line count = " + str(len(lines)))
    print("word count = " + str(word_count))
    print("BoW = " + str(bag(C, G1, M)))


if __name__ == "__main__":
    main()
# 6330238521 (28.40) 89 (2021-03-22 17:03)
from collections import Counter
from itertools import groupby


def fhash(w, M):
    """Return sum(ord(w[i]) * G**i) modulo M with G = 37."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def alnum_words(text):
    """Return the maximal runs of str.isalnum() characters in *text*."""
    return ["".join(run) for is_word, run in groupby(text, key=str.isalnum) if is_word]


def count_sorted(items):
    """Return [[item, count], ...] sorted by item."""
    counts = Counter(items)
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input("File name = ")
    x = input("Use feature hashing ? (y,Y,n,N) ").lower()
    while x != "y" and x != "n":
        print("Try again.")
        x = input("Use feature hashing ? (y,Y,n,N) ").lower()

    with open("stopwords.txt", "r") as y:
        list_stopwords = set(y.read().split())
    if x == "y":
        M = int(input("M = "))

    # Read the file once; the original reopened it for every statistic.
    with open(file_name, "r") as openf:
        lines = openf.readlines()

    print("-------------------")
    # Only the newline is excluded, and every line (blank or not) is counted.
    print("char count =", sum(len(line.rstrip("\n")) for line in lines))
    print("alphanumeric count =", sum(1 for line in lines for c in line if c.isalnum()))
    print("line count =", len(lines))
    print("word count =", sum(len(alnum_words(line)) for line in lines))

    kept = [w for line in lines for w in alnum_words(line.lower())
            if w not in list_stopwords]
    items = [fhash(w, M) for w in kept] if x == "y" else kept
    print("BoW =", count_sorted(items))


if __name__ == "__main__":
    main()
# 6330239121 (18.39) 90 (2021-03-21 23:18)
from collections import Counter

ALNUM = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"


def replace_punctuation(s):
    """Return *s* with each of \"'/\\,.:;()[]{} replaced by a space."""
    return "".join(" " if e in "\"\'/\\,.:;()[]{}" else e for e in s)


def fhash(w, x):
    """Return sum(ord(w[i]) * 37**i) modulo int(x)."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(x)


def count_sorted(items):
    """Return [[item, count], ...] sorted by item."""
    counts = Counter(items)
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input("File name = ")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
    # Tuple membership: the original `fh not in "yYnN"` accepted '' and 'yY'.
    while fh not in ("y", "Y", "n", "N"):
        print("Try again.")
        fh = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = fh in ("y", "Y")
    if use_hash:
        M = int(input("M = "))
    # Printed on every path; the original skipped it after a retry ending in 'n'.
    print("-------------------")

    with open("stopwords.txt", "r") as stop:
        stopwords = set(stop.read().split())
    with open(file_name, "r") as main_file:
        lines = main_file.readlines()

    # Strip the real newline rather than assuming one per line.
    print("char count =", sum(len(line.rstrip("\n")) for line in lines))
    print("alphanumeric count =", sum(1 for line in lines for ch in line if ch in ALNUM))
    print("line count =", len(lines))
    print("word count =", sum(len(replace_punctuation(line).split()) for line in lines))

    kept = [e for line in lines
            for e in replace_punctuation(line.lower()).split()
            if e not in stopwords]
    items = [fhash(e, M) for e in kept] if use_hash else kept
    print("BoW =", count_sorted(items))


if __name__ == "__main__":
    main()
# 6330240721 (24.00) 91 (2021-03-22 22:28)
import re
from collections import Counter

_WORD = re.compile(r"[A-Za-z0-9]+")


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def split_alnum(text):
    """Return the lower-cased runs of ASCII letters/digits found in *text*."""
    return [w.lower() for w in _WORD.findall(text)]


def count_sorted(items):
    """Return [[item, count], ...] sorted by item ([] when there are none)."""
    counts = Counter(items)
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input('File name = ')
    while True:
        answer = input('Use feature hashing ? (y,Y,n,N) ')
        if answer in ('Y', 'y'):
            M = int(input('M = '))
            yes = True
            break
        if answer in ('N', 'n'):
            yes = False
            break
        print('Try again.')

    with open(file_name, "r") as txt:
        line_text = txt.readlines()
    with open("stopwords.txt", "r") as stop_file:
        stop = set(stop_file.read().split())

    words = split_alnum(''.join(line_text))
    kept = [w for w in words if w not in stop]
    # One pass over the kept words; the original rebuilt the hash list once per
    # word and hit a NameError when no word survived the stopword filter.
    BoW = count_sorted([fhash(w, M) for w in kept] if yes else kept)

    print('-------------------')
    print('char count = ' + str(sum(len(e.strip('\n')) for e in line_text)))
    print('alphanumeric count = ' + str(sum(len(w) for w in words)))
    print('line count = ' + str(len(line_text)))
    print('word count = ' + str(len(words)))
    print('BoW = ' + str(BoW))


if __name__ == "__main__":
    main()
# 6330241321 (22.99) 92 (2021-03-22 23:59)
import re
from collections import Counter

_WORD = re.compile(r"[0-9A-Za-z]+")


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def split_words(line):
    """Return the runs of ASCII letters/digits in *line*.

    A word that ends the line without a following separator is included
    (the original only emitted a word when it met a non-alphanumeric).
    """
    return _WORD.findall(line)


def bag(words, stopwords, M=None):
    """Return the bag of the lower-cased *words* that are not in *stopwords*.

    Without M: [[word, count], ...] in first-appearance order.
    With M: [[hash, count], ...] sorted by hash.
    """
    kept = [c.lower() for c in words if c.lower() not in stopwords]
    if M is None:
        return [[c, n] for c, n in Counter(kept).items()]
    counts = Counter(fhash(c, M) for c in kept)
    return [[h, counts[h]] for h in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input("File name = ")
    ans = input("Use feature hashing ? (y,Y,n,N) ")
    while ans not in ("n", "N", "y", "Y"):
        print("Try again.")
        ans = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if ans.lower() == "y" else None
    print("-" * 19)

    l1 = l2 = lineCount = 0
    words = []
    with open(file_name, "r") as file_words:
        for line in file_words:
            lineCount += 1
            l1 += len(line.rstrip("\n"))
            found = split_words(line)
            l2 += sum(len(w) for w in found)
            words += found

    with open("stopwords.txt", "r") as file_stopwords:
        stopwords = set(file_stopwords.read().lower().split())

    print("char count =", l1)
    print("alphanumeric count =", l2)
    print("line count =", lineCount)
    print("word count =", len(words))
    print("BoW =", bag(words, stopwords, M))


if __name__ == "__main__":
    main()
# 6330242021 (0.00) 93 (2021-03-22 23:50)
import re
from collections import Counter

_WORD = re.compile(r"[a-z0-9]+")


def fhash(w, M):
    """Return sum(ord(w[i]) * G**i) modulo M with G = 37."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def count_text(lines):
    """Return (char_count, alnum_count, line_count, words) for *lines*.

    The counters accumulate over the whole file (the original reset them on
    every line); words are the lower-cased runs of letters and digits.
    """
    ch = alp = lc = 0
    words = []
    for line in lines:
        lc += 1
        ch += len(line) - 1 if line.endswith('\n') else len(line)
        found = _WORD.findall(line.lower())
        alp += sum(len(w) for w in found)
        words += found
    return ch, alp, lc, words


def word_bag(words):
    """Return [[word, count], ...] in first-appearance order."""
    return [[w, n] for w, n in Counter(words).items()]


def hash_bag(words, M):
    """Return [[bucket, count], ...] for the non-empty buckets, ascending."""
    counts = Counter(fhash(w, M) for w in words)
    return [[b, counts[b]] for b in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = fh in ('y', 'Y')
    if use_hash:
        # Ask for M with a prompt, and before the separator line.
        M = int(input('M = '))
    print('-------------------')

    with open(file_name, 'r') as file:
        ch, alp, lc, w1 = count_text(file)
    with open('stopwords.txt', 'r') as s:
        ss = set(s.read().split())
    w3 = [w for w in w1 if w not in ss]

    print('char count =', ch)
    print('alphanumeric count =', alp)
    print('line count =', lc)
    print('word count =', len(w1))
    print('BoW =', hash_bag(w3, M) if use_hash else word_bag(w3))


if __name__ == "__main__":
    main()
# 6330243621 (19.05) 94 (2021-03-18 23:40)
from itertools import groupby

ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def osw(b1, b2):
    """Return the items of *b1* that do not occur in *b2*, order preserved."""
    banned = set(b2)
    return [i for i in b1 if i not in banned]


def fullword(cc):
    """Return *cc* with listed punctuation turned into spaces and newlines removed."""
    out = []
    for ch in cc:
        if ch in '\'\"\\/-_,.:;()<>':
            out.append(' ')
        elif ch != '\n':
            out.append(ch)
    return ''.join(out)


def texttoword(fs):
    """Return (words, line_count, alnum_count, char_count) for the lines of *fs*.

    words are lower-cased and split on whitespace after fullword();
    alnum_count counts ASCII letters and digits only (the original summed word
    lengths, so symbols such as '?' were included); char_count excludes newlines.
    """
    sw = []
    c = d = f = 0
    for line in fs:
        cleaned = fullword(line)
        f += len(cleaned)
        d += sum(1 for ch in line if ch in ALNUM)
        sw += cleaned.lower().split()
        c += 1
    return sw, c, d, f


def bow(fh):
    """Return [[item, count], ...] sorted by item; [] for empty input.

    *fh* is left untouched (the original sorted it in place and crashed on
    an empty list).
    """
    return [[key, len(list(run))] for key, run in groupby(sorted(fh))]


def fehas(y, stopwords=(), use_hash=False, M=None):
    """Return the BoW of the words *y* without *stopwords*, hashed when asked."""
    ss = osw(y, stopwords)
    return bow([fhash(i, M) for i in ss] if use_hash else ss)


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input('File name = ')
    fehash = input('Use feature hashing ? (y,Y,n,N) ')
    while fehash not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        fehash = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = fehash in ['y', 'Y']
    M = int(input('M = ')) if use_hash else None
    print('-' * 19)

    with open('stopwords.txt', 'r') as fs:
        sw = texttoword(fs)[0]
    with open(file_name, 'r') as fn:
        feh, c2, d2, f2 = texttoword(fn)

    print('char count =', f2)
    print('alphanumeric count =', d2)
    print('line count =', c2)
    print('word count =', len(feh))
    print('BoW =', fehas(feh, sw, use_hash, M))


if __name__ == "__main__":
    main()
# 6330245921 (23.80) 95 (2021-03-22 20:17)
# Prog-08: Bag-of-words
# 6330245921 (23.80) Teetat Karuhawanit
from collections import Counter

ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def somchai(c):
    """Return the lower-cased text of file *c* with whitespace runs collapsed."""
    with open(c, 'r') as v:
        return ' '.join(v.read().lower().split())


def tokens(text):
    """Return the alphanumeric runs of the (already lower-cased) *text*."""
    return ''.join(ch if ch in ALNUM else ' ' for ch in text).split()


def remove_stopwords(words, stopwords):
    """Return the *words* that are not whole-word members of *stopwords*."""
    banned = set(stopwords)
    return [w for w in words if w not in banned]


def paisan(file_name):
    """Return the number of ASCII letters and digits in the file."""
    with open(file_name) as u:
        return sum(1 for n in u.read().lower() if n in ALNUM)


def chate(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name, 'r') as f:
        return sum(len(line.rstrip('\n')) for line in f)


def thanarat(file_name):
    """Return the number of lines in the file."""
    with open(file_name, 'r') as f:
        return len(f.readlines())


def pannarai(file_name):
    """Return the number of alphanumeric words in the file."""
    return len(tokens(somchai(file_name)))


def sukree(file_name):
    """Return the file's words with the stopwords of stopwords.txt removed.

    The stopword text is split into words first; the original tested each
    word against the whole text, so 'he' was dropped because of 'the'.
    """
    return remove_stopwords(tokens(somchai(file_name)),
                            somchai('stopwords.txt').split())


def fhash(W, M):
    """Return sum(ord(W[i]) * G**i) modulo M with G = 37."""
    G = 37
    return sum(ord(ch) * G ** s for s, ch in enumerate(W)) % M


def kirati(file_name):
    """Return [[word, count], ...] for the file in first-appearance order."""
    return [[w, n] for w, n in Counter(sukree(file_name)).items()]


def parngod(M, file_name):
    """Return [[hash, count], ...] for the file, sorted by hash."""
    counts = Counter(fhash(w, M) for w in sukree(file_name))
    return [[h, counts[h]] for h in sorted(counts)]


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input('File name = ')
    x = input('Use feature hashing ? (y,Y,n,N) ')
    while x not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = x in ('Y', 'y')
    if use_hash:
        M = int(input('M = '))
    print('-------------------')
    print('char count =', chate(file_name))
    print('alphanumeric count =', paisan(file_name))
    print('line count =', thanarat(file_name))
    print('word count =', pannarai(file_name))
    print('BoW =', parngod(M, file_name) if use_hash else kirati(file_name))


if __name__ == "__main__":
    main()
# 6330246521 (30.00) 96 (2021-03-21 18:26)
import re
from collections import Counter


def main():
    """Prompt for the file and mode, then print the statistics and BoW."""
    file_name = input('File name = ')
    x = read_file(file_name)
    y = read_file('stopwords.txt')
    Bo = BoW(cut_stopwords(x[0], y[0]))

    fh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while fh != 'y' and fh != 'n':
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    result = BoW_to_fhash(Bo, int(input('M = '))) if fh == 'y' else sorted(Bo)

    print('-------------------')
    print('char count =', x[1])
    print('alphanumeric count =', x[2])
    print('line count =', x[3])
    print('word count =', x[4])
    print('BoW =', result)


def read_file(file_name):
    """Return [words, char_count, alnum_count, line_count, word_count].

    words are the lower-cased alphanumeric runs of the file; char_count
    excludes newlines; alnum_count is the total length of the words.
    """
    list_of_words = []
    char_count = 0
    line_count = 0
    with open(file_name, 'r') as fn:
        for e in fn:
            char_count += len(e.rstrip('\n'))
            list_of_words += re.findall(r'[a-z0-9]+', e.lower())
            line_count += 1
    alph_count = sum(len(k) for k in list_of_words)
    return [list_of_words, char_count, alph_count, line_count, len(list_of_words)]


def cut_stopwords(list_of_words, list_of_stopwords):
    """Return the words that are not stopwords, order preserved."""
    banned = set(list_of_stopwords)
    return [e for e in list_of_words if e not in banned]


def BoW(list_of_noStopwords):
    """Return [[word, count], ...] in first-appearance order."""
    return [[word, n] for word, n in Counter(list_of_noStopwords).items()]


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def BoW_to_fhash(BoW, M):
    """Fold a word bag into [[hash, count], ...] sorted by hash.

    The input bag is left untouched (the original overwrote every word in it
    with its hash and re-sorted the caller's list).
    """
    totals = Counter()
    for word, n in BoW:
        totals[fhash(word, M)] += n
    return [[h, totals[h]] for h in sorted(totals)]


# -----------------------------------------------
if __name__ == "__main__":
    main()
# 6330247121 (30.00) 97 (2021-03-22 15:29)
ALNUM = "abcdefghijklmnopqrstuvwxyz0123456789"


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def read_lines(path):
    """Return the lower-cased lines of path with their newline removed."""
    # `with` guarantees the handle is closed (the files used to stay open).
    with open(path) as f:
        return [line.strip("\n").lower() for line in f]


def split_words(line):
    """Split a lower-cased line into words; non-alphanumerics are separators."""
    return "".join(ch if ch in ALNUM else " " for ch in line).split()


def bag_of_words(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    filename = input("File name = ")
    yn = input("Use feature hashing ? (y,Y,n,N) ").lower()
    while yn not in ("y", "n"):
        print("Try again.")
        yn = input("Use feature hashing ? (y,Y,n,N) ").lower()
    if yn == "y":
        M = int(input("M = "))
    print("-------------------")
    fl = read_lines(filename)
    wl = []
    for line in fl:
        wl += split_words(line)
    print("char count =", sum(len(line) for line in fl))
    # Words contain only alphanumerics, so their total length is the count.
    print("alphanumeric count =", sum(len(w) for w in wl))
    print("line count =", len(fl))
    print("word count =", len(wl))
    stop = set()
    for line in read_lines("stopwords.txt"):
        stop.update(line.split())
    wns = [w for w in wl if w not in stop]
    if yn == "y":
        wns = [fhash(w, M) for w in wns]
    print("BoW =", bag_of_words(wns))


if __name__ == "__main__":
    main()
# 6330248821 (24.80) 98 (2021-03-22 20:51)
alp = 'abcdefghijklmnopqrstuvwxyz'
num = '0123456789'


def nFhash(w):
    """Return [item, count] pairs for the items of w in first-occurrence order."""
    counts = {}
    for word in w:
        counts[word] = counts.get(word, 0) + 1
    return [[word, n] for word, n in counts.items()]


def yFhase(w, m):
    """Return [hash, count] pairs for the words of w hashed modulo m.

    m may be an int or a numeric string; pairs are in first-occurrence order.
    """
    m = int(m)
    hashed = []
    for word in w:
        f = sum(ord(ch) * (37 ** i) for i, ch in enumerate(word))
        hashed.append(f % m)
    return nFhash(hashed)


def split_words(line):
    """Return the lower-cased words of line; non-alphanumerics separate words."""
    spaced = ''.join(c if c in alp or c in num else ' ' for c in line.lower())
    return spaced.split()


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ')
    x = input('Use feature hashing ? (y,Y,n,N) ').lower()
    # Compare against the two valid answers; a substring test accepted ''.
    while x not in ('y', 'n'):
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ').lower()
    M = ''
    if x == 'y':
        M = input('M = ')

    with open('stopwords.txt', 'r') as sFile:
        stop_words = set(sFile.read().lower().split())

    charCount = 0
    lineCount = 0
    clearedWords = []
    with open(file_name, 'r') as wFile:
        for line in wFile:
            lineCount += 1
            # Only the newline is excluded; surrounding blanks still count.
            charCount += len(line.rstrip('\n'))
            clearedWords += split_words(line)
    alpCount = sum(len(word) for word in clearedWords)
    wordCount = len(clearedWords)

    print('-------------------')
    print('char count =', charCount)
    print('alphanumeric count =', alpCount)
    print('line count =', lineCount)
    print('word count =', wordCount)
    deletedWord = [w for w in clearedWords if w not in stop_words]
    if x == 'y':
        print('BoW =', sorted(yFhase(deletedWord, M)))
    else:
        print('BoW =', sorted(nFhash(deletedWord)))


if __name__ == '__main__':
    main()
# 6330249421 (26.00) 99 (2021-03-21 22:15)
def is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def fhash(w, M):
    """Return sum(ord(w[k]) * 37**k) modulo M."""
    return sum(37 ** k * ord(c) for k, c in enumerate(w)) % M


def count_items(items):
    """Return sorted [item, count] pairs; an empty input gives an empty list."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ')
    want = input('Use feature hashing ? (y,Y,n,N) ')
    while want not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        want = input('Use feature hashing ? (y,Y,n,N) ')
    Ball = want in ['y', 'Y']  # True when feature hashing was requested
    if Ball:
        M = int(input('M = '))

    with open('stopwords.txt', 'r') as coke:
        stopwords = set(coke.read().lower().split())
    print('-------------------')

    char = 0
    alpha = 0
    lines = 0
    pieces = []
    with open(file_name, 'r') as fn:
        for line in fn:
            lines += 1
            for c in line:
                if c != '\n':
                    char += 1
                if is_alnum(c):
                    alpha += 1
                    pieces.append(c)
                else:
                    pieces.append(' ')
    print('char count = ' + str(char))
    print('alphanumeric count = ' + str(alpha))
    print('line count = ' + str(lines))

    # Text with non-alphanumerics blanked out, lower-cased, split into words.
    a = ''.join(pieces).lower().split()
    print('word count = ' + str(len(a)))

    # Words left after the stopwords are removed.
    answer = [w for w in a if w not in stopwords]
    if Ball:
        answer = [fhash(w, M) for w in answer]
    print('BoW =', count_items(answer))


if __name__ == '__main__':
    main()
# 6330250021 (29.00) 100 (2021-03-20 11:58)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    n = sum(ord(c) * 37 ** i for i, c in enumerate(w))
    return n % M


def split_words(line):
    """Return the lower-cased words of line; non-alphanumerics separate words."""
    chars = [a if '0' <= a <= '9' or 'a' <= a <= 'z' else ' '
             for a in line.lower()]
    return ''.join(chars).split()


def count_keys(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    # A tuple membership test: `fh not in 'yn'` let '' and 'yn' through.
    while fh not in ('y', 'n'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if fh == 'y':
        M = int(input('M = '))

    with open('stopwords.txt', 'r') as stop:
        sw = set(stop.read().lower().split())

    ch = 0
    li = 0
    wordlist = []
    with open(file_name, 'r') as f:
        for line in f:
            li += 1
            # Drop only the newline so leading/trailing blanks are counted.
            ch += len(line.rstrip('\n'))
            wordlist += split_words(line)
    al = sum(len(w) for w in wordlist)
    wd = len(wordlist)

    kept = [w for w in wordlist if w not in sw]
    if fh == 'y':
        kept = [fhash(w, M) for w in kept]
    bow = count_keys(kept)

    print('-------------------')
    print('char count =', ch)
    print('alphanumeric count =', al)
    print('line count =', li)
    print('word count =', wd)
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330251621 (26.00) 101 (2021-03-22 10:50)
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def cut(K):
    """Return K with runs of equal neighbours collapsed to one element.

    The last element is appended only if it is not already in the result.
    An empty list is returned unchanged instead of raising IndexError.
    """
    finish = [cur for cur, nxt in zip(K, K[1:]) if cur != nxt]
    if K and K[-1] not in finish:
        finish.append(K[-1])
    return finish


def fhash(w, M):
    """Return sum(ord(w[n]) * 37**n) modulo int(M)."""
    f = sum(ord(c) * (37 ** n) for n, c in enumerate(w))
    return f % int(M)


def to_words(text):
    """Return the lower-cased words of text; non-alphanumerics separate words."""
    spaced = ''.join(c.lower() if c.lower() in ALNUM else ' ' for c in text)
    return spaced.split()


def count_pairs(keys):
    """Return the sorted, de-duplicated [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return cut(sorted([key, counts[key]] for key in keys))


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    while b not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = b in ['y', 'Y']
    if use_hash:
        M = int(input('M = '))

    with open(file_name, 'r') as z:
        s = z.read()
    # A final line without a trailing newline still counts as a line.
    line_count = s.count('\n') + (1 if s and not s.endswith('\n') else 0)
    alp = sum(1 for c in s if c.lower() in ALNUM)
    char_total = len(s) - s.count('\n')
    sam = to_words(s)

    print('-------------------')
    print('char count = ' + str(char_total))
    print('alphanumeric count = ' + str(alp))
    print('line count = ' + str(line_count))
    print('word count = ' + str(len(sam)))

    with open('stopwords.txt', 'r') as st:
        stop = set(to_words(st.read()))
    sam = [w for w in sam if w not in stop]
    if use_hash:
        sam = [fhash(w, M) for w in sam]
    print('BoW = ' + str(count_pairs(sam)))


if __name__ == '__main__':
    main()
# 6330252221 (13.30) 102 (2021-03-22 22:10)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    a = sum(ord(c) * 37 ** i for i, c in enumerate(w))
    return a % int(M)


def remove(t):
    """Return t with every character that is not an ASCII letter/digit blanked.

    All non-alphanumerics act as separators, not only a fixed punctuation list.
    """
    out = ''
    for e in t:
        if 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9':
            out += e
        else:
            out += ' '
    return out


def word(t):
    """Return the number of words in t."""
    return len(remove(t).split())


def char(t):
    """Return the number of characters in t."""
    return len(t)


def alphanumeric(t):
    """Return the number of ASCII letters and digits in t."""
    return sum(1 for e in t.lower()
               if e in "abcdefghijklmnopqrstuvwxyz0123456789")


def word1(t):
    """Return the lower-cased words of t."""
    return remove(t).lower().split()


def bag(keys):
    """Return the sorted [key, count] pairs for keys (empty list if none)."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file = input("File name = ")
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
    while hashing not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        hashing = input('Use feature hashing ? (y,Y,n,N) ')
    if hashing.lower() == 'y':
        M = input("M = ")
    #--------------------------
    with open('stopwords.txt', 'r') as stopword:
        stopwords = set(stopword.read().lower().split())
    #-------------------------
    char_count = 0
    line_count = 0
    word_count = 0
    alphanumeric_count = 0
    word2 = []
    with open(file, 'r') as text:
        for line in text:
            line_count += 1
            # Count the line without its newline instead of assuming that
            # exactly the last line lacks one.
            char_count += char(line.rstrip('\n'))
            alphanumeric_count += alphanumeric(line)
            word_count += word(line)
            word2 += word1(line)
    word3 = [e for e in word2 if e not in stopwords]
    #----------------------
    if hashing.lower() == 'y':
        BoW = bag([fhash(e, M) for e in word3])
    else:
        BoW = bag(word3)
    #---------------------
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330253921 (22.27) 103 (2021-03-22 23:21)
def chcount(k):
    """Return k with every newline character removed."""
    return ''.join(c for c in k if c != '\n')


def alphacount(k):
    """Return the number of ASCII letters and digits in k."""
    return sum(1 for c in k
               if 'a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9')


def habuntud(k):
    """Return the number of lines in k (a trailing newline adds no line)."""
    return k.count('\n') + (1 if k and not k.endswith('\n') else 0)


def wordcount(k):
    """Return the words of k, case preserved; non-alphanumerics separate words."""
    s = ''
    for c in k:
        if 'a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9':
            s += c
        else:
            s += ' '
    return s.split()


def load_stopwords(path='stopwords.txt'):
    """Return the set of lower-cased stopwords stored in path."""
    with open(path) as s:
        return {w.lower() for w in wordcount(s.read())}


def fhash(a, m):
    """Return sum(ord(a[i]) * 37**i) modulo int(m)."""
    d = sum(ord(c) * (37 ** i) for i, c in enumerate(a))
    return d % int(m)


def check(a):
    """Return a valid y/Y/n/N answer, re-prompting until one is entered."""
    while a not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    return a


def kunsum(k, stop=None):
    """Return the lower-cased words of k that are not stopwords.

    stop is a container of lower-cased stopwords; when omitted it is loaded
    from stopwords.txt.
    """
    if stop is None:
        stop = load_stopwords()
    lowered = (w.lower() for w in wordcount(k))
    return [w for w in lowered if w not in stop]


def _count(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def bow2(a, m, stop=None):
    """Return the sorted [hash, count] bag of words of text a, hashed modulo m."""
    return _count([fhash(w, m) for w in kunsum(a, stop)])


def bow1(h, stop=None):
    """Return the sorted [word, count] bag of words of text h."""
    return _count(kunsum(h, stop))


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    fn = input('File name = ')
    with open(fn) as f:
        k = f.read()
    fh = check(input('Use feature hashing ? (y,Y,n,N) '))
    if fh == 'n' or fh == 'N':
        m = None
    else:
        m = int(input('M = '))
    stop = load_stopwords()
    print('-' * 19)
    print('char count = ' + str(len(chcount(k))))
    print('alphanumeric count = ' + str(alphacount(k)))
    print('line count = ' + str(habuntud(k)))
    print('word count = ' + str(len(wordcount(k))))
    if m is None:
        print('BoW = ' + str(bow1(k, stop)))
    else:
        print('BoW = ' + str(bow2(k, m, stop)))


if __name__ == '__main__':
    main()
# 6330254521 (30.00) 104 (2021-03-22 23:03)
def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    compute = sum(ord(c) * (37 ** i) for i, c in enumerate(w))
    return compute % m


def _is_alnum(e):
    """Return True when e is an ASCII letter or digit."""
    return "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9"


def basic_count(file_name):
    """Return (char_count, alphanumeric_count, line_count) for file_name.

    Newline characters are skipped one by one, so the character count is
    right whether or not the file ends with a newline.
    """
    charcount = 0
    engnum_count = 0
    linecount = 0
    with open(file_name, 'r') as z:
        for y in z:
            linecount += 1
            for e in y:
                if e != '\n':
                    charcount += 1
                if _is_alnum(e):
                    engnum_count += 1
    return charcount, engnum_count, linecount


def split_word(file_name):
    """Return (words, word_count); words are lower-cased alphanumeric runs."""
    real_list = []
    with open(file_name, 'r') as file_count:
        for line_word in file_count:
            spaced = ''.join(e if _is_alnum(e) else ' '
                             for e in line_word.lower())
            real_list += spaced.split()
    return real_list, len(real_list)


def function_in_main(file_name):
    """Print the four counts and return the sorted non-stopword list."""
    charcount_true, engnum_count, linecount = basic_count(file_name)
    print("char count =", charcount_true)
    print("alphanumeric count =", engnum_count)
    print("line count =", linecount)
    list_word, num_word = split_word(file_name)
    print("word count =", num_word)
    with open("stopwords.txt", "r") as stopword_string:
        stopword_list = set(stopword_string.read().lower().split())
    return sorted(e for e in list_word if e not in stopword_list)


def _count(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input("File name = ")
    while True:
        Use_Hashing = input("Use feature hashing ? (y,Y,n,N) ")
        if Use_Hashing in ('n', 'N'):
            M = None
            break
        if Use_Hashing in ('y', 'Y'):
            M = int(input("M = "))
            break
        print("Try again.")
    print("-------------------")
    list_word_cut = function_in_main(file_name)
    if M is not None:
        list_word_cut = [fhash(e, M) for e in list_word_cut]
    # print() already separates arguments with one space.
    print("BoW =", _count(list_word_cut))


#-------------------------------------------------------------------------------------------------
if __name__ == "__main__":
    main()
# 6330255121 (10.00) 105 (2021-03-22 23:47)
def tostopwords():
    """Return the distinct lower-cased stopwords from stopwords.txt, in file order."""
    stopwordlist = []
    with open('stopwords.txt', 'r') as stopwords:
        for line in stopwords:
            for s in line.lower().split():
                if s not in stopwordlist:
                    stopwordlist.append(s)
    return stopwordlist


def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(c) * (G ** i) for i, c in enumerate(word)) % M


def split_words(text):
    """Return the alphanumeric runs of text, case preserved.

    Building the list from a blanked copy keeps the final word even when
    the text does not end with a separator.
    """
    spaced = ''.join(
        ww if ('a' <= ww <= 'z') or ('A' <= ww <= 'Z') or ('0' <= ww <= '9')
        else ' '
        for ww in text)
    return spaced.split()


def bag_of_words(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh != 'y' and fh != 'Y' and fh != 'n' and fh != 'N':
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    if fh == 'y' or fh == 'Y':
        M = int(input('M = '))
    print('-------------------')

    lenght = 0
    linecount = 0
    words2 = []
    with open(file_name, 'r') as wfile:
        for line in wfile:
            linecount += 1
            lenght += len(line.rstrip('\n'))
            words2 += split_words(line)
    l = sum(len(w) for w in words2)

    stop = set(tostopwords())
    kept = [w.lower() for w in words2 if w.lower() not in stop]
    if fh == 'y' or fh == 'Y':
        kept = [fhash(w, M) for w in kept]
    #----------------------------------------------------------------
    print('char count =', lenght)
    print('alphanumeric count =', l)
    print('line count =', linecount)
    print('word count =', len(words2))
    print('BoW =', bag_of_words(kept))


if __name__ == '__main__':
    main()
# 6330256821 (0.00) 106 (2021-03-22 23:56)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M.

    The exponent is the character position, not the character itself.
    """
    x = 0
    for i, e in enumerate(w):
        x += ord(e) * (37 ** i)
    return x % M


def remove_punc(t):
    """Return t with every character that is not an ASCII letter/digit blanked."""
    out = ""
    for e in t:
        if 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9':
            out += e
        else:
            out += " "
    return out


def bag_of_words(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    a = input("File name = ")
    x = input("Use feature hashing ? (y,Y,n,N) ")
    while x not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        x = input("Use feature hashing ? (y,Y,n,N) ")
    if x in ['y', 'Y']:
        M = int(input('M = '))

    with open("stopwords.txt", 'r') as fn:
        stop = set(fn.read().lower().split())
    print("-" * 19)

    # Read the file the user named rather than a fixed sample file.
    with open(a, "r") as sample:
        s = [line.rstrip("\n") for line in sample]
    sample_word = []
    for line in s:
        sample_word += remove_punc(line).lower().split()

    print("char count =", sum(len(line) for line in s))
    print("alphanumeric count =", sum(len(w) for w in sample_word))
    print("line count =", len(s))
    print("word count =", len(sample_word))
    kept = [w for w in sample_word if w not in stop]
    if x in ['y', 'Y']:
        kept = [fhash(w, M) for w in kept]
    print("BoW =", bag_of_words(kept))


if __name__ == "__main__":
    main()
# 6330257421 (22.20) 107 (2021-03-22 19:43)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(c) * (G ** i) for i, c in enumerate(w)) % M


def is_alnum(k):
    """Return True when k is an ASCII letter or digit."""
    return ('a' <= k <= 'z') or ('A' <= k <= 'Z') or ('0' <= k <= '9')


def extract_words(l):
    """Return the alphanumeric runs of l, case preserved.

    A word that ends at the end of the text is kept.
    """
    return ''.join(k if is_alnum(k) else ' ' for k in l).split()


def count_sorted(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    tx = input('File name = ')
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
    # List membership rejects '', ' ' and multi-letter answers such as 'yY'.
    while hashing not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        hashing = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = hashing in ['y', 'Y']
    if use_hash:
        m = int(input('M = '))
    print('-------------------')

    i = 0   # characters, newlines excluded
    j = 0   # alphanumeric characters
    lc = 0  # lines
    words = []
    with open(tx, 'r') as file:
        for l in file:
            lc += 1
            i += len(l.rstrip('\n'))
            j += sum(1 for k in l if is_alnum(k))
            words += extract_words(l)

    with open('stopwords.txt', 'r') as stop:
        stopword = set(stop.read().lower().split())

    kept = [c.lower() for c in words if c.lower() not in stopword]
    if use_hash:
        kept = [fhash(c, m) for c in kept]
    r = count_sorted(kept)

    print('char count =', i)
    print('alphanumeric count =', j)
    print('line count =', lc)
    print('word count =', len(words))
    print('BoW =', r)


if __name__ == '__main__':
    main()
# 6330258021 (30.00) 108 (2021-03-22 20:09)
def readlines(fn):
    """Return every line of fn with its newline characters stripped.

    Blank lines are kept as '' entries, including a blank first line.
    """
    with open(fn, 'r') as fin:
        return [line.strip('\n') for line in fin]


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(c) * (37 ** i) for i, c in enumerate(w)) % m


def count_sorted(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ').strip()
    mode = input('Use feature hashing ? (y,Y,n,N) ').strip().lower()
    while mode != 'n' and mode != 'y':
        print('Try again.')
        mode = input('Use feature hashing ? (y,Y,n,N) ').strip().lower()
    if mode == 'y':
        m = int(input('M = ').strip())
    print('-------------------')
    lines = readlines(file_name)
    cc, ca = 0, 0
    pieces = []
    for line in lines:
        cc += len(line)
        pieces.append(' ')  # keeps words on adjacent lines apart
        for c in line:
            if c.isalnum():
                pieces.append(c)
                ca += 1
            else:
                pieces.append(' ')
    print('char count = %d' % cc)
    print('alphanumeric count = %d' % ca)
    print('line count = %d' % len(lines))
    words = ''.join(pieces).split()
    print('word count = %d' % len(words))
    stopwords = set()
    for i in readlines('stopwords.txt'):
        stopwords.update(i.lower().split())
    kept = [w.lower() for w in words if w.lower() not in stopwords]
    if mode == 'y':
        kept = [fhash(w, m) for w in kept]
    print('BoW = %s' % count_sorted(kept))


if __name__ == '__main__':
    main()
# 6330259721 (15.25) 109 (2021-03-21 11:33)
alp = "abcdefghijklmnopqrstuvwxyz"
number = "0123456789"


def print_other(char_count, alphanumeric_count, line_count, word_count):
    """Print the four count lines of the report."""
    print("char count = " + str(char_count))
    print("alphanumeric count = " + str(alphanumeric_count))
    print("line count = " + str(line_count))
    print("word count = " + str(word_count))


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(c) * (37 ** i) for i, c in enumerate(w)) % m


def separate_words(line):
    """Return the lower-cased alphanumeric runs of line.

    Characters are classified individually, so a letter that also ends the
    word (the first 'd' of 'dad') no longer cuts the word short.
    """
    spaced = "".join(e if (e in alp) or (e in number) else " "
                     for e in line.lower())
    return spaced.split()


def count_words(keys):
    """Return the sorted [key, count] pairs for keys."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input("File name = ")
    is_feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()
    # Tuple membership: the substring test `not in "yn"` accepted "".
    while is_feature_hashing not in ("y", "n"):
        print("Try again.")
        is_feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()

    # read stopwords.txt -> set of lower-cased stopwords
    with open("stopwords.txt", "r") as fn_stopwords:
        li_of_stopwords = set(fn_stopwords.read().lower().split())

    # read file_name: line count, char count, separated words
    line_count = 0
    char_count = 0
    seperated_word = []
    with open(file_name, "r") as fn_file_name:
        for line in fn_file_name:
            line_count += 1
            char_count += len(line.rstrip("\n"))
            seperated_word += separate_words(line)

    alphanumeric_count = sum(len(word) for word in seperated_word)
    word_count = len(seperated_word)
    cut_stopwords = [w for w in seperated_word if w not in li_of_stopwords]

    if is_feature_hashing == "y":
        m = int(input("M = "))
        cut_stopwords = [fhash(w, m) for w in cut_stopwords]
    print("-------------------")
    print_other(char_count, alphanumeric_count, line_count, word_count)
    print("BoW = " + str(count_words(cut_stopwords)))


if __name__ == "__main__":
    main()
# 6330260221 (25.50) 110 (2021-03-22 21:24)
alphabet = 'abcdefghijklmnopqrstuvwxyz'
num = '0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    n = sum(ord(c) * (37 ** i) for i, c in enumerate(w))
    return n % M


def split_words(line):
    """Return the lower-cased words of line.

    Every non-alphanumeric character separates words, whatever follows it.
    """
    spaced = ''.join(c if c in alphabet or c in num else ' '
                     for c in line.lower())
    return spaced.split()


def count_sorted(keys):
    """Return the [key, count] pairs for keys, sorted by key."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    a = input('File name = ')
    M = None  # stays None when hashing is not requested
    while True:
        feature = input('Use feature hashing ? (y,Y,n,N) ')
        if feature in ('y', 'Y'):
            M = int(input('M = '))
            break
        if feature in ('n', 'N'):
            break
        print('Try again.')

    count = 0
    line = 0
    words2 = []
    with open(a, 'r') as word:
        for i in word:
            # Exclude the newline per line so a trailing newline at the end
            # of the file does not inflate the count.
            count += len(i.rstrip('\n'))
            line += 1
            words2 += split_words(i)
    alphacount = sum(len(w) for w in words2)

    with open('stopwords.txt', 'r') as stop_word:
        stopwords = set(stop_word.read().lower().split())
    words3 = [k for k in words2 if k not in stopwords]
    if M is not None:
        words3 = [fhash(j, M) for j in words3]
    bow = count_sorted(words3)

    print('-------------------')
    print('char count =', count)
    print('alphanumeric count =', alphacount)
    print('line count =', line)
    print('word count =', len(words2))
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330261921 (30.00) 111 (2021-03-21 23:39)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(c) * 37 ** i for i, c in enumerate(w)) % M


def to_words(text):
    """Return the sorted lower-cased words of text; non-alphanumerics separate words."""
    spaced = ''.join(c if c.isalnum() else ' ' for c in text.lower())
    return sorted(spaced.split())


def count_sorted(items):
    """Return the sorted [item, count] pairs for items."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_n = input('File name = ')
    while True:
        ufh = input('Use feature hashing ? (y,Y,n,N) ')
        # List membership: the substring test accepted an empty answer.
        if ufh in ['Y', 'y', 'N', 'n']:
            break
        print('Try again.')
    use_hash = ufh in ['Y', 'y']
    if use_hash:
        M = int(input('M = '))
    print('-------------------')

    # read the stopwords
    with open('stopwords.txt', 'r') as filestw:
        lsw = set(filestw.read().lower().split())

    # read the input file
    with open(file_n, 'r') as file_name:
        s1 = [i.strip('\n') for i in file_name]
    line = len(s1)
    char = len(''.join(s1))

    # sorted words with the special characters removed
    s2 = to_words(' '.join(s1))
    word = len(s2)
    alphanumeric = len(''.join(s2))

    # words without the stopwords, duplicates kept
    s4 = [i for i in s2 if i not in lsw]
    #-------------------------
    print('char count =', char)
    print('alphanumeric count =', alphanumeric)
    print('line count =', line)
    print('word count =', word)
    #-----------------------
    if use_hash:
        print('BoW =', count_sorted([fhash(i, M) for i in s4]))
    else:
        print('BoW =', count_sorted(s4))


if __name__ == '__main__':
    main()
# 6330262521 (28.00) 112 (2021-03-21 21:25)
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(c) * G ** i for i, c in enumerate(w)) % M


def char_kind(a):
    """Return 'a' for a lower-case ASCII letter or digit, otherwise 'b'.

    Formerly named `type`, which shadowed the builtin.
    """
    if a in ALNUM:
        return 'a'
    return 'b'


def load_stopwords(path="stopwords.txt"):
    """Return the list of whitespace-separated stopwords stored in path."""
    with open(path, "r") as stop:
        return stop.read().split()


def cut(f, stopwords=None):
    """Return the words of f that are not stopwords, each followed by a space.

    stopwords defaults to the contents of stopwords.txt.
    """
    if stopwords is None:
        stopwords = load_stopwords()
    stop = set(stopwords)
    return ''.join(i + ' ' for i in f if i not in stop)


def bag_of_words(words, M=None):
    """Return the sorted [key, count] pairs for words.

    With M given, the keys are the feature hashes of the words modulo M;
    otherwise the keys are the words themselves.
    """
    keys = words if M is None else [fhash(w, M) for w in words]
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def BoW(b, M=None, stopwords=None):
    """Print the 'BoW = ...' line for the word list b and return the bag."""
    bag = bag_of_words(cut(b, stopwords).split(), M)
    print('BoW = ' + str(bag))
    return bag


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ')
    a = input('Use feature hashing ? (y,Y,n,N) ')
    M = None
    while True:
        if a.upper() == 'Y':
            M = int(input('M = '))
            break
        if a.upper() == 'N':
            break
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    print("-------------------")

    p = load_stopwords()
    with open(file_name, "r") as fin:
        nn = fin.readlines()
    l = len(nn)
    # Character count per line without its newline; also safe for an empty file.
    c = sum(len(row.rstrip('\n')) for row in nn)
    b = ''.join(nn).lower()
    cc = sum(1 for i in b if i in ALNUM)
    f = ''.join(i if char_kind(i) == 'a' else ' ' for i in b).split()

    print('char count = ' + str(c))
    print('alphanumeric count = ' + str(cc))
    print('line count = ' + str(l))
    print('word count = ' + str(len(f)))
    BoW(f, M, p)


if __name__ == '__main__':
    main()
# 6330263121 (16.03) 113 (2021-03-22 20:01)
def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def line(file):
    """Return the number of lines in the list of lines file."""
    return len(file)


def charcount(file):
    """Return the total number of characters in the list of lines file."""
    return len(''.join(file))


def alphanumericcount(file):
    """Return the number of ASCII letters and digits in the list of lines file."""
    return sum(1 for row in file for ch in row if _is_alnum(ch))


def wordcourt(file):
    """Return the number of words in the list of lines file."""
    return len(cut1(file))


def read(x):
    """Return the lines of file x without their newline characters."""
    with open(x) as f:
        return [row.rstrip('\n') for row in f]


def stopword(x):
    """Return all whitespace-separated tokens of the strings in x, in order."""
    c = []
    for row in x:
        c += row.split()
    return c


def remove_stopwords(x, y):
    """Return the items of x that do not occur in y (formerly named `test`)."""
    stop = set(y)
    return [item for item in x if item not in stop]


def cut1(file):
    """Return the lower-cased words of the list of lines file.

    Lines are joined with a space so that words on adjacent lines do not
    merge; every non-alphanumeric character separates words.
    """
    text = ' '.join(file).lower()
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in text).split()


def samecut(x):
    """Return x without duplicates, keeping the last occurrence of each item."""
    seen = set()
    kept = []
    for item in reversed(x):
        if item not in seen:
            seen.add(item)
            kept.append(item)
    kept.reverse()
    return kept


def bow(x, y):
    """Return [item, occurrences of item in y] for every item of x."""
    counts = {}
    for item in y:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts.get(item, 0)] for item in x]


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % m


def mixfhash(x, m):
    """Return the feature hash modulo m of every word in x."""
    return [fhash(w, m) for w in x]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    e = [w.lower() for w in stopword(read('stopwords.txt'))]
    a = str(input('File name = '))
    file = read(a)
    e2 = remove_stopwords(cut1(file), e)
    while True:
        y = str(input('Use feature hashing ? (y,Y,n,N) '))
        if y in ('n', 'N'):
            keys = e2
            break
        if y in ('y', 'Y'):
            keys = mixfhash(e2, int(input('M = ')))
            break
        print('Try again.')
    print('-------------------')
    print('char count = 'f'{charcount(file)}')
    print('alphanumeric count = 'f'{alphanumericcount(file)}')
    print('line count = 'f'{line(file)}')
    print('word count = 'f'{wordcourt(file)}')
    print('BoW = 'f'{sorted(bow(samecut(keys), keys))}')


if __name__ == '__main__':
    main()
# 6330264821 (30.00) 114 (2021-03-22 19:45)
def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return '0' <= ch <= '9' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'


def char_c(x):
    """Return the number of characters in x, newlines excluded."""
    return sum(1 for ch in x if ch != '\n')


def alpha_c(alpha):
    """Return the number of ASCII letters and digits in alpha."""
    return sum(1 for ch in alpha if _is_alnum(ch))


def words_c(word):
    """Return the number of words in the text word."""
    return len(create_w(word).split())


def create_w(word):
    """Return word lower-cased with every non-alphanumeric replaced by a space."""
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in word).lower()


def create_s(stop):
    """Return stop with every newline replaced by a space."""
    return stop.replace('\n', ' ')


def load_stopwords():
    """Return the set of lower-cased stopwords stored in stopwords.txt."""
    with open('stopwords.txt', 'r') as stop_word:
        return set(create_s(stop_word.read()).lower().split())


def cutto(word, stopword=None):
    """Return the sorted non-stopwords of the text word.

    stopword is a container of stopwords; when omitted it is loaded from
    stopwords.txt.  Callers processing many lines should pass it in.
    """
    if stopword is None:
        stopword = load_stopwords()
    return sorted(i for i in create_w(word).split() if i not in stopword)


def flash(w, M):
    """Return sum(ord(w[n]) * 37**n) modulo M."""
    G = 37
    return sum(ord(ch) * (G ** n) for n, ch in enumerate(w)) % M


def count_groups(keys):
    """Return the sorted [key, count] pairs for keys.

    Counting with a dict needs no end-of-list sentinel, so no real key can
    be mistaken for one.
    """
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the statistics and the bag of words."""
    file_name = input('File name = ')
    stopword = load_stopwords()
    sumz = 0
    line_n = 0
    sumx = 0
    sumw = 0
    word = []
    with open(file_name, 'r') as read_file:
        for line in read_file:
            sumz += char_c(line)
            line_n += 1
            sumx += alpha_c(line)
            sumw += words_c(line)
            word += cutto(line, stopword)

    feature_hash = input('Use feature hashing ? (y,Y,n,N) ')
    while feature_hash not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        feature_hash = input('Use feature hashing ? (y,Y,n,N) ')
    if feature_hash in ['y', 'Y']:
        M = int(input('M = '))
        BoW = count_groups([flash(e, M) for e in word])
    else:
        BoW = count_groups(word)

    print('-------------------')
    print('char count = ' + str(sumz))
    print('alphanumeric count = ' + str(sumx))
    print('line count = ' + str(line_n))
    print('word count = ' + str(sumw))
    print('BoW = ' + str(BoW))


if __name__ == '__main__':
    main()
# 6330265421 (25.00) 115 (2021-03-22 22:29)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

ALNUM = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def stopwords():
    """Return the lower-cased, de-duplicated words of ``stopwords.txt``."""
    w = []
    with open('stopwords.txt', 'r') as stop_word:
        for line in stop_word:
            for e in line.split():
                e = e.lower()
                if e not in w:
                    w.append(e)
    return w


def count(file_name):
    """Scan *file_name* and return (chars, alnum chars, lines, words, word list).

    Characters are counted on each line after stripping surrounding
    whitespace; words are maximal runs of ASCII letters/digits.
    """
    ch_c = alpha_c = line_c = 0
    alpha = []
    with open(file_name) as file:
        for line in file:
            x = line.strip()
            ch_c += len(x)
            line_c += 1
            alpha_c += sum(1 for e in x if e in ALNUM)
            alpha += ''.join(e if e in ALNUM else ' ' for e in x).split()
    return ch_c, alpha_c, line_c, len(alpha), alpha


def fhash(word, M):
    """Feature hash: sum(ord(word[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M


def BoW(words, M=None, stop=None):
    """Return sorted ``[key, count]`` pairs for the non-stop words of *words*.

    words: iterable of words (any case; they are lower-cased here).
    M:     when given, keys are ``fhash(word, M)`` instead of the word.
    stop:  optional stop-word collection; defaults to ``stopwords()``.

    An empty or single-word input is handled (the original raised).
    """
    stop = set(stopwords() if stop is None else stop)  # read the file once
    counts = {}
    for e in words:
        e = e.lower()
        if e in stop:
            continue
        key = e if M is None else fhash(e, M)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    a, b, c, d, e = count(file_name)
    f_hash = input(PROMPT).lower()
    # Exact membership: a substring test would accept '' and 'yn'.
    while f_hash not in ('y', 'n'):
        print('Try again.')
        f_hash = input(PROMPT).lower()
    M = int(input('M = ')) if f_hash == 'y' else None
    print('-------------------')
    print('char count =', a)
    print('alphanumeric count =', b)
    print('line count =', c)
    print('word count =', d)
    print('BoW =', BoW(e, M))


if __name__ == '__main__':
    main()
# 6330266021 (16.34) 116 (2021-03-22 23:55)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = "Use feature hashing ? (y,Y,n,N) "


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *M*.

    The exponent is the character index itself (the original used
    ``index - 1`` from the second character on, which is off by one).
    """
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def tokenize(text):
    """Split *text* into lower-cased alphanumeric words.

    Every non-alphanumeric character is a separator, so "don't" yields
    ``['don', 't']`` rather than being glued into ``'dont'``.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text).lower().split()


def analyze(lines, ban):
    """Return (char count, alnum count, line count, word count, kept words).

    lines: iterable of text lines (trailing newlines are not counted).
    ban:   collection of stop words to drop from the kept words.
    """
    charCount = alphanumericCount = lineCount = wordCount = 0
    kept = []
    for text in lines:
        text = text.rstrip('\n')
        charCount += len(text)
        alphanumericCount += sum(1 for t in text if t.isalnum())
        words = tokenize(text)
        wordCount += len(words)
        kept += [word for word in words if word not in ban]
        lineCount += 1
    return charCount, alphanumericCount, lineCount, wordCount, kept


def bag(words, M=None):
    """Return sorted ``[key, count]`` pairs; keys are hashed when *M* is given."""
    counts = {}
    for word in words:
        key = word if M is None else fhash(word, M)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    while True:
        feature = input(PROMPT)
        if feature in ('Y', 'y', 'N', 'n'):
            break
        print("Try again.")
    M = int(input("M = ")) if feature in ('Y', 'y') else None
    with open('stopwords.txt', 'r') as stopwords:
        ban = set(stopwords.read().split())
    with open(file_name, "r") as source:
        charCount, alnumCount, lineCount, wordCount, kept = analyze(source, ban)
    # Separator and labels follow the format used by the other submissions.
    print("-------------------")
    print("char count =", charCount)
    print("alphanumeric count =", alnumCount)
    print("line count =", lineCount)
    print("word count =", wordCount)
    print("BoW =", bag(kept, M))


if __name__ == '__main__':
    main()
# 6330267721 (23.20) 117 (2021-03-22 14:30)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def _is_alnum(ch):
    """Return True for ASCII letters and digits only."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def cut_special_char(a):
    """Return the alphanumeric words of *a* (original case).

    The final word is kept even when no separator follows it; the
    original only emitted a word on seeing the next separator.
    """
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in a).split()


def number_and_alphabet(a):
    """Count the ASCII letters and digits in *a*."""
    return sum(1 for ch in a if _is_alnum(ch))


def bag_of_words(words, M=None):
    """Return sorted ``[key, count]`` pairs; keys are hashed when *M* is given."""
    counts = {}
    for w in words:
        key = w if M is None else fhash(w, M)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    b = input(PROMPT)
    while b not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        b = input(PROMPT)
    c = int(input('M = ')) if b in ('y', 'Y') else None

    with open('stopwords.txt', 'r') as fn2:
        list_of_stopwords = {w.lower() for w in cut_special_char(fn2.read())}

    count_line = char_count = word_count = num_and_alpha = 0
    BoW_1 = []
    with open(file_name, 'r') as fn1:
        for line in fn1:
            char_count += len(line.strip())
            count_line += 1
            words = cut_special_char(line)
            word_count += len(words)
            num_and_alpha += number_and_alphabet(line)
            BoW_1 += [w.lower() for w in words
                      if w.lower() not in list_of_stopwords]

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', num_and_alpha)
    print('line count =', count_line)
    print('word count =', word_count)
    print('BoW =', bag_of_words(BoW_1, c))


if __name__ == '__main__':
    main()
# 6330268321 (19.25) 118 (2021-03-22 17:41)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = "Use feature hashing ? (y,Y,n,N) "


def _is_alnum(ch):
    """Return True for ASCII letters and digits only."""
    return "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9"


def split_words(text):
    """Return the lower-cased alphanumeric words of *text*.

    Every non-alphanumeric character separates words (the original only
    recognised a fixed list of 14 punctuation marks).
    """
    return "".join(ch if _is_alnum(ch) else " " for ch in text).lower().split()


def load_stopwords(path="stopwords.txt"):
    """Read the whitespace-separated stop words stored in *path*."""
    with open(path) as stopwords:
        return set(stopwords.read().split())


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo int(*M*)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def count_items(items):
    """Return sorted ``[item, count]`` pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def bow(s, M=None):
    """Return the bag of words of file *s* without stop words.

    Keys are the words themselves, or ``fhash(word, M)`` when *M* is given.
    """
    stop = load_stopwords()
    with open(s) as file_name:
        kept = [w for w in split_words(file_name.read()) if w not in stop]
    if M is None:
        return count_items(kept)
    return count_items(fhash(w, M) for w in kept)


def main():
    """Prompt for the inputs and print the report."""
    s = input("File name = ")
    usef = input(PROMPT)
    while usef not in ("y", "Y", "n", "N"):
        print("Try again.")
        usef = input(PROMPT)
    M = input("M = ") if usef in ("y", "Y") else None

    linecount = alphacount = charcount = wordcount = 0
    with open(s) as file_name:
        for line in file_name:
            linecount += 1
            charcount += sum(1 for e in line if e != "\n")  # newline is not a character
            alphacount += sum(1 for e in line if _is_alnum(e))  # English letters and digits only
            wordcount += len(split_words(line))

    print("-------------------")
    print("char count =", str(charcount))
    print("alphanumeric count =", str(alphacount))
    print("line count =", str(linecount))
    print("word count =", str(wordcount))
    print("BoW =", bow(s, M))


if __name__ == "__main__":
    main()
# 6330269021 (22.54) 119 (2021-03-21 21:55)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""


def fhash(w, m):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *m*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def scan(lines):
    """Return (char count, alnum count, line count, words) for *lines*.

    Newline characters are not counted as characters, and only
    non-empty alphanumeric runs are returned as words (original case).
    """
    chCount = alnumCount = lineCount = 0
    words = []
    for line in lines:
        lineCount += 1
        line = line.rstrip("\n")
        chCount += len(line)
        alnumCount += sum(1 for ch in line if ch.isalnum())
        words += "".join(ch if ch.isalnum() else " " for ch in line).split()
    return chCount, alnumCount, lineCount, words


def build_bow(words, stopWords, m=None):
    """Return sorted ``[key, count]`` pairs for the non-stop words.

    Words are lower-cased first. When *m* is given the key is the hash,
    so different words with the same hash are merged into one entry.
    """
    counts = {}
    for e in words:
        e = e.lower()
        if e == "" or e in stopWords:
            continue
        key = e if m is None else fhash(e, m)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    fileName = input("File name = ").strip()
    while True:
        fhashMode = input("Use feature hashing ? (y,Y,n,N) ").strip().lower()
        if fhashMode in ('y', 'n'):
            break
        print("Try again.")
    m = int(input("M = ")) if fhashMode == 'y' else None
    print("-------------------")

    with open("stopwords.txt") as stopWordsFile:
        stopWords = set(stopWordsFile.read().split())
    with open(fileName) as inputFile:
        chCount, alnumCount, lineCount, words = scan(inputFile)

    print("char count =", chCount)
    print("alphanumeric count =", alnumCount)
    print("line count =", lineCount)
    print("word count =", len(words))
    print("BoW =", build_bow(words, stopWords, m))


if __name__ == "__main__":
    main()
# 6330270521 (23.00) 120 (2021-03-22 18:49)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

ALPNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo int(*M*)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def words_of(text):
    """Return the lower-cased alphanumeric words of *text*."""
    return ''.join(j if j in ALPNUM else ' ' for j in text).lower().split()


def tally(items):
    """Return sorted ``[item, count]`` pairs, one entry per distinct item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def load_stopwords(path='stopwords.txt'):
    """Read the whitespace-separated stop words stored in *path*."""
    with open(path) as stopwords:
        return set(stopwords.read().split())


def BoW(file, M=None):
    """Return the bag of words of *file* without stop words.

    Words are compared by equality (the original used a substring test,
    so 'a' was also counted inside 'cat'). Keys are hashed when *M* is
    given.
    """
    stop = load_stopwords()
    with open(file) as file_name:
        word = [w for w in words_of(file_name.read()) if w not in stop]
    if M is None:
        return tally(word)
    return tally(fhash(w, M) for w in word)


def main():
    """Prompt for the inputs and print the report."""
    file = input('File name = ')
    usefh = input(PROMPT)
    # Exact membership: a substring test would accept '' and 'yY'.
    while usefh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        usefh = input(PROMPT)
    M = input('M = ') if usefh in ('y', 'Y') else None

    linecnt = alpcnt = charcnt = wordcnt = 0
    with open(file) as file_name:
        for i in file_name:
            linecnt += 1
            charcnt += sum(1 for j in i if j != '\n')
            alpcnt += sum(1 for j in i if j in ALPNUM)
            wordcnt += len(words_of(i))

    print("-------------------")
    print("char count =", str(charcnt))
    print("alphanumeric count =", str(alpcnt))
    print("line count =", str(linecnt))
    print("word count =", str(wordcnt))
    print("BoW =", BoW(file, M))


if __name__ == '__main__':
    main()
# 6330271121 (30.00) 121 (2021-03-21 21:27)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = 'Use feature hashing ? (y,Y,n,N) '
LOWER_ALNUM = 'abcdefghijklmnopqrstuvwxyz1234567890'


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def scan(lines, stw):
    """Return (chars, alnum chars, lines, words, non-stop words) for *lines*.

    Each line is lower-cased and its trailing newline dropped before
    counting; words are maximal runs of ``a-z0-9``.
    """
    charc = alphc = linec = wordc = 0
    cutwords = []
    for line in lines:
        linec += 1
        line = line.lower()
        if line[-1:] == '\n':
            line = line[:-1]
        charc += len(line)
        words = ''.join(e if e in LOWER_ALNUM else ' ' for e in line).split()
        alphc += len(''.join(words))
        wordc += len(words)
        cutwords += [a for a in words if a not in stw]
    return charc, alphc, linec, wordc, cutwords


def make_bow(words, M=None):
    """Return sorted ``[key, count]`` pairs; keys are hashed when *M* is given."""
    counts = {}
    for w in words:
        key = w if M is None else fhash(w, M)  # hash each word once
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report (shared by both modes)."""
    file_name = input('File name = ')
    ufh = input(PROMPT).lower()
    while ufh not in ['y', 'n']:
        print('Try again.')
        ufh = input(PROMPT).lower()
    with open('stopwords.txt') as stopword:
        stw = set(stopword.read().lower().split())
    M = int(input('M = ')) if ufh == 'y' else None
    with open(file_name) as fin:
        charc, alphc, linec, wordc, cutwords = scan(fin, stw)
    print('-------------------')
    print('char count =', charc)
    print('alphanumeric count =', alphc)
    print('line count =', linec)
    print('word count =', wordc)
    print('BoW =', make_bow(cutwords, M))


if __name__ == '__main__':
    main()
# 6330272821 (10.10) 122 (2021-03-22 00:08)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""


def fhash(w, m):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *m*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def scan_lines(lines):
    """Return (chars, alnum chars, lines, word count, words) for *lines*.

    Newline characters are not counted, and only non-empty alphanumeric
    runs are returned as words (original case).
    """
    chr_cnt = alnum_cnt = line_cnt = 0
    words = []
    for line in lines:
        line_cnt += 1
        line = line.rstrip("\n")
        chr_cnt += len(line)
        alnum_cnt += sum(1 for ch in line if ch.isalnum())
        words += "".join(ch if ch.isalnum() else " " for ch in line).split()
    return chr_cnt, alnum_cnt, line_cnt, len(words), words


def f(file_name):
    """Open *file_name* and return the statistics from ``scan_lines``."""
    with open(file_name) as file:
        return scan_lines(file)


def BoW_process(words, stopwords, is_hashing, M=None):
    """Return sorted ``[key, count]`` pairs for the non-stop words.

    words:      iterable of words (lower-cased here; empty strings skipped).
    stopwords:  collection of words to ignore.
    is_hashing: when true, keys are ``fhash(word, M)`` and colliding
                words are merged into one entry.
    M:          hash modulus; only read when *is_hashing* is true.
    """
    counts = {}
    for n in words:
        n = n.lower()
        if n == "" or n in stopwords:
            continue
        key = fhash(n, M) if is_hashing else n
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


if __name__ == '__main__':
    file_name = input("File name = ").strip()
    while True:
        user_input = input("Use feature hashing ? (y,Y,n,N) ").strip().lower()
        if user_input == "y" or user_input == "n":
            break
        print("Try again.")
    is_hashing = user_input == "y"
    # M must exist in both modes; it was undefined when hashing was off.
    M = int(input("M = ")) if is_hashing else None
    print("-------------------")
    with open("stopwords.txt") as stopword_file:
        stopwords = set(stopword_file.read().split())
    chr_cnt, alnum_cnt, line_cnt, word_cnt, words = f(file_name)
    BoW = BoW_process(words, stopwords, is_hashing, M)
    print("char count =", chr_cnt)
    print("alphanumeric count =", alnum_cnt)
    print("line count =", line_cnt)
    print("word count =", word_cnt)
    print("BoW =", BoW)
# 6330273421 (30.00) 123 (2021-03-18 22:00)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def _is_alnum(e):
    """Return True for ASCII letters and digits only."""
    return 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9'


def pure_apb(line):
    """Return *line* lower-cased with non-alphanumerics replaced by spaces."""
    return ''.join(e if _is_alnum(e) else ' ' for e in line).lower()


def use_fh(c):
    """Re-prompt until *c* is one of y, Y, n, N and return the valid answer."""
    while c not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        c = input(PROMPT)
    return c


def _tally(items):
    """Return sorted ``[item, count]`` pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def bow_w(apb):
    """Return the word-keyed bag of words of the space-separated string *apb*."""
    return _tally(apb.split())


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def bow_n(apb, modulus=None):
    """Return the hash-keyed bag of words of the space-separated string *apb*.

    *modulus* is the hash modulus; when omitted the module-level ``M`` is
    used, as in the original single-argument form.
    """
    if modulus is None:
        modulus = M  # legacy fallback to the global set by older callers
    return _tally(fhash(word, modulus) for word in apb.split())


def main():
    """Prompt for the inputs and print the report."""
    filename = input('File name = ')
    c = use_fh(input(PROMPT))
    char = apb = line_c = 0
    words = []
    with open(filename, 'r') as file:
        for line in file:
            line_c += 1
            char += len(line.strip())
            apb += sum(1 for e in line if _is_alnum(e))
            words += pure_apb(line).split()
    with open('stopwords.txt', 'r') as stop:
        stopword = set(stop.read().split())
    # Whole-word membership: testing against the raw stop-word text would
    # also drop any word that is merely a substring of it.
    rms = ' '.join(e for e in words if e not in stopword)
    if c in ['y', 'Y']:
        BoW = bow_n(rms, int(input('M = ')))
    else:
        BoW = bow_w(rms)
    print('-------------------')
    print('char count = ' + str(char))
    print('alphanumeric count = ' + str(apb))
    print('line count = ' + str(line_c))
    print('word count = ' + str(len(words)))
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330274021 (21.40) 124 (2021-03-22 02:02)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def fhash(w, m):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *m*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def _is_alnum(e):
    """Return True for ASCII letters and digits only."""
    return 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9'


def scan(lines, sw):
    """Return (chars, alnum chars, lines, words, non-stop words) for *lines*.

    Each line is stripped and lower-cased, then tokenised on its own, so
    words are neither repeated nor merged across line boundaries.
    """
    cc = ac = lc = wc = 0
    wordf = []
    for line in lines:
        line = line.strip().lower()
        lc += 1
        cc += len(line)
        ac += sum(1 for e in line if _is_alnum(e))
        word = ''.join(e if _is_alnum(e) else ' ' for e in line).split()
        wc += len(word)
        wordf += [f for f in word if f not in sw]
    return cc, ac, lc, wc, wordf


def tally(items):
    """Return sorted ``[item, count]`` pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    while True:
        fhash_use = input(PROMPT)
        if fhash_use in ('y', 'Y', 'n', 'N'):
            break
        print('Try again.')
    m = int(input('M = ')) if fhash_use in ('y', 'Y') else None

    with open('stopwords.txt', 'r') as stopword:
        sw = set(stopword.read().lower().split())
    with open(file_name, 'r') as file:
        cc, ac, lc, wc, wordf = scan(file, sw)

    if m is None:
        bow = tally(wordf)
    else:
        bow = tally(fhash(w, m) for w in wordf)

    print('-------------------')
    print('char count = ' + str(cc))
    print('alphanumeric count = ' + str(ac))
    print('line count = ' + str(lc))
    print('word count = ' + str(wc))
    print('BoW = ' + str(bow))


if __name__ == '__main__':
    main()
# 6330275721 (30.00) 125 (2021-03-22 21:44)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""
#--------------------------------------------------------
alphabet = 'abcdefghijklmnopqrstuvwxyz'
number = '0123456789'
special_char = '!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
stopword_file = 'stopwords.txt'
#--------------------------------------------------------


def split_words(line):
    """Split *line* into words, treating every ``special_char`` as a space."""
    return ''.join(' ' if ch in special_char else ch for ch in line).split()


def char_count(x):
    """Return the number of characters in file *x*, each line stripped."""
    with open(x, 'r') as file:
        return sum(len(line.strip()) for line in file)


def alphanumeric_count(x):
    """Return the number of English letters and digits in file *x*."""
    with open(x, 'r') as file:
        return sum(1 for line in file for char in line.strip()
                   if char.lower() in alphabet or char in number)


def line_count(x):
    """Return the number of lines in file *x*."""
    with open(x, 'r') as file:
        return sum(1 for _ in file)


def word_count(x):
    """Return the number of words in file *x*."""
    with open(x, 'r') as file:
        return sum(len(split_words(line.strip())) for line in file)


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def list_stopwords(x):
    """Return the list of stop words stored in file *x*."""
    stopwords_list = []
    with open(x, 'r') as file:
        for line in file:
            stopwords_list += split_words(line.strip())
    return stopwords_list


def BoW(x, ufh, M):
    """Return the sorted bag of words of file *x* without stop words.

    ufh: 'n'/'N' keys entries by word, 'y'/'Y' by ``fhash(word, M)``;
         any other value returns None, as before.
    M:   hash modulus, only read in hashing mode.
    """
    if ufh not in ['n', 'N', 'y', 'Y']:
        return None
    stop = set(list_stopwords(stopword_file))  # read once, not per word
    counts = {}
    with open(x, 'r') as file:
        for line in file:
            for e in split_words(line.strip()):
                k = e.lower()
                if k in stop:
                    continue
                key = fhash(k, M) if ufh in ['y', 'Y'] else k
                counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
    while ufh not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        ufh = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if ufh in ['y', 'Y'] else 0
    print('-' * 19)
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alphanumeric_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', word_count(file_name))
    print('BoW =', BoW(file_name, ufh, M))


if __name__ == '__main__':
    main()
# 6330276321 (30.00) 126 (2021-03-21 02:17)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

NUM = "0123456789"
ALPHABET = "abcdefghijklmnopqrstuvwxyz"


def is_alnum(e):
    """Return True when *e* is an English letter (any case) or a digit."""
    return e.lower() in ALPHABET or e in NUM


#-------------------------------------------------------
def fhash(w, M):
    """Feature hash of the word string *w* with the int modulus *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


#-------------------------------------------------------
def import_words_from(filename):
    """Return the space-separated, stripped tokens of every line of *filename*."""
    a = []
    with open(filename, "r") as x:
        for line in x:
            a += [e.strip() for e in line.split(" ")]
    return a


#-------------------------------------------------------
def count_all_char(file_name):
    """Print the number of characters in *file_name*, each line stripped."""
    with open(file_name, "r") as x:
        count = sum(len(line.strip()) for line in x)
    print("char count =", count)


#-------------------------------------------------------
def count_alphanumeric(file_name):
    """Print the number of English letters and digits in *file_name*."""
    with open(file_name, "r") as x:
        count = sum(1 for line in x for e in line.strip()
                    if e != " " and is_alnum(e))
    print("alphanumeric count =", count)


#-------------------------------------------------------
def count_line(file_name):
    """Print the number of lines in *file_name*."""
    with open(file_name, "r") as x:
        count = sum(1 for _ in x)
    print("line count =", count)


#-------------------------------------------------------
def get_alphanumeric(file_name):
    """Return the alphanumeric words of *file_name* in original case."""
    list_of_words = []
    with open(file_name, "r") as x:
        for line in x:
            cleaned = "".join(e if is_alnum(e) else " " for e in line.strip())
            list_of_words += cleaned.split()
    return list_of_words


#-------------------------------------------------------
def count_words(file_name):
    """Print the number of alphanumeric words in *file_name*."""
    # Reuses the shared tokenizer, which also closes the file.
    print("word count =", len(get_alphanumeric(file_name)))


#-------------------------------------------------------
def count_items(items):
    """Return sorted ``[item, count]`` pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def _non_stop_words(file_name):
    """Return the lower-cased words of *file_name* that are not stop words."""
    stoptexts = set(get_alphanumeric("stopwords.txt"))
    return [e.lower() for e in get_alphanumeric(file_name)
            if e.lower() not in stoptexts]


#-------------------------------------------------------
def BoW(file_name):
    """Print the word-keyed bag of words of *file_name*."""
    print("BoW =", count_items(_non_stop_words(file_name)))


#-------------------------------------------------------
def BoW_fhash(file_name, M):
    """Print the hash-keyed bag of words of *file_name* for modulus *M*."""
    print("BoW =", count_items(fhash(f, M) for f in _non_stop_words(file_name)))


#-------------------------------------------------------------------
def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    while True:
        condition = input("Use feature hashing ? (y,Y,n,N) ")
        if condition in ("y", "Y"):
            using_fhash = True
            break
        if condition in ("n", "N"):
            using_fhash = False
            break
        print("Try again.")
    M = int(input("M = ")) if using_fhash else None
    print("-------------------")
    count_all_char(file_name)
    count_alphanumeric(file_name)
    count_line(file_name)
    count_words(file_name)
    if using_fhash:
        BoW_fhash(file_name, M)
    else:
        BoW(file_name)


if __name__ == "__main__":
    main()
# 6330277021 (23.55) 127 (2021-03-22 23:54)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = "Use feature hashing ? (y,Y,n,N) "


def _is_alnum(ch):
    """Return True for ASCII letters and digits only."""
    return 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9'


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def scan(lines):
    """Return (char count, alnum count, line count, words) for *lines*.

    The trailing newline of a line is not counted. Words are maximal
    alphanumeric runs in original case; punctuation separates words
    instead of being deleted from inside them.
    """
    char_count = an_count = line_count = 0
    words = []
    for line in lines:
        line_count += 1
        if line[-1:] == "\n":
            line = line[:-1]
        char_count += len(line)
        an_count += sum(1 for ch in line if _is_alnum(ch))
        words += ''.join(ch if _is_alnum(ch) else ' ' for ch in line).split()
    return char_count, an_count, line_count, words


def build_bow(words, stop_words, M=None):
    """Return sorted ``[key, count]`` pairs for the non-stop words.

    Words are lower-cased first; keys are ``fhash(word, M)`` when *M* is
    given, otherwise the word itself.
    """
    counts = {}
    for a in words:
        a = a.lower()
        if a in stop_words:
            continue
        key = a if M is None else fhash(a, M)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    fh = input(PROMPT)
    while fh not in ['Y', 'y', 'N', 'n']:
        print("Try again.")
        fh = input(PROMPT)
    M = int(input("M = ")) if fh in ('Y', 'y') else None

    with open("stopwords.txt", "r") as f1:
        stop_words = set(f1.read().lower().split())

    print("-------------------")
    with open(file_name, "r") as f2:
        char_count, an_count, line_count, words = scan(f2)
    print("char count =", char_count)
    print("alphanumeric count =", an_count)
    print("line count =", line_count)
    print("word count =", len(words))
    # Printed in both modes; hashing mode produced no BoW line before.
    print("BoW =", build_bow(words, stop_words, M))


if __name__ == "__main__":
    main()
# 6330278621 (24.40) 128 (2021-03-21 02:45)
"""Text statistics and a (optionally feature-hashed) bag of words for a file."""

PROMPT = 'Use feature hashing ? (y,Y,n,N) '
#------------------------------------------------------
engalpha = 'abcdefghijklmnopqrstuvwxyz'
num = '0123456789'
#------------------------------------------------------


def _is_alnum(e):
    """Return True for English letters (any case) and digits."""
    return e in engalpha or e in engalpha.upper() or e in num


def fhash(w, M):
    """Feature hash: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def scan(lines):
    """Return (char count, alnum count, line count, lower-cased words).

    Each line is stripped and tokenised on its own, so the last word of
    one line is never glued to the first word of the next.
    """
    char_count = alphanumeric_count = line_count = 0
    words = []
    for line in lines:
        line = line.strip()
        char_count += len(line)
        line_count += 1
        alphanumeric_count += sum(1 for e in line if _is_alnum(e))
        cleaned = ''.join(e if _is_alnum(e) else ' ' for e in line)
        words += cleaned.lower().split()
    return char_count, alphanumeric_count, line_count, words


def bag(words, list_of_stpw, M=None):
    """Return sorted ``[key, count]`` pairs for the non-stop words.

    Keys are hashed when *M* is given. An empty input yields ``[]``.
    """
    counts = {}
    for w in words:
        if w in list_of_stpw:
            continue
        key = w if M is None else fhash(w, M)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    x = input(PROMPT)
    while x not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        x = input(PROMPT)
    M = int(input('M = ')) if x in ['y', 'Y'] else None

    with open(file_name, 'r') as File_name:
        char_count, alphanumeric_count, line_count, words = scan(File_name)
    with open('stopwords.txt', 'r') as stop_words:
        list_of_stpw = set(stop_words.read().split())

    # Labels end in '=' only; print() supplies the single separating space.
    print('-' * 19)
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', len(words))
    print('BoW =', bag(words, list_of_stpw, M))


if __name__ == '__main__':
    main()
# 6330279221 (30.00) 129 (2021-03-22 22:27)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    G = 37
    return sum(ord(ch) * G ** n for n, ch in enumerate(w)) % M


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive program."""
    file_name = input('File name = ',)
    u = input('Use feature hashing ? (y,Y,n,N) ',)
    while u not in ['y', 'Y', 'n', 'N']:
        print('Try again')
        u = input('Use feature hashing ? (y, Y, n, N) ',)

    # Context managers close both files even if reading fails.
    with open('stopwords.txt', 'r') as stop:
        sw = set(stop.read().lower().split())
    with open(file_name, 'r') as fn:
        raw_lines = fn.readlines()

    linecount = len(raw_lines)
    # Lower-case each line and blank out everything str.isalnum() rejects.
    cleaned = [
        ''.join(ch if ch.isalnum() else ' ' for ch in raw.strip('\n').lower())
        for raw in raw_lines
    ]
    chacount = sum(len(row) for row in cleaned)
    tokens = [tok for row in cleaned for tok in row.split()]
    wordcount = len(tokens)
    alnumcount = sum(len(tok) for tok in tokens)
    words = [tok for tok in tokens if tok not in sw]

    if u == 'y' or u == 'Y':
        modulus = int(input('M = ',))  # converted once, not per word
        BoW = _tally(fhash(tok, modulus) for tok in words)
    else:
        BoW = _tally(words)

    print('-------------------')
    print('char count =', chacount)
    print('alphanumeric count =', alnumcount)
    print('line count =', linecount)
    print('word count =', wordcount)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330280821 (0.00) 130 (2021-03-22 22:05)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.
_ALNUM = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"


def char_count(fn):
    """Return the number of characters in file fn, newlines excluded."""
    with open(fn) as handle:
        return sum(1 for line in handle for ch in line if ch != "\n")


def count_line(fn):
    """Return the number of lines in file fn."""
    with open(fn) as handle:
        return sum(1 for _ in handle)


def alphanumeric(fn):
    """Return how many characters of file fn are ASCII letters or digits."""
    with open(fn) as handle:
        return sum(1 for line in handle for ch in line if ch in _ALNUM)


def word_count(fn):
    """Return the words of file fn; any non-alphanumeric character separates."""
    with open(fn) as handle:
        text = handle.read()
    return "".join(ch if ch in _ALNUM else " " for ch in text).split()


def _load_stopwords():
    """Return the lower-cased words of stopwords.txt."""
    with open("stopwords.txt") as handle:
        return handle.read().lower().split()


def _kept_words(fn, stopwords):
    """Return the lower-cased words of fn that are not stop words."""
    stop = set(_load_stopwords() if stopwords is None else stopwords)
    # Lower-case what is kept, not just what is compared, so "The" and
    # "the" land in the same bucket.
    return [w.lower() for w in word_count(fn) if w.lower() not in stop]


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def bow_no_hashing(fn, stopwords=None):
    """Return the sorted [[word, count], ...] bag of words of file fn.

    stopwords defaults to the contents of stopwords.txt.
    """
    return _tally(_kept_words(fn, stopwords))


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    G = 37
    return sum(ord(ch) * G ** a for a, ch in enumerate(w)) % M


def bow_hashing(fn, M, stopwords=None):
    """Return the sorted [[hash, count], ...] bag of file fn; M may be str or int."""
    modulus = int(M)
    return _tally(fhash(w, modulus) for w in _kept_words(fn, stopwords))


def main():
    """Run the interactive program."""
    x = input("File name = ")
    b = input("Use feature hashing ? (y,Y,n,N) ")
    # Tuple membership: a substring test would also accept "" and "yY".
    while b not in ("y", "Y", "n", "N"):
        print("Try again.")
        b = input("Use feature hashing ? (y,Y,n,N) ")
    hashing = b in ("y", "Y")
    if hashing:
        M = input("M = ")
    print("-------------------")
    print("char count =", char_count(x))
    print("alphanumeric count =", alphanumeric(x))
    print("line count =", count_line(x))
    print("word count =", len(word_count(x)))
    if hashing:
        print("BoW = ", bow_hashing(x, M))
    else:
        print("BoW = ", bow_no_hashing(x))


if __name__ == "__main__":
    main()
# 6330281421 (27.80) 131 (2021-03-21 12:19)
#--------------------------------------
# Configurable data
stopword_file='stopwords.txt'
# Not used by the tokenizer any more; kept because it is a module-level name.
sp_char='!@#$%^&*()_+{}[]:\";\',./<>?\\=-`'
al_and_nume='abcdefghijklmnopqrstuvwxyz0123456789'
#--------------------------------------
# Function section


def _split(line):
    """Split line into words; every character outside al_and_nume separates.

    Splitting on "not alphanumeric" instead of the sp_char blacklist means
    characters such as '|' and '~' also act as separators.
    """
    return ''.join(ch if ch.lower() in al_and_nume else ' ' for ch in line).split()


def c_count(filename):
    """Return the total length of the stripped lines of filename."""
    with open(filename, 'r') as file:
        return sum(len(line.strip()) for line in file)


def alpha_count(filename):
    """Return how many characters of filename are in al_and_nume (case-insensitive)."""
    with open(filename, 'r') as file:
        return sum(1 for line in file for char in line.strip()
                   if char.lower() in al_and_nume)


def line_count(filename):
    """Return the number of lines in filename."""
    with open(filename, 'r') as file:
        return sum(1 for _ in file)


def word_count(filename):
    """Return the number of words in filename."""
    with open(filename, 'r') as file:
        return sum(len(_split(line.strip())) for line in file)


def list_of_stopwords(filename):
    """Return the lower-cased words found in filename."""
    with open(filename, 'r') as file:
        return [word.lower() for line in file for word in _split(line.strip())]


def fhash(word, m):
    """Return sum(ord(word[i]) * 37**i) reduced modulo m."""
    return sum(ord(char) * 37 ** i for i, char in enumerate(word)) % m


def BoW(filename, condition, m):
    """Return the sorted bag of words of filename.

    condition 'n'/'N' yields [[word, count], ...]; any other value yields
    [[fhash(word, m), count], ...].
    """
    # Load the stop words once instead of re-reading the file per word.
    stopwords = set(list_of_stopwords(stopword_file))
    with open(filename, 'r') as file:
        words = [word.lower() for line in file for word in _split(line.strip())
                 if word.lower() not in stopwords]
    keys = words if condition.lower() == 'n' else [fhash(word, m) for word in words]
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


#--------------------------------------
# Main section
def main():
    """Run the interactive program."""
    file = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    while feature not in ['Y', 'n', 'N', 'y']:
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    if feature.lower() == 'y':
        m = int(input('M = '))
    else:
        m = 0
    print('-------------------')
    print('char count =', c_count(file))
    print('alphanumeric count =', alpha_count(file))
    print('line count =', line_count(file))
    print('word count =', word_count(file))
    print('BoW =', BoW(file, feature, m))


if __name__ == '__main__':
    main()
# 6330282021 (30.00) 132 (2021-03-21 18:23)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    return sum(ord(ch) * 37 ** a for a, ch in enumerate(w)) % M


def words_in_line(line):
    """Return the words of line; only A-Z, a-z and 0-9 are word characters."""
    return ''.join(
        e if ('A' <= e <= 'Z' or 'a' <= e <= 'z' or '0' <= e <= '9') else ' '
        for e in line
    ).split()


def remove_stopwords(line, stop_words):
    """Return the lower-cased words of line that are not in stop_words."""
    return [e for e in words_in_line(line.lower().strip()) if e not in stop_words]


def count(line, word):
    """Remove every occurrence of word from the list line and return how many there were."""
    c = line.count(word)
    if c:
        # Slice-assign in place so the caller's list is emptied of word, as before.
        line[:] = [item for item in line if item != word]
    return c


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


#------------------------------------
def main():
    """Run the interactive program."""
    file_name = input('File name = ')
    ans = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while ans != 'y' and ans != 'n':
        print('Try again.')
        ans = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if ans == 'y':
        m = int(input('M = '))
    print('-------------------')

    # stop words
    with open('stopwords.txt', 'r') as sw_file:
        stop_words = set(sw_file.read().lower().split())

    # statistics and candidate words of the input file
    char_c = al_c = line_c = word_c = 0
    kept = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            line_c += 1
            char_c += len(line)
            for ww in words_in_line(line):
                al_c += len(ww)
                word_c += 1
            kept.extend(remove_stopwords(line, stop_words))

    print('char count =', char_c)
    print('alphanumeric count =', al_c)
    print('line count =', line_c)
    print('word count =', word_c)

    # One dictionary pass replaces the repeated remove-based counting.
    keys = [fhash(k, m) for k in kept] if ans == 'y' else kept
    print('BoW =', _tally(keys))


if __name__ == '__main__':
    main()
# 6330283721 (22.95) 133 (2021-03-22 18:39)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _words(text):
    """Split text into words; every non-alphanumeric character separates."""
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in text).split()


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


#--------------------------------------------------------------------------------------------
def main():
    """Run the interactive program."""
    file_name = input('File name = ',)
    with open(file_name, 'r') as source:
        a = source.read()
    with open('stopwords.txt', 'r') as source:
        b = source.read()

    newlines = a.count("\n")
    char_count = len(a) - newlines
    if not a:
        line_count = 0  # an empty file no longer raises on a[-1]
    elif a.endswith("\n"):
        line_count = newlines
    else:
        line_count = newlines + 1

    # Split on every non-alphanumeric character, so symbols such as '!' or
    # '-' are neither counted as alphanumeric nor glued onto words.
    t = _words(a)
    alnum_count = sum(len(word) for word in t)
    stop = {word.lower() for word in _words(b)}
    b1 = [word.lower() for word in t if word.lower() not in stop]

    while True:
        fh = input('Use feature hashing ? (y,Y,n,N) ',)
        # Tuple membership: a substring test would also accept "".
        if fh in ('n', 'N'):
            keys = b1
            break
        if fh in ('y', 'Y'):
            M = int(input('M = ',))
            keys = [fhash(word, M) for word in b1]
            break
        print('Try again.')

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alnum_count)
    print('line count =', line_count)
    print('word count =', len(t))
    print('BoW =', _tally(keys))


if __name__ == '__main__':
    main()
# 6330284321 (30.00) 134 (2021-03-21 20:15)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.
ualpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
lalpha = "abcdefghijklmnopqrstuvwxyz"
num = "0123456789"


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    return sum(ord(c) * 37 ** i for i, c in enumerate(w)) % M


def _words(line):
    """Return the lower-cased words of line; non-alphanumerics separate."""
    allowed = ualpha + lalpha + num
    return "".join(c.lower() if c in allowed else " " for c in line).split()


def scan_lines(lines):
    """Return (char_count, alnum_count, line_count, words) for an iterable of lines.

    The character count drops only each line's trailing newline, so it is
    right whether or not the last line ends with one.
    """
    charCount = lineCount = 0
    stringList = []
    for line in lines:
        lineCount += 1
        charCount += len(line.rstrip("\n"))
        stringList.extend(_words(line))
    alphaCount = sum(len(word) for word in stringList)
    return charCount, alphaCount, lineCount, stringList


def main():
    """Run the interactive program."""
    fileName = input("File name = ")
    M = None  # None means "no hashing", so any integer M stays usable
    while True:
        tp = input("Use feature hashing ? (y,Y,n,N) ")
        if tp == 'Y' or tp == 'y':
            M = int(input("M = "))
            break
        elif tp == 'N' or tp == 'n':
            break
        else:
            print("Try again.")
    print("-------------------")

    with open(fileName, "r") as f:
        charCount, alphaCount, lineCount, stringList = scan_lines(f)
    print("char count =", charCount)
    print("alphanumeric count =", alphaCount)
    print("line count =", lineCount)
    print("word count =", len(stringList))

    with open("stopwords.txt", "r") as f:
        stopwords = set(f.read().split())

    counts = {}
    for word in stringList:
        if word in stopwords:
            continue
        key = word if M is None else fhash(word, M)
        counts[key] = counts.get(key, 0) + 1
    BoW = [[key, counts[key]] for key in sorted(counts)]
    print("BoW =", BoW)


if __name__ == "__main__":
    main()
# 6330285021 (28.00) 135 (2021-03-22 16:37)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def charcount(file):
    """Return the number of characters in file, newlines excluded."""
    with open(file, "r") as data:
        return sum(1 for line in data for d in line if d != "\n")


def alphacount(file):
    """Return how many characters of file lower-case into a-z or 0-9."""
    num = 0
    with open(file, "r") as data:
        for line in data:
            for d in line:
                d = d.lower()
                if "a" <= d <= "z" or "0" <= d <= "9":
                    num += 1
    return num


def linecount(file):
    """Return the number of lines in file."""
    with open(file, "r") as data:
        return sum(1 for _ in data)


def wordcount(file):
    """Return the lower-cased words of file; non-alphanumerics separate."""
    with open(file, "r") as data:
        text = data.read()
    return "".join(
        d.lower() if ("a" <= d <= "z" or "A" <= d <= "Z" or "0" <= d <= "9") else " "
        for d in text
    ).split()


def _kept_words(file):
    """Return the words of file that are not listed in stopwords.txt."""
    with open("stopwords.txt", "r") as s:
        sf = set(s.read().lower().split())
    return [d for d in wordcount(file) if d not in sf]


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item; [] for no items."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def BoWnohash(file):
    """Return the sorted [[word, count], ...] bag of words of file."""
    return _tally(_kept_words(file))


def BoWhash(file, M):
    """Return the sorted [[hash, count], ...] bag of file; M may be str or int."""
    M = int(M)
    hashed = [
        sum(ord(d) * 37 ** a for a, d in enumerate(c)) % M
        for c in _kept_words(file)
    ]
    # No max()-based sentinel, so a file with no countable words yields [].
    return _tally(hashed)


def main():
    """Run the interactive program."""
    file = input("File name = ")
    while True:
        feature = input("Use feature hashing ? (y,Y,n,N) ")
        if feature in ("y", "Y", "n", "N"):
            break
        print("Try again.")
    hashing = feature in ("y", "Y")
    if hashing:
        M = input("M = ")
    print("-------------------")
    print("char count =", charcount(file))
    print("alphanumeric count =", alphacount(file))
    print("line count =", linecount(file))
    print("word count =", len(wordcount(file)))
    if hashing:
        print("BoW =", BoWhash(file, M))
    else:
        print("BoW =", BoWnohash(file))


if __name__ == "__main__":
    main()
# 6330286621 (30.00) 136 (2021-03-21 06:33)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    G = 37
    return sum(ord(ch) * G ** k for k, ch in enumerate(w)) % M


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive program."""
    file_name = input('File name = ')
    while True:
        Fh = input('Use feature hashing ? (y,Y,n,N) ')
        if Fh.upper() == 'Y':
            M = int(input('M = '))
            break
        elif Fh.upper() == 'N':
            break
        else:
            print('Try again.')

    with open('stopwords.txt', 'r') as fs:
        stopw = set(fs.read().split())

    char_count = alphanumeric_count = 0
    line_count = word_count = 0
    allwords = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            char_count += len(line)
            line_count += 1
            nline = ''.join(ch if ch.isalnum() else ' ' for ch in line).split()
            alphanumeric_count += sum(len(tok) for tok in nline)
            word_count += len(nline)
            allwords.extend(tok.lower() for tok in nline if tok.lower() not in stopw)

    # Each word is hashed exactly once and counted through a dictionary.
    if Fh.upper() == 'Y':
        BoW2list = _tally(fhash(w, M) for w in allwords)
    else:
        BoW2list = _tally(allwords)

    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', BoW2list)


if __name__ == '__main__':
    main()
# 6330288921 (15.00) 137 (2021-03-22 15:57)
# Prints character, alphanumeric, line and word counts for a user-supplied file.
# NOTE(review): M is read but no bag of words is built or printed; that
# part of the program is still unimplemented.


def text_counts(text):
    """Return (char_count, alphanumeric_count, line_count, word_count) for text.

    Characters are counted on the whitespace-stripped text with newlines
    excluded. A word is a maximal run of ASCII letters/digits, so "zoo" is
    counted and "a,b" counts as two words.
    """
    stripped = text.strip()
    char_count = len(stripped) - stripped.count('\n')
    cleaned = ''.join(
        j if ("0" <= j <= "9" or "A" <= j <= "Z" or "a" <= j <= "z") else ' '
        for j in stripped
    )
    words = cleaned.split()
    alnum_count = sum(len(word) for word in words)
    # A final line without a trailing newline still counts as a line.
    line_count = text.count('\n') + (1 if text and not text.endswith('\n') else 0)
    return char_count, alnum_count, line_count, len(words)


#-------------------------------------------------
def main():
    """Run the interactive program."""
    file_name = input('File name = ',)
    fileee = input("Use feature hashing ? (y,Y,n,N) ",)
    while fileee != 'n' and fileee != 'N' and fileee != 'y' and fileee != 'Y':
        print('Try again.')
        fileee = input("Use feature hashing ? (y,Y,n,N) ",)
    if fileee == 'y' or fileee == 'Y':
        fileee = int(input('M = ',))  # read for the prompt sequence; not used yet
    #-------------------------------------------------
    with open(file_name, 'r') as file:
        f1, c1, count2, d = text_counts(file.read())
    print('-------------------')
    print("char count =", f1)
    print("alphanumeric count =", c1)
    print("line count =", count2)
    print("word count =", d)


if __name__ == '__main__':
    main()
# 6330289521 (27.75) 138 (2021-03-22 22:41)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file. The no-argument helpers read the module-level
# names k (input path), x (stop words), Feature and M set by the script.
#---------------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M
#---------------------------------------------
def remove_expression(x):
    """Return x with every character that is not an ASCII letter/digit replaced by a space.

    Whitelisting word characters means separators such as '|' and '~'
    are also blanked out.
    """
    return ''.join(
        c if ('a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9') else ' '
        for c in x
    )
#---------------------------------------------
def bow(l, s):
    """Return the lower-cased words of l that are not in s."""
    return [e for e in remove_expression(l.lower()).split() if e not in s]
#---------------------------------------------
def freq(x):
    """Sort x in place and return [[item, occurrences], ...] in sorted order.

    An empty list yields [].
    """
    x.sort()
    counts = {}
    for item in x:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]
#---------------------------------------------
def line_count():
    """Print the number of lines in file k."""
    with open(k, 'r') as file_name:
        linecount = sum(1 for _ in file_name)
    print('line count =', linecount)
#---------------------------------------------
def word_count():
    """Print the number of words in file k."""
    with open(k, 'r') as file_name:
        wordcount = sum(len(remove_expression(line.rstrip("\n")).split())
                        for line in file_name)
    print('word count =', wordcount)
#---------------------------------------------
def char_count():
    """Print the number of characters in file k, newlines excluded."""
    with open(k, 'r') as file_name:
        charcount = sum(len(line.rstrip("\n")) for line in file_name)
    print('char count =', charcount)
#---------------------------------------------
def alpha_count():
    """Print how many characters of file k lower-case into a-z or 0-9."""
    alphacount = 0
    with open(k, 'r') as file_name:
        for line in file_name:
            for c in line.rstrip("\n").lower():
                if 'a' <= str(c) <= 'z' or str(9) >= str(c) >= str(0):
                    alphacount += 1
    print('alphanumeric count =', alphacount)
#---------------------------------------------
def frequency():
    """Print the bag of words of file k, hashed when Feature is 'y'/'Y'."""
    p = []
    with open(k, 'r') as file_name:
        for line in file_name:
            p += bow(line.rstrip("\n"), x)
    if len(p) == 0:
        print("BoW = []")
        return
    o = freq(p)
    if Feature == "Y" or Feature == "y":
        # Merge the counts of words that collide on the same hash; a
        # dictionary also copes with a single distinct word.
        merged = {}
        for word, times in o:
            code = fhash(word, M)
            merged[code] = merged.get(code, 0) + times
        print("BoW =", [[code, merged[code]] for code in sorted(merged)])
    else:
        print("BoW =", o)
#---------------------------------------------
if __name__ == "__main__":
    file_name = input("File name = ")
    k = file_name
    a = ['y', 'Y', 'n', 'N']
    while True:
        Feature = input("Use feature hashing ? (y,Y,n,N) ")
        if Feature in a:
            break
        else:
            print("Try again.")
    if Feature == "Y" or Feature == "y":
        M = int(input('M = '))
    x = []
    with open("stopwords.txt", "r") as stopwords:
        for line in stopwords:
            x += remove_expression(line).split()
    print('-------------------')
    char_count()
    alpha_count()
    line_count()
    word_count()
    frequency()
# 6330290021 (28.00) 139 (2021-03-22 21:18)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def word(line):
    """Return line lower-cased with every character outside a-z/0-9 replaced by a space."""
    return ''.join(
        i if ('a' <= i <= 'z' or i in '0123456789') else ' '
        for i in line.lower()
    )
#-------------------------------------------------
def stop_words():
    """Return the list of words read from stopwords.txt."""
    stop = []
    with open('stopwords.txt', 'r') as handle:
        for line in handle:
            stop += word(line).split()
    return stop
#-------------------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M
#-------------------------------------------------
def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]
#-------------------------------------------------
def BoW(n):
    """Return the sorted [[word, count], ...] bag of the text n, stop words excluded."""
    stops = set(stop_words())  # read the stop-word file once, not per word
    return _tally(i for i in word(n).split() if i not in stops)
#-------------------------------------------------
def fhash_bow(n, m):
    """Return the sorted [[hash, count], ...] bag of the text n, stop words excluded."""
    stops = set(stop_words())
    return _tally(fhash(i, m) for i in word(n).split() if i not in stops)
#-------------------------------------------------
def word_count(line):
    """Return the number of words in line."""
    return len(word(line).split())
#-------------------------------------------------
def main():
    """Run the interactive program."""
    with open(input('File name = '), 'r') as file_name:
        order = input('Use feature hashing ? (y,Y,n,N) ')
        while order not in ['y', 'Y', 'N', 'n']:
            print('Try again.')
            order = input('Use feature hashing ? (y,Y,n,N) ')
        hashing = order == 'Y' or order == 'y'
        try:
            if hashing:
                m = int(input('M = '))
            print('-' * 19)
            pieces = []
            char = 0
            alphanumeric = 0
            line_c = 0
            w_count = 0
            for line in file_name:
                pieces.append(word(line))
                if line[-1:] == '\n':
                    line = line[:-1]
                line_c += 1
                char += len(line)
                w_count += word_count(line)
                alphanumeric += len(''.join(word(line).split()))
            s = ''.join(pieces)
            bag = fhash_bow(s, m) if hashing else BoW(s)
            # Both branches print the counter `char`, never the builtin `chr`.
            print('char count =', char)
            print('alphanumeric count =', alphanumeric)
            print('line count =', line_c)
            print('word count =', w_count)
            print('BoW =', bag)
        except (ValueError, ZeroDivisionError):
            # Invalid or zero M: print the same blank line as before, but
            # let unrelated errors surface.
            print('')


if __name__ == '__main__':
    main()
# 6330291721 (30.00) 140 (2021-03-21 16:29)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo int(M); M may be str or int."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(M)


def get_unique(words):
    """Return the items of words without duplicates, in first-seen order."""
    unique_words = []
    for i in words:
        if i not in unique_words:
            unique_words.append(i)
    return unique_words


def _count_in_order(items):
    """Return [[item, occurrences], ...] in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


def main():
    """Run the interactive program."""
    file_name = input('File name = ')
    way = input('Use feature hashing ? (y,Y,n,N) ')
    while way not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        way = input('Use feature hashing ? (y,Y,n,N) ')

    char_count = 0
    alphanumeric_count = 0
    line_count = 0
    word = []
    with open(file_name, "r") as infile:
        for line in infile:
            stripped = line.strip()  # strip once per line, not once per character
            char_count += len(stripped)
            line_count += 1
            alphanumeric_count += sum(1 for ch in stripped if ch.isalnum())
            word.extend(
                ''.join(ch.lower() if ch.isalnum() else ' ' for ch in stripped).split()
            )
    word_count = len(word)

    hashing = way == 'y' or way == 'Y'
    if hashing:
        M = int(input('M = '))

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)

    with open('stopwords.txt', 'r') as stopwords:
        stopword = set(stopwords.read().split())

    kept = [w for w in word if w not in stopword]
    # A single dictionary pass keeps the first-appearance order.
    if hashing:
        BoW = _count_in_order(fhash(w, M) for w in kept)
    else:
        BoW = _count_in_order(kept)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330292321 (24.80) 141 (2021-03-22 17:02)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) reduced modulo m."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def d_word(line, stopword=None):
    """Return the cleaned words of line that are not stop words.

    stopword defaults to the whitespace-separated words of stopwords.txt.
    """
    if stopword is None:
        with open('stopwords.txt', 'r') as handle:
            stopword = handle.read().split()
    return [i for i in clean(line).split() if i not in stopword]


def clean(line):
    """Return line lower-cased with every non-alphanumeric character turned into a space.

    Replacing punctuation with a space keeps "hello,world" as two words
    rather than fusing it into one.
    """
    sp = ''
    for char in line:
        code = ord(char)
        if (48 <= code <= 57) or (65 <= code <= 90) or (97 <= code <= 122):
            sp += char.lower()
        else:
            sp += ' '
    return sp


def r_count(l):
    """Return [[item, occurrences], ...] sorted by item; [] for an empty list."""
    counts = {}
    for i in l:
        counts[i] = counts.get(i, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive program."""
    file_name = input('File name = ')
    c = input('Use feature hashing ? (y,Y,n,N) ')
    # Tuple membership: a substring test would also accept "".
    while c not in ('Y', 'y', 'N', 'n'):
        print('Try again.')
        c = input('Use feature hashing ? (y,Y,n,N) ')

    with open('stopwords.txt', 'r') as handle:
        stopword = set(handle.read().split())

    sc = []
    c_count = l_count = w_count = a_count = 0
    with open(file_name, 'r') as file:
        for line in file:
            c_count += len(line) - (1 if line[-1] == '\n' else 0)
            l_count += 1
            tokens = clean(line).split()
            a_count += sum(len(tok) for tok in tokens)
            w_count += len(tokens)
            sc.extend(d_word(line, stopword))

    if c == 'Y' or c == 'y':
        m = int(input('M = '))
        bow = r_count([fhash(i, m) for i in sc])
    else:
        bow = r_count(sc)
    print('-------------------')
    print('char count =', c_count)
    print('alphanumeric count =', a_count)
    print('line count =', l_count)
    print('word count =', w_count)
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330293021 (30.00) 142 (2021-03-21 23:46)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.

# Stop words consulted by readFilename(); filled in by the script below so
# that importing this module does not touch the file system.
stopwords = []


def flash(word, M):
    """Return sum(ord(word[i]) * 37**i) reduced modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M
#----------------------------------------------------------------------
def words(line):
    """Return line with every character that is not an ASCII letter/digit replaced by a space."""
    return ''.join(
        e if ('0' <= e <= '9' or 'a' <= e <= 'z' or 'A' <= e <= 'Z') else ' '
        for e in line
    )
#----------------------------------------------------------------------
def readFilename(Filename):
    """Print the four counts for the open file Filename and return its kept words.

    Filename is an iterable of lines. The returned list holds the
    lower-cased words that are not in the module-level `stopwords`.
    """
    char_total = 0
    alp_total = 0
    line_count = 0
    tokens = []
    for line in Filename:
        # Keep running totals rather than growing strings just to take len().
        char_total += len(line.strip('\n'))
        alp_total += sum(
            1 for e in line
            if '0' <= e <= '9' or 'a' <= e <= 'z' or 'A' <= e <= 'Z'
        )
        tokens.extend(words(line).split())
        line_count += 1
    print('char count =', char_total)
    print('alphanumeric count =', alp_total)
    print('line count =', line_count)
    print('word count =', len(tokens))
    return [e.lower().strip() for e in tokens if e.lower().strip() not in stopwords]
#----------------------------------------------------------------------
def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]
#----------------------------------------------------------------------
if __name__ == '__main__':
    with open('stopwords.txt', 'r') as stoptext:
        stopwords = stoptext.read().split()
    with open(input('Filename = '), 'r') as Filename:
        cmd = input('Use feature hashing ? (y,Y,n,N) ')
        while cmd.lower() != 'n' and cmd.lower() != 'y':
            print('Try again.')
            cmd = input('Use feature hashing ? (y,Y,n,N) ')
        if cmd.lower() == 'y':
            M = int(input('M = '))
            print('-------------------')
            new_words = readFilename(Filename)
            # Hash each word once instead of once per comparison.
            print('BoW =', _tally(flash(word, M) for word in new_words))
        else:
            new_words = readFilename(Filename)
            print('BoW =', _tally(new_words))
# 6330294621 (10.66) 143 (2021-03-22 02:09)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    G = 37
    return sum(ord(e) * G ** c for c, e in enumerate(w)) % M


def countword(a, b):
    """Return how many elements of b are equal to a."""
    return sum(1 for item in b if a == item)


def _is_alnum(ch):
    """Return True when ch is an ASCII letter (either case) or digit."""
    return 'a' <= ch.lower() <= 'z' or '0' <= ch <= '9'


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive program."""
    # sample.txt
    file_name = input('File Name = ').strip()
    with open(file_name, 'r') as f:
        # Remove only the newline; slicing off the last character would
        # eat a real character when the final line has no trailing newline.
        lines = [line.rstrip('\n') for line in f]
    linecount = len(lines)
    char_count = sum(len(line) for line in lines)
    alphanumericcount = sum(1 for line in lines for ch in line if _is_alnum(ch))

    # Any non-alphanumeric character separates words.
    word = [tok for line in lines
            for tok in ''.join(ch if _is_alnum(ch) else ' ' for ch in line).split()]
    word_count = len(word)

    with open('stopwords.txt', 'r') as stopwordsop:
        stopwords = set(stopwordsop.read().split())
    wnst = [tok.lower() for tok in word if tok.lower() not in stopwords]

    hon = input('Use feature hashing ? (y,Y,n,N) ')
    while hon not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        hon = input('Use feature hashing ? (y,Y,n,N) ')
    if hon in ['y', 'Y']:
        M = int(input('M = '))
        BoW = _tally(fhash(tok, M) for tok in wnst)
    else:
        # Sorted in both modes.
        BoW = _tally(wnst)
    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alphanumericcount))
    print('line count = ' + str(linecount))
    print('word count = ' + str(word_count))
    print('BoW = ', BoW)


if __name__ == '__main__':
    main()
# 6330295221 (18.90) 144 (2021-03-22 15:10)
# Prints text statistics and a bag of words (optionally feature-hashed)
# for a user-supplied file.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def count(data, element):
    """Return how many items of data are equal to element."""
    return sum(1 for e in data if e == element)


Alphabet = 'abcdefghijklmnopqrstuvwxyz'
alphabet = Alphabet.lower()
ALPHABET = Alphabet.upper()
numbers = '0123456789'  # includes '0', so zero counts as a word character
everyt = alphabet + ALPHABET + numbers


def _tally(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive program."""
    y = input('File name = ')
    with open('stopwords.txt', 'r') as stop_words:
        stop_wordsl = set(stop_words.read().split())

    char = 0
    numeng = 0
    linec = 0
    pieces = []
    with open(y, 'r') as file_name:
        for line2 in file_name:
            pieces.append(''.join(e if e in everyt else ' ' for e in line2))
            line2st = line2.strip()
            linec += 1
            char += len(line2st)
            numeng += sum(1 for e in line2st if e in everyt)
    olen = ''.join(pieces)
    word_count = len(olen.split())
    bow = [e for e in olen.lower().split() if e not in stop_wordsl]

    x = input('Use feature hashing ? (y,Y,n,N) ')
    # Tuple membership: a substring test would also accept "" and "yY".
    while x not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ')
    if x == 'y' or x == 'Y':
        z = int(input('M = '))
        keys = [fhash(e, z) for e in bow]
    else:
        keys = bow
    print('-------------------')
    print('char count =', char)
    print('alphanumeric count =', numeng)
    print('line count =', linec)
    print('word count =', word_count)
    # Sorted in both modes.
    print('BoW =', _tally(keys))


if __name__ == '__main__':
    main()
# 6330296921 (30.00) 145 (2021-03-22 21:54)
# Bag-of-words report with optional feature hashing.


def fharsh(w, M):
    """Return the feature hash of ``w`` modulo ``M`` as a decimal string.

    ``M`` may be an int or a numeric string; it is converted with ``int``.
    """
    M = int(M)
    return str(sum(ord(c) * 37 ** n for n, c in enumerate(w)) % M)


def main():
    """Prompt for the inputs and print the report."""
    alnum = '0123456789abcdefghijklmnopqrstuvwxyz'

    file_name = input('File name = ')
    fh_para = input('Use feature hashing ? (y,Y,n,N) ')

    with open('stopwords.txt', 'r') as fs:
        list_sw = [w for line in fs for w in line.lower().split()]

    char_count = 0
    alphanumeric_count = 0
    line_count = 0
    list_text = []
    with open(file_name, 'r') as fn:
        for line in fn:
            line_count += 1
            line = line.strip().lower()
            char_count += len(line)
            alphanumeric_count += sum(1 for c in line if c in alnum)
            # Anything that is not a letter or digit separates words.
            spaced = ''.join(c if c in alnum else ' ' for c in line)
            list_text += spaced.split()
    word_count = len(list_text)
    no_stop_word = [w for w in list_text if w not in list_sw]

    # The original condition `fh_para != 'n'or'N'or'y'or'Y'` was always
    # true and only terminated through `break`; validate explicitly instead.
    while fh_para not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        fh_para = input('Use feature hashing ? (y,Y,n,N) ')

    if fh_para in ('y', 'Y'):
        M = input('M = ')
        items = [int(fharsh(w, M)) for w in no_stop_word]
    else:
        items = no_stop_word

    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    BoW = sorted([item, n] for item, n in counts.items())

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330298121 (19.85) 146 (2021-03-22 23:25)
# Bag-of-words report with optional feature hashing.


# func about BoW
def fhash(a, b):
    """Return ``sum(ord(a[i]) * 37**i) % b`` for the word ``a``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(a)) % b


def _alnum_runs(text):
    """Return the maximal runs of ASCII letters/digits found in ``text``."""
    runs = []
    current = ''
    for e in text:
        if 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9':
            current += e
        elif current:
            runs.append(current)
            current = ''
    # Flush the run still open at the end of the text; without this a word
    # ending a line would be glued to the first word of the next line.
    if current:
        runs.append(current)
    return runs


def main():
    """Prompt for the inputs and print the report."""
    # input data
    file_name = input("file name = ")
    b = input("Use feature hashing ? (y,Y,n,N) ")
    while b not in ['y', 'Y', 'n', 'N']:
        print("Try again")
        b = input("Use feature hashing ? (y,Y,n,N) ")
    fh = b in ['y', 'Y']
    if fh:
        M = int(input("M = "))
    print('-------------------')

    # count
    charc = 0
    alpc = 0
    linec = 0
    wcl = []
    with open(file_name) as fn:
        for line in fn:
            stripped = line.strip()
            charc += len(stripped)
            words = _alnum_runs(stripped)
            alpc += sum(len(w) for w in words)
            wcl.extend(words)
            linec += 1
    wordc = len(wcl)

    with open("stopwords.txt", "r") as stw:
        stwl = [w for line in stw for w in line.strip().split()]
    bowcl = [e.lower() for e in wcl if e.lower() not in stwl]

    # Count with a dict instead of pre-allocating M buckets, which needed
    # memory proportional to M regardless of the text size.
    items = [fhash(e, M) for e in bowcl] if fh else bowcl
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    bow = sorted([item, n] for item, n in counts.items())

    # print
    print("char count =", charc)
    print("alphanumeric count =", alpc)
    print("line count =", linec)
    print("word count =", wordc)
    print("BoW =", bow)


if __name__ == '__main__':
    main()
# 6330299821 (30.00) 147 (2021-03-21 20:46)
# ====================================================
# Bag-of-words report with optional feature hashing.
alphanumeric = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'

# Stop words loaded by main(); BagofWords falls back to this list.
stopwords = []


def op_check():
    """Ask whether to hash until the answer is valid.

    Returns ``(answer, M)`` where ``M`` is the int modulus for 'y'/'Y' and
    the empty string for 'n'/'N'.
    """
    while True:
        a = input('Use feature hashing ? (y,Y,n,N) ', )
        if a == 'y' or a == 'Y':
            M = int(input('M = ', ))
            return a, M
        elif a == 'n' or a == 'N':
            return a, ''
        print('Try again.')


def charcount(line):
    """Return the length of ``line`` without surrounding whitespace."""
    return len(line.strip())


def alphanumericcount(line):
    """Return the number of ASCII letters and digits in ``line``."""
    return sum(1 for ch in line if ch in alphanumeric)


def wordcount(line):
    """Return ``(count, words)`` for the lower-cased words of ``line``.

    Every character outside ``alphanumeric`` acts as a word separator.
    """
    spaced = ''.join(ch.lower() if ch in alphanumeric else ' ' for ch in line)
    words = spaced.split()
    return len(words), words


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def BagofWords(words, M, stop_list=None):
    """Return the sorted ``[item, count]`` pairs for ``words``.

    Words found in ``stop_list`` (default: the module-level ``stopwords``)
    are dropped.  When ``M`` is an int each word is replaced by
    ``fhash(word, M)`` before counting; any other ``M`` disables hashing.
    """
    if stop_list is None:
        stop_list = stopwords
    w = [i for i in words if i not in stop_list]
    if isinstance(M, int):
        w = [fhash(i, M) for i in w]
    w.sort()
    temp = []
    for item in w:
        # Sorted input: equal items are adjacent, so extend the last pair.
        if temp and temp[-1][0] == item:
            temp[-1][1] += 1
        else:
            temp.append([item, 1])
    return temp


# ----------------------------------------------------
def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ', )
    a, M = op_check()

    stopwords.clear()
    with open('stopwords.txt') as x:
        for line in x:
            stopwords.extend(line.strip().split())

    c = 0
    d = 0
    f = 0
    g = 0
    words = []
    with open(file_name) as y:
        for line in y:
            c += charcount(line)
            d += alphanumericcount(line)
            f += 1
            # Tokenise each line once and reuse both results.
            n, line_words = wordcount(line)
            g += n
            words += line_words

    print('-------------------')
    print('char count =', c)
    print('alphanumeric count =', d)
    print('line count =', f)
    print('word count =', g)
    print('BoW =', BagofWords(words, M))


if __name__ == '__main__':
    main()
# 6330300721 (17.30) 148 (2021-03-20 23:58)
# Bag-of-words report with optional feature hashing.

# Path of the text file being analysed; set by main() and read by the
# zero-argument counting functions below.
file_name = ''

# Kept from the original submission; nothing in this program reads it.
f = ['(', ')', '-', '_', '[', ']', '"', "'", ';', ':', '>', '<', '.', '/', '\\']
alpha_up = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
alpha_low = list('abcdefghijklmnopqrstuvwxyz')
number = list('1234567890')


def l2_non(x):
    """Return the items of ``x`` without duplicates, first occurrence first."""
    l = []
    for e in x:
        if e not in l:
            l.append(e)
    return l


def sortt(data):
    """Return every integer from ``min(data)`` to ``max(data)`` inclusive.

    Returns an empty list for empty ``data``.
    """
    if not data:
        return []
    return list(range(min(data), max(data) + 1))


def count(data, t):
    """Return the number of items in ``data`` equal to ``t``."""
    return sum(1 for e in data if e == t)


def ceep(data):
    """Return ``[value, count]`` for every integer in ``sortt(data)``.

    Values inside the range that never occur are listed with count 0.
    """
    tally = {}
    for e in data:
        tally[e] = tally.get(e, 0) + 1
    return [[e, tally.get(e, 0)] for e in sortt(data)]


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    return sum(ord(e) * 37 ** i for i, e in enumerate(w)) % M


def _is_alnum(e):
    """Return True when ``e`` is an ASCII letter or digit."""
    return e in alpha_up or e in alpha_low or e in number


def _stripped_lines():
    """Return the lines of ``file_name`` without surrounding whitespace."""
    with open(file_name, 'r') as file:
        return [line.strip() for line in file]


def _words():
    """Return the lower-cased words of ``file_name``.

    Each line is tokenised on its own so that the last word of one line is
    never glued to the first word of the next.
    """
    words = []
    for line in _stripped_lines():
        spaced = ''.join(e.lower() if _is_alnum(e) else ' ' for e in line)
        words.extend(spaced.split())
    return words


def _kept_words():
    """Return the words of ``file_name`` that are not stop words."""
    with open('stopwords.txt', 'r') as stop:
        s = [k for line in stop for k in line.strip().split()]
    return [e for e in _words() if e not in s]


def char_count():
    """Return the number of characters, ignoring line-edge whitespace."""
    return sum(len(line) for line in _stripped_lines())


def alphanumeric_count():
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for line in _stripped_lines() for e in line if _is_alnum(e))


def line_count():
    """Return the number of lines in the file."""
    return len(_stripped_lines())


def word_count():
    """Return the number of words in the file."""
    return len(_words())


def BoW_y(q, M):
    """Return the hashed bag of words sorted by hash value (``q`` unused)."""
    b = [fhash(e, M) for e in _kept_words()]
    return [[h, count(b, h)] for h in sorted(set(b))]


def BoW_n(q):
    """Return the bag of words in first-occurrence order (``q`` unused)."""
    l2 = _kept_words()
    return [[e, count(l2, e)] for e in l2_non(l2)]


def main():
    """Prompt for the inputs and print the report."""
    global file_name
    file_name = input('File name = ')
    q = input('Use feature hashing ? (y,Y,n,N) ')
    while q not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        q = input('Use feature hashing ? (y,Y,n,N) ')
    if q == 'y' or q == 'Y':
        M = int(input('M = '))

    print('-------------------')
    print('char count = ', char_count())
    print('alphanumeric count = ', alphanumeric_count())
    # The line count was missing from the original report.
    print('line count = ', line_count())
    print('word count = ', word_count())
    if q == 'y' or q == 'Y':
        print('BoW = ', BoW_y(q, M))
    else:
        print('BoW = ', BoW_n(q))


if __name__ == '__main__':
    main()
# 6330301321 (30.00) 149 (2021-03-22 22:57)
# Bag-of-words report with optional feature hashing.
G = 37


# -------------
def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    stopwords = getWordsList('stopwords.txt')
    myWords = normalized(getWordsList(file_name), stopwords)
    while True:
        isfHash = input('Use feature hashing ? (y,Y,n,N) ')
        if isfHash in ['Y', 'y']:
            M = int(input('M = '))
            print('-------------------')
            report(file_name)
            print('BoW =', bagWords([fHash(w, M) for w in myWords]))
            break
        elif isfHash in ['N', 'n']:
            print('-------------------')
            report(file_name)
            print('BoW =', bagWords(myWords))
            break
        else:
            print('Try again.')


# -------------
def getRidPunc(mystring):
    """Return ``mystring`` with every non-alphanumeric char turned into ' '."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in mystring)


def getWordsList(file_name):
    """Return every word of ``file_name`` (original case) in reading order."""
    with open(file_name, 'r') as f:
        return [w for line in f for w in getRidPunc(line).split()]


def report(file_name):
    """Print the char, alphanumeric, line and word counts of ``file_name``."""
    chCount = 0
    alnumCount = 0
    wCount = 0
    lCount = 0
    with open(file_name, 'r') as f:
        for line in f:
            chCount += sum(1 for ch in line if ch != '\n')
            alnumCount += sum(1 for ch in line if ch.isalnum())
            wCount += len(getRidPunc(line).split())
            lCount += 1
    print('char count =', chCount)
    print('alphanumeric count =', alnumCount)
    print('line count =', lCount)
    print('word count =', wCount)


def normalized(wArr, stopwords):
    """Return the lower-cased alphanumeric words of ``wArr`` not in ``stopwords``."""
    return [w.lower() for w in wArr
            if w.isalnum() and w.lower() not in stopwords]


def bagWords(wArr):
    """Sort ``wArr`` in place and return its ``[item, count]`` pairs."""
    wArr.sort()
    BoW = []
    for w in wArr:
        # Sorted input: equal items are adjacent, so extend the last pair.
        if BoW and BoW[-1][0] == w:
            BoW[-1][1] += 1
        else:
            BoW.append([w, 1])
    return BoW


def fHash(w, M):
    """Return ``sum(ord(w[i]) * G**i) % M`` for the word ``w``."""
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


# -------------
if __name__ == '__main__':
    main()
# 6330302021 (20.00) 150 (2021-03-22 23:41)
# Bag-of-words report with optional feature hashing.
listtypeinput = ['y', 'Y', 'n', 'N']


def _is_alnum(ch):
    """Return True when ``ch`` is an ASCII letter or digit."""
    code = ord(ch)
    return 65 <= code <= 90 or 97 <= code <= 122 or 48 <= code <= 57


def _load_stopwords(path='stopwords.txt'):
    """Return the whitespace-separated, lower-cased words stored in ``path``."""
    with open(path, 'r') as handle:
        return handle.read().lower().split()


def BoW(file, stopword=None):
    """Return ``(bag, word_count)`` for the open text file ``file``.

    ``bag`` is the sorted list of ``[word, count]`` pairs for the
    lower-cased words that are not in ``stopword``; ``word_count`` is the
    total number of words, stop words included.  ``stopword`` defaults to
    the contents of ``stopwords.txt`` (the original used a fixed list of 14
    words).  ``file`` is rewound with ``seek(0)`` before returning.
    """
    if stopword is None:
        stopword = _load_stopwords()
    counts = {}
    word_count = 0
    for line in file:
        spaced = ''.join(p if _is_alnum(p) else ' ' for p in line).lower()
        for e in spaced.split():
            counts[e] = counts.get(e, 0) + 1
            word_count += 1
    file.seek(0)
    # Filtering every entry also covers the alphabetically first word, which
    # the original always kept, and an empty file no longer raises.
    BoWn = sorted([w, n] for w, n in counts.items() if w not in stopword)
    return BoWn, word_count


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % int(M)`` for the word ``w``."""
    M = int(M)
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def main():
    """Prompt for the inputs and print the report."""
    char_count = 0
    line_count = 0
    alphanumeric_count = 0
    # `with` guarantees the close that the original `file.close` (missing
    # call parentheses) never performed.
    with open(input('File name = ', ), 'r') as file:
        for line in file:
            for ch in line:
                if ch != '\n':
                    char_count += 1
                if _is_alnum(ch):
                    alphanumeric_count += 1
            # Blank lines are lines too.
            line_count += 1
        file.seek(0)
        BoWn, word_count = BoW(file)

    while True:
        typeinput = input('Use feature hashing ? (y,Y,n,N) ', )
        if typeinput in listtypeinput:
            break
        print('Try again.')

    if typeinput == 'y' or typeinput == 'Y':
        M = input('M = ', )
        totals = {}
        for word, n in BoWn:
            h = fhash(word, M)
            totals[h] = totals.get(h, 0) + n
        bag = sorted([h, n] for h, n in totals.items())
    else:
        bag = BoWn

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', bag)


if __name__ == '__main__':
    main()
# 6330303621 (21.90) 151 (2021-03-21 22:13)
# Bag-of-words report with optional feature hashing.

# Lower-cased words of the analysed file; filled by main() and used as the
# default word list of count_words().
g = []


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def count_words(w, words=None):
    """Return how often ``w`` occurs in ``words`` (default: the global ``g``)."""
    if words is None:
        words = g
    return sum(1 for item in words if item == w)


def _tokenize(text):
    """Split ``text`` into words; every non-alphanumeric char is a separator."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text).split()


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ',)
    a = input("Use feature hashing ? (y,Y,n,N) ",)
    while a != 'n' and a != 'N' and a != 'y' and a != 'Y':
        print('Try again.')
        a = input("Use feature hashing ? (y,Y,n,N) ",)
    if a == 'y' or a == 'Y':
        b = int(input('M = ',))
    print('-------------------')

    with open('stopwords.txt', 'r') as sw:
        sww = sw.read().strip().split()
    with open(file_name, 'r') as f:
        raw = f.read()
    ff1 = raw.strip()

    print('char count =', sum(1 for ch in ff1 if ch != '\n'))
    print('alphanumeric count =', sum(1 for ch in ff1 if ch.isalnum()))

    # One line per newline, plus a final line that has no newline.
    count3 = raw.count('\n')
    if raw and not raw.endswith('\n'):
        count3 += 1
    print('line count =', count3)

    # Punctuation splits words instead of being deleted from inside them,
    # and the word count is simply the number of tokens.
    tokens = _tokenize(ff1)
    print('word count =', len(tokens))

    g[:] = [w.lower() for w in tokens]
    kept = [w for w in g if w not in sww]

    if a == 'y' or a == 'Y':
        totals = {}
        for w in kept:
            h = fhash(w, b)
            totals[h] = totals.get(h, 0) + 1
        BoW = sorted([h, n] for h, n in totals.items())
    else:
        BoW = [[w, count_words(w)] for w in sorted(set(kept))]
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330304221 (30.00) 152 (2021-03-21 17:42)
# [Done] file_name --> Get the words from filename.txt
# [Done] stopwords --> Get stop words from stopwords.txt
# [Done] while feature hashing not in ('y', 'n') --> lowercase
# [Done] if 'y': Use fhash --> M = ?
# Output: 1. [Done] character count
#         2. [Done] alphanumeric count --> isalnum
#         3. [Done] line count
#         4. [Done] word count
#         5. [Done] Bow --> (Use fhash or not)
# ---------------------------------------------------------------

# Stop words; filled by test() so that importing this code does no file I/O.
stopwords = []


def _load_stopwords(path='stopwords.txt'):
    """Return the lower-cased, whitespace-separated words stored in ``path``."""
    with open(path, 'r') as stopwords_file:
        return [w for line in stopwords_file
                for w in line.strip().lower().split()]


def test():
    """Prompt for the inputs and print the report."""
    stopwords[:] = _load_stopwords()

    file_name = input('File name = ')  # somename.txt
    hashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while hashing not in ('y', 'n'):
        print('Try again.')
        hashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if hashing == 'y':
        useFeatureHash = True
        m_value = int(input('M = '))
    else:
        useFeatureHash, m_value = False, None
    print('-------------------')

    char_count, alnum_count, line_count, word_count = 0, 0, 0, 0
    text = []
    with open(file_name, 'r') as testfile:
        for line in testfile:
            stripped = line.strip()
            # 1. character count
            char_count += len(stripped)
            # 2. alphanumeric count
            alnum_count += sum(1 for char in stripped if char.isalnum())
            # 3. line count
            line_count += 1
            # 4. word count (non-alphanumeric characters separate words)
            spaced = ''.join(c if c.isalnum() else ' ' for c in stripped)
            words = spaced.lower().split()
            word_count += len(words)
            text.extend(words)

    # Output
    print('char count =', char_count)
    print('alphanumeric count =', alnum_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', bag_of_word(text, useFeatureHash, m_value))


# The interactive entry point is named `test`; this flag stops pytest from
# collecting and running it as a test case.
test.__test__ = False


# ---------------------------------------------------------------
def unique_of(words):
    """Return the items of ``words`` without duplicates, first seen first."""
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return unique_words


def remove_stopwords(words):
    """Return the items of ``words`` that are not in the global ``stopwords``."""
    return [word for word in words if word not in stopwords]


def bag_of_word(text: list, useFeatureHash: bool, m_value):
    """Return ``[item, count]`` pairs for ``text`` in first-occurrence order.

    Stop words are removed first.  With ``useFeatureHash`` each remaining
    word is replaced by ``fhash(word, m_value)`` before counting.
    """
    kept = remove_stopwords(text)
    if useFeatureHash:
        kept = [fhash(word, m_value) for word in kept]
    # dicts preserve insertion order, giving first-occurrence ordering.
    counter = {}
    for item in kept:
        counter[item] = counter.get(item, 0) + 1
    return [[item, n] for item, n in counter.items()]


def fhash(word, m_value):
    """Return ``sum(ord(word[i]) * 37**i) % m_value``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % m_value


# ---------------------------------------------------------------
if __name__ == '__main__':
    test()
# 6330305921 (26.00) 153 (2021-03-21 22:57)
#Prog-08: Bag-of-words
#6330305921 (26.00) Pras Pitasawad

# Hashing switch and modulus read by BoW(); main() sets both from the prompts.
m = False
M = 1


def _is_alnum(e):
    """Return True when ``e`` is an ASCII letter or digit."""
    return '0' <= e <= '9' or 'A' <= e <= 'Z' or 'a' <= e <= 'z'


def char(x):
    """Return, as a string, the character count of file ``x`` without newlines."""
    k = 0
    with open(x, 'r') as f:
        for line in f:
            k += len(line) - 1 if line[-1] == '\n' else len(line)
    return str(k)


def alphacount(x):
    """Return, as a string, the number of ASCII letters/digits in file ``x``."""
    with open(x, 'r') as f:
        return str(sum(1 for line in f for e in line if _is_alnum(e)))


def linecount(x):
    """Return, as a string, the number of lines in file ``x``."""
    with open(x, 'r') as f:
        return str(sum(1 for _ in f))


def word(x):
    """Return the words of file ``x``; non-alphanumerics separate words."""
    with open(x, 'r') as f:
        y = ''.join(e if _is_alnum(e) else ' ' for line in f for e in line)
    return y.strip().split()


def flash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def BoW(words, stop):
    """Return the sorted ``[item, count]`` pairs for ``words`` minus ``stop``.

    Both lists are compared case-insensitively.  When the global ``m`` is
    true, words are replaced by ``flash(word, M)`` (global ``M``) first.
    Returns an empty list when no word survives the stop-word filter.
    """
    stop_lower = [s.lower() for s in stop]
    x = [w.lower() for w in words if w.lower() not in stop_lower]
    items = sorted(flash(e, M) for e in x) if m else sorted(x)
    bag = []
    for item in items:
        # Sorted input: equal items are adjacent, so extend the last pair.
        if bag and bag[-1][0] == item:
            bag[-1][1] += 1
        else:
            bag.append([item, 1])
    return bag


def main():
    """Prompt for the inputs and print the report."""
    global m, M
    file_name = input('File name = ')
    while True:
        x = input('Use feature hashing ? (y,Y,n,N) ')
        if x == 'y' or x == 'Y':
            m = True
            M = int(input('M = '))
            break
        elif x == 'n' or x == 'N':
            m = False
            break
        else:
            print('Try again.')

    words = word(file_name)
    print('-------------------')
    print('char count = ' + char(file_name))
    print('alphanumeric count = ' + alphacount(file_name))
    print('line count = ' + linecount(file_name))
    print('word count = ' + str(len(words)))
    print('BoW = ' + str(BoW(words, word('stopwords.txt'))))


if __name__ == '__main__':
    main()
# 6330306521 (24.07) 154 (2021-03-22 21:28)
# Bag-of-words report with optional feature hashing.


def _is_alnum(i):
    """Return True when ``i`` is an ASCII letter or digit."""
    return 'a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9'


def fhash(a, b):
    """Return the feature hash of the lower-cased word ``a`` modulo ``int(b)``."""
    return sum(ord(ch) * (37 ** d) for d, ch in enumerate(a.lower())) % int(b)


def count(a):
    """Return the words of ``a``; every non-alphanumeric char is a separator."""
    return ''.join(i if _is_alnum(i) else ' ' for i in a).split()


def countalpha(a):
    """Return the number of ASCII letters and digits in ``a``."""
    return sum(1 for i in a if _is_alnum(i))


def _run_lengths(items):
    """Return ``[item, count]`` pairs for the sorted copy of ``items``."""
    pairs = []
    for item in sorted(items):
        if pairs and pairs[-1][0] == item:
            pairs[-1][1] += 1
        else:
            pairs.append([item, 1])
    return pairs


def main():
    """Prompt for the inputs and print the report."""
    with open(input('File name = ')) as file_name:
        aa = file_name.read()

    b = input('Use feature hashing ? (y,Y,n,N) ')
    # Tuple membership: `b not in 'yYnN'` is a substring test and would
    # accept '' or 'yY' as valid answers.
    while b not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = b == 'y' or b == 'Y'
    if use_hash:
        m = input('M = ')
    print('-------------------')

    # The stop-word file is named stopwords.txt, not stopword.txt.
    with open('stopwords.txt') as stop:
        stopword = stop.read().split()

    # One line per newline, plus a final line that has no newline.
    nuba = aa.count('\n')
    if aa and not aa.endswith('\n'):
        nuba += 1

    words = count(aa)
    # Count non-newline characters directly; `len(aa) - nuba + 1` was off by
    # one whenever the file ended with a newline.
    print('char count = ' + str(sum(1 for ch in aa if ch != '\n')))
    print('alphanumeric count = ' + str(countalpha(aa)))
    print('line count = ' + str(nuba))
    print('word count = ' + str(len(words)))

    kept = [i.lower() for i in words if i.lower() not in stopword]
    if use_hash:
        kept = [fhash(i, m) for i in kept]
    print('BoW = ', _run_lengths(kept))


if __name__ == '__main__':
    main()
# 6330308821 (30.00) 155 (2021-03-20 16:16)
# Bag-of-words report with optional feature hashing.

_ALNUM = "abcdefghijklmnopqrstuvwxyz0123456789"


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _load_stopwords(path="stopwords.txt"):
    """Return the whitespace-separated words stored in ``path``."""
    with open(path, "r") as f:
        return [e for line in f for e in line.split()]


def _tokens(text):
    """Return the lower-cased words of ``text``; non-alphanumerics separate."""
    return "".join(e if e in _ALNUM else " " for e in text.lower()).split()


def cut_stopwords(text, stopwords=None):
    """Return the lower-cased words of ``text`` that are not stop words.

    ``stopwords`` defaults to the contents of ``stopwords.txt``; pass the
    list explicitly to avoid re-reading the file on every call.
    """
    if stopwords is None:
        stopwords = _load_stopwords()
    return [e for e in _tokens(text) if e not in stopwords]


def chr_count_lst(l_text):
    """Return the total length of the strings in ``l_text``."""
    return sum(len(e) for e in l_text)


def alp_count(l_text):
    """Return the number of ASCII letters and digits across ``l_text``."""
    return sum(1 for e in l_text for k in e if k.lower() in _ALNUM)


def words_count(l_text):
    """Return the number of words across the lines of ``l_text``."""
    return sum(len(_tokens(e)) for e in l_text)


def _bag(items):
    """Return the sorted ``[item, count]`` pairs of ``items``."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def BoW(l_text, stopwords=None):
    """Return the sorted bag of words for the lines of ``l_text``."""
    if stopwords is None:
        stopwords = _load_stopwords()
    return _bag(w for e in l_text for w in cut_stopwords(e, stopwords))


def BoW_fhash(l_text, M, stopwords=None):
    """Return the sorted bag of ``fhash(word, M)`` values for ``l_text``."""
    if stopwords is None:
        stopwords = _load_stopwords()
    return _bag(fhash(w, M) for e in l_text
                for w in cut_stopwords(e, stopwords))


def main():
    """Prompt for the inputs and print the report."""
    fn = input("File name = ")
    con = input("Use feature hashing ? (y,Y,n,N) ")
    while con not in ["y", "Y", "n", "N"]:
        print("Try again.")
        con = input("Use feature hashing ? (y,Y,n,N) ")

    with open(fn, "r") as handle:
        f = handle.read().splitlines()

    if con in ["y", "Y"]:
        M = int(input("M = "))
    print("-------------------")
    print("char count =", chr_count_lst(f))
    print("alphanumeric count =", alp_count(f))
    print("line count =", len(f))
    print("word count =", words_count(f))
    if con in ["y", "Y"]:
        print("BoW =", BoW_fhash(f, M))
    else:
        print("BoW =", BoW(f))


if __name__ == "__main__":
    main()
# 6330309421 (0.00) 156 (2021-03-22 21:01)
# Bag-of-words report with optional feature hashing.
#
# Every helper now reads the file it is given; the original versions
# ignored their argument and always opened 'sample.txt'.


def _is_alnum(ch):
    """Return True when ``ch`` is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _lines(path):
    """Return the lines of ``path`` without surrounding whitespace."""
    with open(path, 'r') as handle:
        return [line.strip() for line in handle]


def _words(path):
    """Return the words of ``path``; non-alphanumerics separate words."""
    found = []
    for line in _lines(path):
        found.extend(''.join(c if _is_alnum(c) else ' ' for c in line).split())
    return found


def countchar(txt):
    """Return the character count of file ``txt`` (line edges stripped)."""
    return sum(len(line) for line in _lines(txt))


def countalpnum(txt):
    """Return the number of ASCII letters and digits in file ``txt``."""
    return sum(1 for line in _lines(txt) for c in line if _is_alnum(c))


def linecount(txt):
    """Return the number of lines in file ``txt``."""
    return len(_lines(txt))


def wordcount(txt):
    """Return the number of words in file ``txt``."""
    return len(_words(txt))


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % int(M)`` for the word ``w``."""
    G = 37
    return sum(ord(ch) * (G ** e) for e, ch in enumerate(w)) % int(M)


def stopwords(s):
    """Return the words stored in file ``s`` joined by single spaces."""
    with open(s, 'r') as handle:
        return ' '.join(handle.read().split())


def count(a):
    """Return the sorted ``[item, occurrences]`` pairs of ``a``."""
    tally = {}
    for item in a:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, n] for item, n in tally.items())


def word(txt, stop_path='stopwords.txt'):
    """Return the sorted, lower-cased words of ``txt`` that are not stop words.

    Duplicates are kept so that ``count`` can report real frequencies, and
    stop words are matched as whole words rather than as substrings.
    """
    stop = stopwords(stop_path).split()
    return sorted(w.lower() for w in _words(txt) if w.lower() not in stop)


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    w = word(file_name)
    while True:
        fhashkey = input('Use feature hashing? (y,Y,n,N) ')
        if fhashkey == 'y' or fhashkey == 'Y':
            M = input('M = ')
            BoW = count([fhash(i, M) for i in w])
            break
        elif fhashkey == 'n' or fhashkey == 'N':
            BoW = count(w)
            break
        print('Try again.')

    print('-------------------')
    print('char count =', countchar(file_name))
    print('alphanumeric count =', countalpnum(file_name))
    print('line count =', linecount(file_name))
    print('word count =', wordcount(file_name))
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330310021 (28.00) 157 (2021-03-22 22:07)
# Bag-of-words report with optional feature hashing.
#
# The helpers take any iterable of text lines (an open file or a list).

_ALNUM = "abcdefghijklmnopqrstuvwxyz0123456789"

# Defaults used by BoW / BoW_Hashing when no explicit values are passed;
# main() fills them in from stopwords.txt and the "M = " prompt.
stop_words = []
M = 0


# ---------------------------------------
def stop_word(filename):
    """Return every whitespace-separated word found in the lines."""
    return [e for line in filename for e in line.strip("\n").split()]


# ---------------------------------------
def char_count(filename):
    """Return the number of characters, newlines excluded."""
    return sum(len(line.strip("\n")) for line in filename)


# ---------------------------------------
def replace_sym(line):
    """Return ``line`` lower-cased with non-alphanumerics replaced by spaces.

    Uses the same letter/digit definition as ``alpha_count`` instead of a
    fixed symbol list, so characters such as '~' no longer stay inside words.
    """
    return "".join(e.lower() if e.lower() in _ALNUM else " " for e in line)


# ---------------------------------------
def alpha_count(filename):
    """Return ``(alphanumeric_count, word_count)`` for the lines."""
    ap_count = 0
    word = 0
    for line in filename:
        line = line.strip("\n")
        word += len(replace_sym(line).split())
        ap_count += sum(1 for x in line if x.lower() in _ALNUM)
    return ap_count, word


# ---------------------------------------
def line_count(filename):
    """Return the number of lines."""
    return sum(1 for _ in filename)


def _word_counts(filename, stop):
    """Return a dict mapping each non-stop word to its frequency."""
    counts = {}
    for line in filename:
        for e in replace_sym(line.strip("\n")).split():
            if e not in stop:
                counts[e] = counts.get(e, 0) + 1
    return counts


# ---------------------------------------
def BoW(filename, stop=None):
    """Return the sorted ``[word, count]`` pairs of the lines.

    ``stop`` defaults to the module-level ``stop_words``.
    """
    if stop is None:
        stop = stop_words
    return sorted([w, n] for w, n in _word_counts(filename, stop).items())


# ---------------------------------------
def flash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


# ---------------------------------------
def BoW_Hashing(filename, stop=None, modulus=None):
    """Return the sorted ``[hash, count]`` pairs of the lines.

    ``stop`` defaults to the module-level ``stop_words`` and ``modulus`` to
    the module-level ``M``.
    """
    if stop is None:
        stop = stop_words
    if modulus is None:
        modulus = M
    hashing = {}
    for w, n in _word_counts(filename, stop).items():
        h = flash(w, modulus)
        hashing[h] = hashing.get(h, 0) + n
    return sorted([h, n] for h, n in hashing.items())


# ---------------------------------------
def main():
    """Prompt for the inputs and print the report."""
    global stop_words, M
    file_name = input("File name = ")
    feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
    while feature_hashing.lower() != "y" and feature_hashing.lower() != "n":
        print("Try again.")
        feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
    M = 0
    if feature_hashing.lower() == "y":
        M = int(input("M = "))
    print("-------------------")

    with open("stopwords.txt", "r") as stop_file:
        stop_words = stop_word(stop_file)
    # Read the file once; every helper accepts the list of lines.
    with open(file_name, "r") as handle:
        lines = handle.readlines()

    print("char count =", char_count(lines))
    ap, w = alpha_count(lines)
    print("alphanumeric count =", ap)
    print("line count =", line_count(lines))
    print("word count =", w)
    if feature_hashing.lower() == "n":
        print("BoW =", BoW(lines))
    else:
        print("BoW =", BoW_Hashing(lines))


if __name__ == "__main__":
    main()
# 6330311621 (30.00) 158 (2021-03-18 22:11)
# Bag-of-words report with optional feature hashing.


def _is_alnum(i):
    """Return True when ``i`` is an ASCII letter or digit."""
    return 'a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9'


def _read(fn):
    """Return the whole content of file ``fn`` and close it afterwards."""
    with open(fn, 'r') as handle:
        return handle.read()


def _tally(items):
    """Return ``[item, count]`` pairs in first-occurrence order."""
    slot = {}
    for i in items:
        slot[i] = slot.get(i, 0) + 1
    return [[i, n] for i, n in slot.items()]


def num_all(fn):
    """Return the number of characters in file ``fn``, newlines excluded."""
    return sum(1 for i in _read(fn) if i != '\n')


def num_char(fn):
    """Return the number of ASCII letters and digits in file ``fn``."""
    return sum(1 for i in _read(fn) if _is_alnum(i))


def num_line(fn):
    """Return the number of lines in file ``fn``."""
    with open(fn, 'r') as handle:
        return sum(1 for _ in handle)


def num_word(fn):
    """Return the number of words in ``fn``; non-alphanumerics separate words."""
    return len(''.join(i if _is_alnum(i) else ' ' for i in _read(fn)).split())


def listword(fn):
    """Return the lower-cased words of ``fn`` not listed in stopwords.txt."""
    out = ''.join(i if _is_alnum(i) else ' ' for i in _read(fn)).lower().split()
    t = _read('stopwords.txt').split()
    return [i for i in out if i not in t]


def BoW(fn):
    """Return the ``[word, count]`` pairs of ``fn`` in first-occurrence order."""
    return _tally(listword(fn))


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % int(M)`` for the word ``w``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % int(M)


def BoW_fhash(fn, M):
    """Return the ``[hash, count]`` pairs of ``fn`` in first-occurrence order."""
    return _tally(fhash(i, M) for i in listword(fn))


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    choose = input('Use feature hashing ? (y,Y,n,N) ')
    # Tuple membership: `not choose in 'nNyY'` is a substring test and would
    # accept '' or 'nN' as valid answers.
    while choose not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        choose = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = choose in ('y', 'Y')
    if use_hash:
        M = input('M = ')

    print('-------------------')
    print('char count =', num_all(file_name))
    print('alphanumeric count =', num_char(file_name))
    print('line count =', num_line(file_name))
    print('word count =', num_word(file_name))
    if use_hash:
        print('BoW =', BoW_fhash(file_name, M))
    else:
        print('BoW =', BoW(file_name))


if __name__ == '__main__':
    main()
# 6330312221 (22.80) 159 (2021-03-21 16:41)
# Bag-of-words report with optional feature hashing.


def _is_alnum(e):
    """Return True when ``e`` is an ASCII letter or digit."""
    return 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9'


def _run_lengths(items):
    """Return ``[item, count]`` pairs for runs of equal adjacent items."""
    ans = []
    for item in items:
        if ans and ans[-1][0] == item:
            ans[-1][1] += 1
        else:
            ans.append([item, 1])
    return ans


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word ``w``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def remove_punc(t):
    """Return ``t`` with every non-alphanumeric, non-whitespace char spaced out.

    Whitespace (including newlines) is kept as it is.
    """
    return ''.join(e if _is_alnum(e) or e.isspace() else ' ' for e in t)


def count_alpha(t):
    """Return the number of ASCII letters and digits in ``t``."""
    return sum(1 for e in t if _is_alnum(e))


def count_words(t):
    """Return the number of items in ``t``."""
    return len(t)


def find_bow(c):
    """Return ``[word, count]`` pairs for the sorted word list ``c``.

    Empty and one-element lists are handled; the original raised on both.
    """
    return _run_lengths(c)


def find_hash_bow(c, M):
    """Return the sorted ``[hash, count]`` pairs for the words in ``c``."""
    return _run_lengths(sorted(fhash(i, M) for i in c))


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    h = input('Use feature hashing ? (y,Y,n,N) ')

    with open(file_name, 'r') as fn:
        lines = fn.readlines()
    with open('stopwords.txt', 'r') as fn2:
        stop_words = fn2.read().lower().split()

    while h not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        h = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = h in ['y', 'Y']
    if use_hash:
        M = int(input('M = '))
    print('-------------------')

    st = remove_punc(''.join(lines)).lower()
    st_word = sorted(st.split())
    no_stop_word = [e for e in st_word if e not in stop_words]
    # Drop at most one trailing newline per line.
    char = sum(len(line[:-1] if line.endswith('\n') else line)
               for line in lines)

    print('char count = ' + str(char))
    print('alphanumeric count = ' + str(count_alpha(st)))
    print('line count = ' + str(len(lines)))
    print('word count = ' + str(count_words(st_word)))
    if use_hash:
        bow = find_hash_bow(no_stop_word, M)
    else:
        bow = find_bow(no_stop_word)
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330313921 (28.40) 160 (2021-03-22 21:05)
# Bag-of-words report for a text file, with optional feature hashing.


def _read_text(path):
    """Return the whole content of the file at path, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _words(text):
    """Return the lowercased alphanumeric words of text."""
    return ''.join(ch.lower() if _is_alnum(ch) else ' ' for ch in text).split()


def _sorted_counts(items):
    """Return [[item, count], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


def char_count(name1):
    """Return the number of characters in the file, newlines excluded.

    Only newlines are dropped; leading and trailing spaces of a line count.
    """
    text = _read_text(name1)
    return len(text) - text.count('\n')


def alphanumeric_count(name2):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for ch in _read_text(name2) if _is_alnum(ch))


def line_count(name3):
    """Return the number of lines in the file."""
    with open(name3, 'r') as handle:
        return sum(1 for _ in handle)


def word_count(name4):
    """Return the number of words, using the same tokenizer as the BoW."""
    return len(_words(_read_text(name4)))


# ------------------------------------------------------
def alphanumeric2(names):
    """Return the file's lowercased words, each followed by one space."""
    return ''.join(word + ' ' for word in _words(_read_text(names)))


def stop_word(sw):
    """Return the stripped lines of the stop-word file sw, each followed by a space."""
    with open(sw, 'r') as handle:
        return ''.join(line.strip() + ' ' for line in handle)


# ------------------------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M; M may be an int or a numeric string."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def _significant_words(name):
    """Return the words of file name that are not listed in stopwords.txt."""
    stop_words = set(stop_word('stopwords.txt').split())
    return [word for word in alphanumeric2(name).split() if word not in stop_words]


# ------------------------------------------------------
def bownofhash(name):
    """Return the bag of words of file name as [[word, count], ...] sorted by word."""
    return _sorted_counts(_significant_words(name))


# ------------------------------------------------------
def bownusefhash(name, M):
    """Return the hashed bag of words as [[hash, count], ...] sorted by hash."""
    return _sorted_counts(fhash(word, M) for word in _significant_words(name))


# ------------------------------------------------------
def main():
    """Prompt for the file and the hashing choice, then print the report."""
    name = input('File name = ')
    while True:
        ans = input('Use feature hashing ? (y,Y,n,N) ')
        if ans in ('n', 'N', 'y', 'Y'):
            break
        print("Try again.")
    M = int(input('M = ')) if ans in ('y', 'Y') else None
    print('-------------------')
    # print() inserts the separating space itself, so the labels end at '='.
    print('char count =', char_count(name))
    print('alphanumeric count =', alphanumeric_count(name))
    print('line count =', line_count(name))
    print('word count =', word_count(name))
    if M is None:
        print('BoW =', bownofhash(name))
    else:
        print('BoW =', bownusefhash(name, M))


if __name__ == '__main__':
    main()
# 6330314521 (19.00) 161 (2021-03-21 22:47)
# Bag-of-words report for a text file, with optional feature hashing.


def _read_text(path):
    """Return the whole content of the file at path, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _tally(items):
    """Return [[item, count], ...] ordered by the first appearance of each item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, count] for item, count in counts.items()]


# =================line count================
def line_count(fileds):
    """Return the number of lines in the file named fileds."""
    with open(fileds, 'r') as handle:
        return sum(1 for _ in handle)


# ==============alphnum=====================
def alphnum(fileds):
    """Return the file's text lowercased, with non-alphanumerics turned into spaces.

    Newlines become spaces too, so the last word of one line never merges
    with the first word of the next.
    """
    text = _read_text(fileds)
    return ''.join(ch.lower() if _is_alnum(ch) else ' ' for ch in text)


# =================word count================
def count_word(fileds):
    """Return the number of alphanumeric words in the file named fileds."""
    return len(alphnum(fileds).split())


# =================char count================
def count_char(fileds):
    """Return the number of characters in the file, newlines excluded."""
    text = _read_text(fileds)
    return len(text) - text.count('\n')


# =================alphanumeric count===========
def count_alphnum(fileds):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for ch in _read_text(fileds) if _is_alnum(ch))


# ===================fhash===============
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M; M may be an int or a numeric string."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


# ================stop words=====================
def stopwords(stopws='stopwords.txt'):
    """Return the stripped lines of the stop-word file, each followed by a space."""
    with open(stopws, 'r') as handle:
        return ''.join(line.strip() + ' ' for line in handle)


def _significant_words(filed):
    """Return the words of file filed that are not stop words."""
    stop_set = set(stopwords().split())
    return [word for word in alphnum(filed).split() if word not in stop_set]


# ====================bow=====================
def bow(filed):
    """Return [[word, count], ...] in order of first appearance."""
    return _tally(_significant_words(filed))


# =====================fhash bow================
def fhash_bow(filed, M):
    """Return [[hash, count], ...] sorted by hash value."""
    return sorted(_tally(fhash(word, M) for word in _significant_words(filed)))


# ========================output process==========================
def main():
    """Prompt for the file and the hashing choice, then print the report."""
    file_name = input('File name = ')
    feat = input('Use feature hashing ? (y,Y,n,N) ')
    while feat not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        feat = input('Use feature hashing ? (y,Y,n,N) ')
    M = input('M = ') if feat in ('y', 'Y') else None
    # The separator belongs to both modes, not only to the hashing one.
    print('-------------------')
    print('char count =', count_char(file_name))
    print('alphanumeric count =', count_alphnum(file_name))
    print('line count =', line_count(file_name))
    print('word count =', count_word(file_name))
    if M is None:
        print('BoW =', bow(file_name))
    else:
        print('BoW =', fhash_bow(file_name, M))


if __name__ == '__main__':
    main()
# 6330315121 (28.20) 162 (2021-03-22 18:14)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9"


def _tally(items):
    """Return [[item, count], ...] ordered by the first appearance of each item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, count] for item, count in counts.items()]


def clear_shid(t):
    """Return t with every non-alphanumeric character removed."""
    return "".join(ch for ch in t if _is_alnum(ch))


def clear2(t):
    """Return t with every non-alphanumeric character replaced by a space."""
    return "".join(ch if _is_alnum(ch) else " " for ch in t)


def BoW1(t, stop_words=()):
    """Return the words of list t that are not in stop_words.

    Words are compared against the stop list only. A lexicographic range
    test on whole words would silently drop words such as "zoo" or "99".
    """
    return [word for word in t if word not in stop_words]


def n_BoW(t):
    """Return [[word, count], ...] in order of first appearance."""
    return _tally(t)


def fh_BoW(t, M):
    """Return [[hash, count], ...] in order of first appearance of each hash."""
    return _tally(fhash(word, M) for word in t)


def main():
    """Prompt for the file and the hashing choice, then print the report."""
    with open("stopwords.txt", "r") as stop_file:
        stop_words = set(stop_file.read().split())
    file_name = input("File name = ")
    choose = input("Use feature hashing ? (y,Y,n,N) ")
    while choose not in ("y", "Y", "n", "N"):
        print("Try again.")
        choose = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if choose in ("y", "Y") else None
    with open(file_name, "r") as text_file:
        lines = text_file.readlines()
    text = "".join(lines)
    spaced = clear2(text)
    words = BoW1(spaced.lower().split(), stop_words)
    final_bow = n_BoW(words) if M is None else fh_BoW(words, M)
    print("-------------------")
    print("char count =", len(text) - text.count("\n"))
    print("alphanumeric count =", len(clear_shid(text)))
    print("line count =", len(lines))
    print("word count =", len(spaced.split()))
    print("BoW =", final_bow)


if __name__ == "__main__":
    main()
# 6330316821 (28.00) 163 (2021-03-22 21:39)
# Bag-of-words report for a text file, with optional feature hashing.


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def new_string(x):
    """Return x lowercased, with every symbol replaced by a space.

    Letters, digits and whitespace are kept, so the result has the same
    length as x and newlines survive. Any other character becomes a space;
    a fixed punctuation list would miss symbols such as '~'.
    """
    return ''.join(
        ch.lower() if _is_alnum(ch) or ch.isspace() else ' ' for ch in x
    )


# ------------------------------------
def count_word(words, w):
    """Return how many elements of words are equal to w."""
    return sum(1 for e in words if e == w)


# ------------------------------------
def Bow(string, stop_words=()):
    """Return [[word, count], ...] sorted by word, skipping stop_words."""
    counts = {}
    for word in string.split():
        if word not in stop_words:
            counts[word] = counts.get(word, 0) + 1
    return [[word, counts[word]] for word in sorted(counts)]


# ------------------------------------
def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def hashed_bow(pairs, M):
    """Hash the words of [[word, count], ...] and merge colliding counts.

    Returns [[hash, count], ...] sorted by hash.
    """
    merged = {}
    for word, amount in pairs:
        key = fhash(word, M)
        merged[key] = merged.get(key, 0) + amount
    return [[key, merged[key]] for key in sorted(merged)]


# ------------------------------------
def main():
    """Prompt for the file and the hashing choice, then print the report."""
    file_name = input("File name = ")
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    while feature.lower() != 'y' and feature.lower() != 'n':
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = feature.lower() == 'y'
    if use_hash:
        M = int(input("M = "))
    print('-------------------')
    with open('stopwords.txt', 'r') as stop:
        stop_words = set(stop.read().split())
    with open(file_name) as fn:
        lines = fn.readlines()
    text = ''.join(lines)
    cleaned = new_string(text)
    plain = Bow(cleaned, stop_words)
    alphanumeric_count = sum(1 for ch in cleaned if _is_alnum(ch))
    print('char count = ' + str(len(text) - text.count('\n')))
    print('alphanumeric count = ' + str(alphanumeric_count))
    print('line count = ' + str(len(lines)))
    print('word count = ' + str(len(cleaned.split())))
    # Same 'BoW =' label in both modes; print() adds the single space.
    if use_hash:
        print('BoW =', hashed_bow(plain, M))
    else:
        print('BoW =', plain)


if __name__ == '__main__':
    main()
# 6330317421 (21.85) 164 (2021-03-22 17:11)
# Bag-of-words report for a text file, with optional feature hashing.

_ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def ac(read):
    """Return read with everything except lowercase letters and digits removed."""
    return ''.join(ch for ch in read if ch in _ALNUM)


def dup(z):
    """Return the elements of z without duplicates, in order of first appearance."""
    unique = []
    for item in z:
        if item not in unique:
            unique.append(item)
    return unique


def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def char(name):
    """Return the number of characters in the file name, newlines excluded."""
    with open(name, 'r') as handle:
        text = handle.read()
    return len(text) - text.count('\n')


def tokenize(text):
    """Return the lowercased alphanumeric words of text.

    Punctuation separates words instead of being deleted, so "a,b" yields
    two words and a token such as "--" yields none.
    """
    return ''.join(ch if ch in _ALNUM else ' ' for ch in text.lower()).split()


def line_total(text):
    """Return the number of lines in text, blank lines included."""
    unterminated = 1 if text and not text.endswith('\n') else 0
    return text.count('\n') + unterminated


def tally(items):
    """Return [[item, count], ...] ordered by first appearance.

    Equal items share one entry, so colliding hash values are merged.
    """
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, count] for item, count in counts.items()]


def show(B, char_total, alnum_total, lines, words):
    """Print the four counters followed by the bag of words B."""
    print('char count =', char_total)
    print('alphanumeric count =', alnum_total)
    print('line count =', lines)
    print('word count =', words)
    print('BoW =', dup(B))


def main():
    """Prompt for the file and the hashing choice, then print the report."""
    name = input('File name = ').strip()
    with open(name, 'r') as text_file:
        text = text_file.read().lower()
    with open('stopwords.txt', 'r') as stop_file:
        stop_words = set(stop_file.read().split())
    words = tokenize(text)
    # Stop words are filtered after cleaning, so "the," is recognised too.
    kept = [word for word in words if word not in stop_words]
    x = input('Use feature hashing ? (y,Y,n,N) ')
    while x not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ')
    if x in ('y', 'Y'):
        M = int(input('M = '))
        items = [fhash(word, M) for word in kept]
    else:
        items = kept
    print('-------------------')
    show(tally(items), char(name), len(ac(text)), line_total(text), len(words))


if __name__ == '__main__':
    main()
# 6330318021 (20.00) 165 (2021-03-22 09:40)
# Bag-of-words report for a text file, with optional feature hashing.


def use_fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def _is_alnum(ch):
    """Return True when ch is a lowercase ASCII letter or a digit."""
    return 'a' <= ch <= 'z' or '0' <= ch <= '9'


def tally(items):
    """Return [[item, count], ...] ordered by the first appearance of each item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, count] for item, count in counts.items()]


def summarize(lines, stop_words, M=None):
    """Return the five report lines for the given text lines.

    lines is a list of text lines (newline-terminated or not), stop_words is
    a collection of words to leave out of the bag of words, and M is the
    hash modulus, or None for a plain word-based bag of words.
    """
    text = ''.join(lines).lower()
    words = ''.join(ch if _is_alnum(ch) else ' ' for ch in text).split()
    kept = [word for word in words if word not in stop_words]
    items = kept if M is None else [use_fhash(word, M) for word in kept]
    return [
        'char count = ' + str(len(text) - text.count('\n')),
        'alphanumeric count = ' + str(sum(1 for ch in text if _is_alnum(ch))),
        'line count = ' + str(len(lines)),
        'word count = ' + str(len(words)),
        # The bag of words is labelled like every other line of the report.
        'BoW = ' + str(tally(items)),
    ]


def main():
    """Prompt for the file and the hashing choice, then print the report."""
    file_name = input('File name = ')
    while True:
        hash_or_not = input('Use feature hashing ? (y,Y,n,N) ')
        if hash_or_not in ('y', 'Y', 'n', 'N'):
            break
        print('Try again.')
    M = int(input('M = ')) if hash_or_not in ('y', 'Y') else None
    with open('stopwords.txt', 'r') as stop_word:
        list_of_stopwords = set(stop_word.read().split())
    with open(file_name, 'r') as readed_file:
        lines = readed_file.readlines()
    print('-' * 19)
    for report_line in summarize(lines, list_of_stopwords, M):
        print(report_line)


if __name__ == '__main__':
    main()
# 6330319721 (26.60) 166 (2021-03-22 17:04)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M.

    Each character is weighted by 37 raised to its own position.
    """
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def _tally(items):
    """Return [[item, count], ...] ordered by the first appearance of each item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, count] for item, count in counts.items()]


# =============================================================
def stopwords():
    """Return the list of words found in stopwords.txt."""
    with open('stopwords.txt') as stop:
        return stop.read().split()


# =============================================================
def text(file):
    """Return the non-blank lines of file, lowercased and joined by spaces."""
    with open(file) as handle:
        return ''.join(
            line.lower().strip('\n') + ' ' for line in handle if line != "\n"
        )


# =============================================================
def char(file):
    """Return the number of characters in file, newlines excluded."""
    with open(file) as handle:
        content = handle.read()
    return len(content) - content.count('\n')


# =============================================================
def alphanum(cn):
    """Return cn with every non-alphanumeric character replaced by a space."""
    return ''.join(
        ch if '0' <= ch <= '9' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' else ' '
        for ch in cn
    )


# =============================================================
def line(file_name):
    """Return the number of lines in file_name, blank lines included."""
    with open(file_name) as handle:
        return sum(1 for _ in handle)


# =============================================================
def BoW(file_name):
    """Return [[word, count], ...] for the whitespace-separated text file_name.

    Despite its name, the argument is the text itself, not a path.
    """
    return _tally(file_name.split())


# =============================================================
def BoWfhash(w, m):
    """Return [[hash, count], ...] for the whitespace-separated text w."""
    return _tally(fhash(word, m) for word in w.split())


# =============================================================
def main():
    """Prompt for the file and the hashing choice, then print the report."""
    file_name = input('File name = ')
    yn = input('Use feature hashing ? (y,Y,n,N) ')
    while yn not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        yn = input('Use feature hashing ? (y,Y,n,N) ')
    m = int(input('M = ')) if yn in ('y', 'Y') else None
    stop_set = set(stopwords())
    cleaned = alphanum(text(file_name))
    words = cleaned.split()
    cut = ' '.join(word for word in words if word not in stop_set)
    print('-------------------')
    print('char count =', char(file_name))
    print('alphanumeric count =', len(''.join(words)))
    print('line count =', line(file_name))
    # Both modes count the same alphanumeric words.
    print('word count =', len(words))
    if m is None:
        print('BoW =', BoW(cut))
    else:
        print('BoW =', BoWfhash(cut, m))


if __name__ == '__main__':
    main()
# 6330320221 (24.90) 167 (2021-03-21 02:13)
# Bag-of-words report for a text file, with optional feature hashing.


def _read_text(path):
    """Return the whole content of the file at path, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _words(text):
    """Return the alphanumeric words of text.

    Every other character acts as a separator, so "a,b" and "a<TAB>b" are
    two words each.
    """
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in text).split()


def _sorted_counts(items):
    """Return [[item, count], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    text = _read_text(file_name)
    return len(text) - text.count('\n')


def alphanumeric_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for ch in _read_text(file_name) if _is_alnum(ch))


def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name, 'r') as handle:
        return sum(1 for _ in handle)


def word_count(file_name):
    """Return the number of alphanumeric words in the file."""
    return len(_words(_read_text(file_name)))


def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def BoW(file_name, stopwords=(), M=None):
    """Return the bag of words of the file as a sorted list of [key, count].

    Words in stopwords are skipped. With M set, the keys are fhash(word, M)
    values; otherwise they are the lowercased words themselves.
    """
    words = [
        word for word in _words(_read_text(file_name).lower())
        if word not in stopwords
    ]
    if M is None:
        return _sorted_counts(words)
    return _sorted_counts(fhash(word, M) for word in words)


# -----INPUT-------------------
def main():
    """Prompt for the file and the hashing choice, then print the report."""
    file_name = input('File name = ').strip()
    while True:
        command = input('Use feature hashing ? (y,Y,n,N) ').strip()
        if command in ('y', 'Y', 'n', 'N'):
            break
        print('Try again.')
    M = int(input('M = ').strip()) if command in ('y', 'Y') else None
    # ---STOPWORD-PREP-----------------
    stopwords = set(_read_text('stopwords.txt').split())
    # ------SHOW-----------------------
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alphanumeric_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', word_count(file_name))
    print('BoW =', BoW(file_name, stopwords, M))


if __name__ == '__main__':
    main()
# 6330321921 (12.00) 168 (2021-03-22 13:15)
# Prog-08: Bag-of-words
# 6330321921 (12.00) Poonnawich Kerdsup


def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def blank(t):
    """Return t with every non-alphanumeric character replaced by a space."""
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in t)


def count_word(tokens):
    """Return how many tokens contain at least one letter or digit.

    Tokens are inspected character by character; comparing a whole token
    against 'a'..'z' would reject words such as "zoo" or "99".
    """
    return sum(1 for token in tokens if any(_is_alnum(ch) for ch in token))


def stopwords_list(file_name):
    """Return every whitespace-separated word from the lines of file_name.

    file_name is an iterable of lines, typically an open file. Splitting on
    any whitespace keeps the trailing newline off the last word of a line.
    """
    return [word for line in file_name for word in line.split()]


def build_bow(items):
    """Return [[item, count], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, counts[item]] for item in sorted(counts)]


def main():
    """Prompt for the file and the hashing choice, then print the report."""
    # ----------------------------------------------------- stopwords
    with open('stopwords.txt', 'r') as swin:
        stopwords = set(stopwords_list(swin))
    # -----------------------------------------------------
    file_name = input('File name = ').strip()
    yorn = input('Use feature hashing ? (y,Y,n,N) ').strip()
    while yorn not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        yorn = input('Use feature hashing ? (y,Y,n,N) ').strip()
    # None (not a magic -1) marks "no hashing".
    M = int(input('M = ')) if yorn in ('y', 'Y') else None
    print('-' * 19)  # 19 '-'
    # ----------------------------------------------------------
    with open(file_name, 'r') as fn:
        lines = fn.readlines()
    char_total = 0
    alnum_total = 0
    word_total = 0
    words = []
    for line in lines:
        char_total += len(line) - (1 if line.endswith('\n') else 0)
        alnum_total += sum(1 for ch in line if _is_alnum(ch))
        tokens = blank(line).split()
        word_total += count_word(tokens)
        words.extend(token.lower() for token in tokens)
    print('char count =', str(char_total))          # characters in the file
    print('alphanumeric count =', str(alnum_total))  # letters and digits
    print('line count =', str(len(lines)))           # lines counted
    print('word count =', str(word_total))           # words counted
    # ----------------------------------------------------------- BoW
    kept = [word for word in words if word not in stopwords]
    if M is None:
        print('BoW =', build_bow(kept))
    else:
        print('BoW =', build_bow(fhash(word, M) for word in kept))


if __name__ == '__main__':
    main()
# 6330322521 (26.00) 169 (2021-03-22 14:44)
# Bag-of-words report for a text file, with optional feature hashing.


def BoW(words):
    """Return [[item, count], ...] sorted by item.

    The caller's list is left untouched; a sorted copy is scanned instead.
    """
    o = []
    for item in sorted(words):
        if o and o[-1][0] == item:
            o[-1][1] += 1
        else:
            o.append([item, 1])
    return o


def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def line_total(text):
    """Return the number of lines in text.

    A last line without a trailing newline still counts as a line.
    """
    unterminated = 1 if text and not text.endswith('\n') else 0
    return text.count('\n') + unterminated


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def main():
    """Prompt for the file and the hashing choice, then print the report."""
    with open(input('File name = '), 'r') as file_name:
        a = file_name.read().lower()
    p = input('Use feature hashing ? (y,Y,n,N) ')
    while p not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        # Ask again on the same line, exactly like the first prompt.
        p = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = p in ('y', 'Y')
    if use_hash:
        M = int(input('M = '))
    print('-------------------')
    with open('stopwords.txt', 'r') as S:
        s = set(S.read().split())
    a1 = ''.join(ch if _is_alnum(ch) else ' ' for ch in a)
    words = a1.split()
    # print() adds the separating space, so the labels stop at '='.
    print('char count =', len(a) - a.count('\n'))
    print('alphanumeric count =', len(a1) - a1.count(' '))
    print('line count =', line_total(a))
    print('word count =', len(words))
    F = [word for word in words if word not in s]
    if use_hash:
        print('BoW =', BoW([fhash(word, M) for word in F]))
    else:
        print('BoW =', BoW(F))


if __name__ == '__main__':
    main()
# 6330323121 (22.43) 170 (2021-03-22 21:47)
# .....................................................................................
# Let w be a word made of the characters c0 c1 c2 ... c(n-1).
# fhash(w, M) = (ord(c0) + ord(c1)*G**1 + ord(c2)*G**2 + ... + ord(c(n-1))*G**(n-1)) % M


def fhash(w, M):
    """Return the feature hash of word w with G = 37, modulo M."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def _read_text(path):
    """Return the whole content of the file at path, closing it afterwards."""
    with open(path) as handle:
        return handle.read()


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _words(text):
    """Return the alphanumeric words of text."""
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in text).split()


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded.

    Newlines are counted directly, so the result is right whether or not
    the file ends with one, and an empty file yields 0.
    """
    text = _read_text(file_name)
    return len(text) - text.count('\n')


def a_and_num_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for ch in _read_text(file_name) if _is_alnum(ch))


def words_count(file_name):
    """Return the number of alphanumeric words in the file."""
    return len(_words(_read_text(file_name)))


def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name) as handle:
        return sum(1 for _ in handle)


def BoW_Nn(file_name, stop):
    """Return [[word, count], ...] sorted by word.

    Words are lowercased and those listed in the stop-word file stop (also
    lowercased) are skipped. A file without any remaining word gives [].
    """
    stop_words = set(_read_text(stop).lower().split())
    counts = {}
    for word in _words(_read_text(file_name).lower()):
        if word not in stop_words:
            counts[word] = counts.get(word, 0) + 1
    return [[word, counts[word]] for word in sorted(counts)]


def BoW_Yy(file_name, stop, M):
    """Return [[hash, count], ...] sorted by hash.

    Each hash bucket sums the occurrence counts of all its words, not just
    the number of distinct words that fall into it.
    """
    merged = {}
    for word, amount in BoW_Nn(file_name, stop):
        key = fhash(word, M)
        merged[key] = merged.get(key, 0) + amount
    return [[key, merged[key]] for key in sorted(merged)]


# ..........................................
def main():
    """Prompt for the file and the hashing choice, then print the report."""
    file_name = input('File name = ')
    yn = input('Use feature hashing ? (y,Y,n,N) ')
    while yn not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        yn = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if yn in ('Y', 'y') else None
    print('-------------------')
    print('char count = ' + str(char_count(file_name)))
    print('alphanumeric count = ' + str(a_and_num_count(file_name)))
    print('line count = ' + str(line_count(file_name)))
    print('word count = ' + str(words_count(file_name)))
    # The stop-word list lives in 'stopwords.txt' (plural).
    if M is None:
        print('BoW = ' + str(BoW_Nn(file_name, 'stopwords.txt')))
    else:
        print('BoW = ' + str(BoW_Yy(file_name, 'stopwords.txt', M)))


if __name__ == '__main__':
    main()
# 6330324821 (30.00) 171 (2021-03-21 15:24)
# Bag-of-words report for a text file, with optional feature hashing.


def count(Bow_wordsi, new_sentence):
    """Return how many elements of new_sentence are equal to Bow_wordsi."""
    return sum(1 for eachword in new_sentence if eachword == Bow_wordsi)


def _is_alnum(ch):
    """Return True when ch is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def delete(line):
    """Return the stripped line with non-alphanumerics replaced by spaces.

    The test is a character-range check. A hand-typed alphabet string is
    easy to get wrong, for example by omitting 'Y'.
    """
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in line.strip())


def change(sentence, allstopwords):
    """Return the lowercased words of sentence that are not in allstopwords."""
    words = delete(sentence.strip().lower()).split()
    return [word for word in words if word not in allstopwords]


def Bag_of_words(list_newsentence):
    """Return [[word, count], ...] sorted by word; the input list is not modified."""
    counts = {}
    for word in list_newsentence:
        counts[word] = counts.get(word, 0) + 1
    return [[word, counts[word]] for word in sorted(counts)]


def fhash(w, M):
    """Return the feature hash of word w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def main():
    """Prompt for the file and the hashing choice, then print the report."""
    # Read the input.
    file_name = input('File name = ')
    choice = input('Use feature hashing ? (y,Y,n,N) ')
    while choice not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        choice = input('Use feature hashing ? (y,Y,n,N) ')
    check = choice in ('y', 'Y')
    if check:
        M = int(input('M = '))
    # Read the stop words.
    with open('stopwords.txt', 'r') as stopwords:
        allstopwords = set(stopwords.read().split())
    print('-------------------')
    # Read the file.
    with open(file_name, 'r') as fn:
        lines = fn.readlines()
    character_count = 0
    ch_nb_count = 0
    all_words = []
    for line in lines:
        # Only the newline is excluded; surrounding spaces are characters.
        character_count += len(line) - line.count('\n')
        newline = delete(line)
        ch_nb_count += len(newline) - newline.count(' ')
        all_words.extend(newline.split())
    print('char count =', character_count)
    print('alphanumeric count =', ch_nb_count)
    print('line count =', len(lines))
    print('word count =', len(all_words))
    kept = change(' '.join(all_words), allstopwords)
    if check:
        Bows = Bag_of_words([fhash(word, M) for word in kept])
    else:
        Bows = Bag_of_words(kept)
    print('BoW =', Bows)


if __name__ == '__main__':
    main()
# 6330325421 (27.00) 172 (2021-03-22 23:44)
def cut(a):
    """Return ``a`` with every non ASCII-alphanumeric character made a space."""
    return ''.join(
        c if ('a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9') else ' '
        for c in a
    )


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``.

    The original skipped characters that were not lowercase letters or
    digits, which silently changed the hash of any other input.
    """
    G = 37
    return int(sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M)


def count(data, element):
    """Return the number of items in ``data`` equal to ``element``."""
    return sum(1 for e in data if e == element)


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    with open('stopwords.txt', 'r') as infile2:
        word2 = [w for line2 in infile2 for w in line2.lower().split()]

    file_name = input('File name = ')
    x = input('Use feature hashing ? (y,Y,n,N) ')
    # Membership in a tuple: `x not in 'yYnN'` was a substring test and let
    # '' or 'yY' through, after which neither branch ran.
    while x not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = x in ('y', 'Y')
    if use_hash:
        M = int(input('M = '))

    word_count = 0
    character_count = 0
    alphanumeric_count = 0
    line_count = 0
    kept = []  # lowercase non-stop words, in reading order
    with open(file_name, 'r') as infile:
        print('-------------------')
        for line in infile:
            character_count += len(line) - 1 if '\n' in line else len(line)
            tokens = cut(line).split()
            word_count += len(tokens)
            alphanumeric_count += sum(len(t) for t in tokens)
            line_count += 1
            kept.extend(t.lower() for t in tokens if t.lower() not in word2)

    print('char count =', character_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)

    keys = [fhash(w, M) for w in kept] if use_hash else kept
    # One entry per distinct key, in order of first appearance.
    BoW = []
    seen = []
    for k in keys:
        if k not in seen:
            seen.append(k)
            BoW.append([k, count(keys, k)])
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330326021 (30.00) 173 (2021-03-21 18:07)
# Characters treated as word separators.  Same character set as the original
# literal, written without the invalid "\|" escape sequence.
PUNCTUATION = ".,<>/?\\|!~`()*&^%$#@_-+=][}{'\";:"

# Stop words; filled by main() and read by mod_words().
stop_words = []


def words_in_line(line):
    """Return the words of ``line`` after turning punctuation into spaces."""
    return "".join(" " if a in PUNCTUATION else a for a in line).split()


def list_of_words(f):
    """Print the char/alphanumeric/line/word counts of ``f``; return its words.

    ``f`` is any iterable of text lines (an open file in main()).
    """
    char_count = 0
    line_count = 0
    words = []
    for line in f:
        line_count += 1
        line = line.strip()
        char_count += len(line)
        words += words_in_line(line)
    alphanumeric_count = sum(len(w) for w in words)
    print("char count = " + str(char_count))
    print("alphanumeric count = " + str(alphanumeric_count))
    print("line count = " + str(line_count))
    print("word count = " + str(len(words)))
    return words


def mod_words(words):
    """Return ``words`` lowercased, without those listed in ``stop_words``."""
    return [w.lower() for w in words if w.lower() not in stop_words]


def _tally(items):
    """Return sorted ``[[item, count], ...]`` for ``items`` in one pass."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def non_feature_hashing(words):
    """Return the sorted bag of words ``[[word, count], ...]``."""
    return _tally(words)


def fhash(word, M):
    """Return ``sum(ord(word[i]) * 37**i) % M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M


def feature_hashed(words, M):
    """Return the hash value of every word in ``words``."""
    return [fhash(word, M) for word in words]


def feature_hashing(words, M):
    """Return the sorted bag ``[[hash, count], ...]`` of the hashed words."""
    return _tally(feature_hashed(words, M))


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input("File name = ")
    with open(file_name, "r") as fin:
        with open("stopwords.txt", "r") as stop_words_file:
            del stop_words[:]
            for line in stop_words_file:
                stop_words.extend(line.split())

        M = None
        while True:
            command = input("Use feature hashing ? (y,Y,n,N) ")
            if command in ("n", "N"):
                break
            if command in ("y", "Y"):
                M = int(input("M = "))
                break
            print("Try again.")

        print("-" * len("Use feature hashing"))
        words = mod_words(list_of_words(fin))
        if M is None:
            print("BoW =", non_feature_hashing(words))
        else:
            print("BoW =", feature_hashing(words, M))


if __name__ == "__main__":
    main()
# 6330327721 (26.00) 174 (2021-03-22 21:12)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def BoW(st):
    """Return sorted ``[[word, count], ...]`` for the non-stop words in ``st``.

    Stop words are read from ``stopwords.txt`` (the original name carried a
    trailing space, which only opens on platforms that strip it).  An input
    with no remaining words yields ``[]`` instead of raising IndexError.
    """
    with open("stopwords.txt", 'r') as sw:
        swn = [j for i in sw for j in i.split()]
    stn = sorted(i for i in st if i not in swn)
    bow = []
    for i in stn:
        # stn is sorted, so equal words are adjacent.
        if bow and bow[-1][0] == i:
            bow[-1][1] += 1
        else:
            bow.append([i, 1])
    return bow


def fhB(st, M):
    """Return sorted ``[[hash, count], ...]`` for the non-stop words in ``st``."""
    bow = BoW(st)
    for entry in bow:
        entry[0] = fhash(entry[0], M)
    bow.sort()
    fhb = []
    for h, n in bow:
        # Different words may share a hash; merge their counts.
        if fhb and fhb[-1][0] == h:
            fhb[-1][1] += n
        else:
            fhb.append([h, n])
    return fhb


def show(cc, cnc, lc, wc):
    """Print the four counters in the required ``label = value`` format."""
    print('char count =', cc)
    print('alphanumeric count =', cnc)
    print('line count =', lc)
    # The label used to end with a space, so print() emitted two of them.
    print('word count =', wc)


def analize(file_name, M):
    """Print the statistics and BoW of ``file_name``.

    ``M`` is the hash modulus, or ``''`` when feature hashing is not used.
    """
    ch = 'abcdefghijklmnopqrstuvwxyz'
    num = '0123456789'
    cleaned = []
    cc = 0
    cnc = 0
    lc = 0
    with open(file_name, 'r') as f:
        for i in f:
            lc += 1
            i = i.lower().strip()
            for l in i:
                cc += 1
                if l in ch or l in num:
                    cnc += 1
                    cleaned.append(l)
                else:
                    cleaned.append(' ')
            cleaned.append(' ')  # keep words of adjacent lines apart
    x = ''.join(cleaned).split()
    show(cc, cnc, lc, len(x))
    if M == '':
        bow = BoW(x)
    else:
        bow = fhB(x, M)
    print('BoW =', bow)


def choice():
    """Prompt for the file name and hashing option, then run analize()."""
    file_name = input('File name = ')
    c = input('Use feature hashing ? (y,Y,n,N) ')
    while c not in ['n', 'N', 'y', 'Y']:
        print('Try again.')
        c = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if c in ['y', 'Y'] else ''
    print('-------------------')
    analize(file_name, M)


if __name__ == '__main__':
    choice()
# 6330328321 (30.00) 175 (2021-03-21 15:49)
def readfile(filename):
    """Return the lines of ``filename`` with surrounding whitespace stripped."""
    with open(filename, 'r') as file:
        return [line.strip() for line in file]


def charcount(lines):
    """Return the total length of all strings in ``lines``."""
    return sum(len(line) for line in lines)


def removespecial(s):
    """Return ``s`` with everything except a-z and 0-9 replaced by spaces."""
    return ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' ' for ch in s
    )


def dosomething(do):
    """Resolve the hashing answer ``do`` and print the separator line.

    Re-prompts until the (lowercased) answer is 'y' or 'n'.  Returns
    ``(do, M)`` where ``M`` is the raw text typed for the modulus, or ``-1``
    when hashing is not used.
    """
    M = -1
    while do not in ('y', 'n'):
        print('Try again.')
        do = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if do == 'y':
        M = input('M = ')
    print('-------------------')
    return do, M


def fhash(word, M):
    """Return ``sum(ord(word[i]) * 37**i) % int(M)``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(word)) % int(M)


def calbow(listwords):
    """Return the sorted bag ``[[item, count], ...]`` of ``listwords``."""
    counts = {}
    for item in listwords:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def calbowfhash(listwords, M):
    """Return the sorted bag ``[[hash, count], ...]`` of the hashed words."""
    # Each word is hashed exactly once (the original hashed it up to 3 times).
    return calbow([fhash(word, M) for word in listwords])


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    x = input('File name = ')
    do = input('Use feature hashing ? (y,Y,n,N) ')
    do, M = dosomething(do.lower())

    s = readfile(x)
    line_count = len(s)
    n = charcount(s)
    word = removespecial(' '.join(s).lower()).split()
    print('char count =', n)
    print('alphanumeric count =', len(''.join(word)))
    print('line count =', line_count)
    print('word count =', len(word))

    stopwords = ' '.join(readfile('stopwords.txt')).split()
    listwords = [w for w in word if w not in stopwords]
    if do == 'y':
        print('BoW =', calbowfhash(listwords, M))
    else:
        print('BoW =', calbow(listwords))


if __name__ == '__main__':
    main()
# 6330329021 (30.00) 176 (2021-03-22 00:33)
ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def split_words(line):
    """Return the lowercase alphanumeric words of ``line``."""
    lowered = line.lower()
    return ''.join(e if e in ALNUM else ' ' for e in lowered).split()


def count_sorted(items):
    """Return ``[[item, count], ...]`` ordered by item."""
    result = []
    for item in sorted(items):
        # Sorted input keeps equal items adjacent.
        if result and result[-1][0] == item:
            result[-1][1] += 1
        else:
            result.append([item, 1])
    return result


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input('File name = ')
    hash_check = ''
    while hash_check not in ['y', 'Y', 'n', 'N']:
        hash_check = input('Use feature hashing ? (y,Y,n,N) ')
        if hash_check not in ['y', 'Y', 'n', 'N']:
            print('Try again.')
    use_hash = hash_check.lower() == 'y'
    if use_hash:
        M = int(input('M = '))
    print('-------------------')

    with open('stopwords.txt', 'r') as stopwords_file:
        stopwords_list = [e for line in stopwords_file for e in line.split()]

    # A single pass replaces the five separate reads of the input file.
    chars = 0
    alphanumeric = 0
    line_count = 0
    word_count = 0
    list_of_words = []
    with open(file_name, 'r') as file:
        for line in file:
            line_count += 1
            chars += sum(1 for e in line if e != '\n')
            alphanumeric += sum(1 for e in line if e.lower() in ALNUM)
            tokens = split_words(line)
            word_count += len(tokens)
            list_of_words.extend(t for t in tokens if t not in stopwords_list)

    print('char count = ' + str(chars))
    print('alphanumeric count = ' + str(alphanumeric))
    print('line count = ' + str(line_count))
    print('word count = ' + str(word_count))

    if use_hash:
        list_of_words = [fhash(w, M) for w in list_of_words]
    print('BoW = ' + str(count_sorted(list_of_words)))


if __name__ == '__main__':
    main()
# 6330330521 (0.00) 177 (2021-03-22 01:16)
# Same separator characters as the original literal, written without the
# invalid "\," escape sequence.
PUNCTUATION = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def remove_punc(t):
    """Return ``t`` with every punctuation character replaced by a space."""
    return ''.join(' ' if e in PUNCTUATION else e for e in t)


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def _read_words(filename):
    """Return the lowercase, punctuation-free words of ``filename``."""
    with open(filename) as fn:
        return [w for line in fn for w in remove_punc(line.lower()).split()]


def _read_stopwords(stopwords_file):
    """Return the whitespace-separated entries of ``stopwords_file``."""
    with open(stopwords_file) as fr:
        return [w for line in fr for w in line.split()]


def _frequencies(items):
    """Return a dict mapping each item to its number of occurrences."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return counts


def bow_n(filename="sample.txt", stopwords_file="stopwords.txt"):
    """Return the sorted bag ``[[word, count], ...]`` of the non-stop words.

    Counts are kept as integers; the original extended a list with
    ``str(count)``, which split any count of 10 or more into digits.
    """
    stop = _read_stopwords(stopwords_file)
    counts = _frequencies(_read_words(filename))
    return sorted([w, n] for w, n in counts.items() if w not in stop)


def count_char(filename="sample.txt"):
    """Return the number of characters in ``filename``, newlines excluded."""
    with open(filename) as fn:
        return sum(1 for line in fn for ch in line if ch != '\n')


def count_line(filename="sample.txt"):
    """Return the number of lines in ``filename``."""
    with open(filename) as fn:
        return sum(1 for _ in fn)


def count_word(filename="sample.txt"):
    """Return the number of punctuation-separated words in ``filename``."""
    with open(filename) as fn:
        return sum(len(remove_punc(line).split()) for line in fn)


def count_alpha(filename="sample.txt"):
    """Return the number of ASCII letters and digits in ``filename``."""
    with open(filename) as fn:
        return sum(1 for line in fn for ch in line.lower() if ch in ALNUM)


def bow_y(filename="sample.txt", stopwords_file="stopwords.txt"):
    """Return a sorted ``[word, count]`` entry for every non-stop occurrence.

    Unlike bow_n(), a word that occurs k times contributes k entries, so the
    caller can hash each occurrence.
    """
    stop = _read_stopwords(stopwords_file)
    words = _read_words(filename)
    counts = _frequencies(words)
    return sorted([w, counts[w]] for w in words if w not in stop)


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = fh in ('y', 'Y')
    if use_hash:
        M = int(input('M = '))

    print('-' * 19)
    # The entered file name is now used instead of a hard-coded sample.txt.
    print('char count = ' + str(count_char(file_name)))
    print('alphanumeric count = ' + str(count_alpha(file_name)))
    print('line count = ' + str(count_line(file_name)))
    print('word count = ' + str(count_word(file_name)))
    if use_hash:
        hashes = [fhash(entry[0], M) for entry in bow_y(file_name)]
        num_bow = sorted([h, n] for h, n in _frequencies(hashes).items())
        print('BoW = ' + str(num_bow))
    else:
        print('BoW = ' + str(bow_n(file_name)))


if __name__ == '__main__':
    main()
# 6330331121 (30.00) 178 (2021-03-22 14:35)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


def is_alnum(ch):
    """Return True when ``ch`` is an ASCII letter or digit."""
    return "A" <= ch <= "Z" or "a" <= ch <= "z" or "0" <= ch <= "9"


def split_words(text, lower=True):
    """Return the alphanumeric words of ``text``.

    Every other character (newlines included) separates words.  Words are
    lowercased unless ``lower`` is False.
    """
    cleaned = "".join(ch if is_alnum(ch) else " " for ch in text)
    if lower:
        cleaned = cleaned.lower()
    return cleaned.split()


def count_in_order(items):
    """Return ``[[item, count], ...]`` in order of first appearance.

    Distinct items are tracked as dictionary keys.  The original searched a
    space-joined string, so "cat" was lost once "category" had been seen,
    and hash 1 once hash 12 had been seen.
    """
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input("File name = ")
    use = input("Use feature hashing ? (y,Y,n,N) ")
    while use not in ["y", "Y", "n", "N"]:
        print("Try again.")
        use = input("Use feature hashing ? (y,Y,n,N) ")
    hashing = use in ["y", "Y"]
    if hashing:
        M = int(input("M = "))
    print("-------------------")

    with open(file_name, "r") as op_file:
        lines = op_file.readlines()
    print("char count =", sum(1 for line in lines for ch in line if ch != "\n"))
    print("alphanumeric count =",
          sum(1 for line in lines for ch in line if is_alnum(ch)))
    print("line count =", len(lines))
    wordlist = split_words("".join(lines))
    print("word count =", len(wordlist))

    with open("stopwords.txt", "r") as op_stop:
        # Stop words keep their case, as in the original.
        stopwordlist = split_words(op_stop.read(), lower=False)

    kept = [w for w in wordlist if w not in stopwordlist]
    if hashing:
        kept = [fhash(w, M) for w in kept]
    print("BoW =", count_in_order(kept))


if __name__ == "__main__":
    main()
# 6330332821 (30.00) 179 (2021-03-22 14:10)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    G = 37
    return sum(ord(c) * pow(G, i) for i, c in enumerate(w)) % M


def setBoWList(L):
    """Return ``[[item, count], ...]`` for ``L``, sorted by item.

    Counting is done in one pass with a dictionary instead of calling
    ``L.count`` for every distinct item.
    """
    counts = {}
    for e in L:
        counts[e] = counts.get(e, 0) + 1
    return sorted(([e, n] for e, n in counts.items()), key=lambda x: x[0])


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    with open("stopwords.txt", "r") as fileStopWords:
        stopWords = [w for line in fileStopWords for w in line.split()]

    txtFile = input("File name = ")
    while True:
        hashing = input("Use feature hashing ? (y,Y,n,N) ")
        if hashing in ("y", "Y"):
            hashing = True
            M = int(input("M = "))
            break
        if hashing in ("n", "N"):
            hashing = False
            break
        print("Try again.")
    print("-------------------")

    data = []
    char = 0
    alnum = 0
    line = 0
    word = 0
    with open(txtFile, "r") as source:
        for txtLine in source:
            text = txtLine.strip()
            char += len(text)
            line += 1
            alnum += sum(1 for c in text if c.isalnum())
            # Anything that is neither alphanumeric nor whitespace
            # separates words.
            cleaned = "".join(
                c.lower() if (c.isalnum() or c.isspace()) else " "
                for c in text
            )
            tokens = cleaned.split()
            word += len(tokens)
            data.extend(tokens)

    print("char count =", char)
    print("alphanumeric count =", alnum)
    print("line count =", line)
    print("word count =", word)

    clearStop = [e for e in data if e not in stopWords]
    L = [fhash(w, M) for w in clearStop] if hashing else clearStop
    print("BoW =", setBoWList(L))


if __name__ == "__main__":
    main()
# 6330333421 (21.00) 180 (2021-03-22 17:45)
def read_file_to_list(filepath):
    """Return the lines of ``filepath`` with surrounding whitespace stripped."""
    with open(filepath, 'r') as file:
        return [line.strip() for line in file]


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def remove_non_anumeric(word):
    """Return ``word`` with every non-alphanumeric character removed."""
    return "".join(e for e in word if e.isalnum())


def clean_words(words):
    """Return the words of ``words``; non-alphanumerics act as separators."""
    cleaned = ''.join(c if (c.isalnum() or c == ' ') else ' ' for c in words)
    return cleaned.strip().split()


def describe_file(lines):
    """Return ``(chars, lines, alphanumerics, words)`` for ``lines``.

    Words are counted after punctuation is turned into spaces, so "a-b"
    is two words; the original split on whitespace only.
    """
    no_of_char = 0
    no_of_lines = 0
    no_of_anumeric = 0
    no_of_word = 0
    for line in lines:
        no_of_char += len(line)
        no_of_lines += 1
        no_of_anumeric += len(remove_non_anumeric(line))
        no_of_word += len(clean_words(line))
    return (no_of_char, no_of_lines, no_of_anumeric, no_of_word)


def get_stop_words():
    """Return the lowercase stop words listed in ``stopwords.txt``."""
    stopwords = []
    for line in read_file_to_list('stopwords.txt'):
        for word in line.split():
            stopwords.append(remove_non_anumeric(word.lower()))
    return stopwords


def find_bow(words, useFHash=False, M=0):
    """Return ``[[key, count], ...]`` for ``words`` in first-seen order.

    ``words`` is the full text.  It is lowercased before stop words are
    removed, so "The" is filtered like "the" and "Cat"/"cat" share a count.
    With ``useFHash`` the keys are ``fhash(word, M)`` values.
    """
    stop_words = get_stop_words()
    counts = {}
    for word in clean_words(words.lower()):
        if word in stop_words:
            continue
        key = fhash(word, M) if useFHash else word
        counts[key] = counts.get(key, 0) + 1
    return [[key, n] for key, n in counts.items()]


# ===========================================================
def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input("File name = ").strip()
    prompt = "Use feature hashing ? (y,Y,n,N) "
    use_f_hash = input(prompt).lower().strip()
    while use_f_hash not in ['n', 'y']:
        print("Try again.")
        use_f_hash = input(prompt).lower().strip()
    use_f_hash = use_f_hash == 'y'
    M = int(input("M = ")) if use_f_hash else 0

    lines = read_file_to_list(file_name)
    no_of_char, no_of_lines, no_of_anumeric, no_of_word = describe_file(lines)
    print("-------------------")
    print("char count = {}".format(no_of_char))
    print("alphanumeric count = {}".format(no_of_anumeric))
    print("line count = {}".format(no_of_lines))
    print("word count = {}".format(no_of_word))
    print("BoW = {}".format(find_bow(' '.join(lines), use_f_hash, M)))


# ===========================================================
if __name__ == "__main__":
    main()
# 6330334021 (30.00) 181 (2021-03-22 17:51)
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def split_alnum(text):
    """Return the maximal runs of ASCII letters/digits in ``text``."""
    return ''.join(c if c in ALNUM else ' ' for c in text).split()


def tally(items):
    """Return the sorted bag ``[[item, count], ...]`` of ``items``."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input('File name = ')
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
    while hashing not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        hashing = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = hashing in ('y', 'Y')
    M = int(input('M = ')) if use_hash else 0
    print('-------------------')

    with open('stopwords.txt', 'r') as stpw:
        stop = split_alnum(stpw.read())

    char = 0
    alp_count = 0
    line_count = 0
    word = []
    with open(file_name, 'r') as analyse:
        for line in analyse:
            line_count += 1
            # Count everything except the newline itself.  Subtracting
            # (line_count - 1) afterwards was off by one whenever the file
            # ended with a newline.
            char += sum(1 for c in line if c != '\n')
            alp_count += sum(1 for c in line if c in ALNUM)
            word.extend(split_alnum(line))

    print('char count = ' + str(char))
    print('alphanumeric count = ' + str(alp_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(len(word)))

    kept = [w.lower() for w in word if w.lower() not in stop]
    if use_hash:
        kept = [fhash(w, M) for w in kept]
    print('BoW =', tally(kept))


if __name__ == '__main__':
    main()
# 6330335721 (24.90) 182 (2021-03-18 20:53)
# Lowercase alphabet plus digits (the original literal read "...higk...",
# so the letter "j" was silently dropped from every word).
ALNUM = "abcdefghijklmnopqrstuvwxyz0123456789"


#-----------------------------------------------------------------
def count(lis, word):
    """Return the number of items in ``lis`` equal to ``word``."""
    return sum(1 for e in lis if e == word)


#-----------------------------------------------------------------
def hasss(w, m):
    """Return ``sum(ord(w[i]) * 37**i) % int(m)``."""
    return sum(ord(ch) * (37 ** z) for z, ch in enumerate(w)) % int(m)


#-----------------------------------------------------------------
def normalize(line):
    """Return ``line`` lowercased with non-alphanumerics turned into spaces.

    The original appended the space to the loop variable instead of the
    output, so punctuation vanished and neighbouring words were glued
    together.
    """
    return "".join(
        ch.lower() if ch.lower() in ALNUM else " " for ch in line
    )


#-----------------------------------------------------------------
def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input("File name = ")

    with open("stopwords.txt") as stop:
        stoplis = [i for line in stop for i in line.split()]

    yes = False
    while True:
        has = input("Use feature hashing ? (y,Y,n,N) ")
        if has in ("y", "Y", "n", "N"):
            if has in ("y", "Y"):
                m = input("M = ")
                yes = True
            break
        print("Try again.")

    linecount = 0
    charcount = 0
    ls = []
    with open(file_name) as source:
        for line in source:
            linecount += 1
            charcount += sum(1 for i in line if i != "\n")
            ls.extend(normalize(line).split())
    ls = sorted(ls)

    # Non-stop words, every occurrence kept, in sorted order.
    uni = [i for i in ls if i not in stoplis]
    keys = sorted(hasss(i, m) for i in uni) if yes else uni
    bow = []
    for k in keys:
        # keys is sorted, so equal keys are adjacent.
        if bow and bow[-1][0] == k:
            bow[-1][1] += 1
        else:
            bow.append([k, 1])

    print("-------------------")
    print("char count =", charcount)
    print("alphanumeric count =", len("".join(ls)))
    print("line count =", linecount)
    print("word count =", len(ls))
    print("BoW =", bow)


if __name__ == "__main__":
    main()
# 6330336321 (30.00) 183 (2021-03-21 20:05)
Alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
number = "1234567890"


def fhash(w, m):
    """Return ``sum(ord(w[i]) * 37**i) % m``."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % m


def _split(text):
    """Return the runs of letters/digits in ``text``."""
    return "".join(
        ch if (ch in Alphabet or ch in number) else " " for ch in text
    ).split()


def alp_count(x):
    """Return the number of letters and digits in ``x``."""
    return sum(1 for ch in x if ch in Alphabet or ch in number)


def word_counter(x):
    """Return the number of alphanumeric words in ``x``."""
    return len(_split(x))


def clear_stopword(li, cl):
    """Return the words of text ``li`` not listed in the stop-word text ``cl``.

    ``cl`` is a whitespace-separated string of stop words.
    """
    stop = cl.split()
    return [w for w in _split(li) if w not in stop]


def bow_noFH(x):
    """Return ``[[item, count], ...]`` for ``x`` in first-appearance order."""
    counts = {}
    for item in x:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def bow_hvFH(x):
    """Merge ``[hash, count]`` pairs that share a hash; return them sorted.

    ``x`` is sorted in place as a side effect, as in the original.
    """
    x.sort()
    ans = []
    for h, n in x:
        # After sorting, equal hashes are adjacent.
        if ans and ans[-1][0] == h:
            ans[-1][1] += n
        else:
            ans.append([h, n])
    return ans


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file = input("File name = ")
    while True:
        choice = input("Use feature hashing ? (y,Y,n,N) ")
        if choice in ("y", "Y", "n", "N"):
            break
        print("Try again.")

    with open(file, "r") as fn:
        lines = fn.readlines()
    use_hash = choice in ("y", "Y")
    if use_hash:
        M = int(input("M = "))
    print("-------------------")

    char_count = sum(len(line.strip()) for line in lines)
    # Each line keeps its newline and gets a trailing space, as before.
    total_str = "".join(line + " " for line in lines).lower()
    print("char count = " + str(char_count))
    print("alphanumeric count = " + str(alp_count(total_str)))
    print("line count = " + str(len(lines)))
    print("word count = " + str(word_counter(total_str)))

    with open('stopwords.txt') as fn2:
        stopword = "".join(line.strip() + " " for line in fn2).lower()
    bow1 = sorted(bow_noFH(clear_stopword(total_str, stopword)))
    if use_hash:
        print("BoW =", bow_hvFH([[fhash(w, M), n] for w, n in bow1]))
    else:
        print("BoW =", bow1)


if __name__ == "__main__":
    main()
# 6330337021 (23.05) 184 (2021-03-21 18:00)
def is_alnum(ch):
    """Return True when ``ch`` is an ASCII letter or digit.

    The original tested ``'0' <= ch <= 'z'``, a range that also contains
    ``: ; < = > ? @ [ \\ ] ^ _`` and the backtick.
    """
    return '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def count_items(items):
    """Return the sorted bag ``[[item, count], ...]`` of ``items``."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    File_name = input("File name = ")
    key = input('Use feature hashing ? (y,Y,n,N) ')
    while key not in ["y", "Y", "n", "N"]:
        print("Try again.")
        key = input('Use feature hashing ? (y,Y,n,N) ')

    C_count = 0
    A_count = 0
    L_count = 0
    yo = []
    with open(File_name, "r") as pol:
        for line in pol:
            line = line.strip()
            L_count += 1
            C_count += len(line)
            A_count += sum(1 for ch in line if is_alnum(ch))
            line_r = "".join(ch if is_alnum(ch) else " " for ch in line)
            yo += line_r.lower().split()
    W_count = len(yo)

    with open("stopwords.txt", "r") as fn:
        stop = [w for line in fn for w in line.lower().split()]
    you = [c for c in yo if c not in stop]

    if key in ["y", "Y"]:
        M = int(input("M = "))
        # Count the hashes directly; scanning range(M) with list.count was
        # O(M * n) and unusable for a large modulus.
        you = [fhash(c, M) for c in you]
    bow = count_items(you)

    print("-------------------")
    print("char count =", C_count)
    # Labels no longer end with a space (print already inserts one), and
    # the last label is spelled "BoW".
    print("alphanumeric count =", A_count)
    print("line count =", L_count)
    print("word count =", W_count)
    print("BoW =", bow)


if __name__ == "__main__":
    main()
# 6330338621 (30.00) 185 (2021-03-22 08:35)
def word_not_tag(sen):
    """Return ``sen`` with every non-alphanumeric character made a space.

    Built in one pass; the original rewrote a list with index/remove/insert
    while iterating over it.
    """
    return ''.join(e if e.isalnum() else ' ' for e in sen)


def alp_count(sen):
    """Return the number of alphanumeric characters in ``sen``."""
    return sum(1 for e in sen if e.isalnum())


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def bow(sen, fh, M):
    """Return ``[[key, count], ...]`` for text ``sen`` in first-seen order.

    Stop words come from ``stopwords.txt``.  ``fh`` is the user's answer:
    'n'/'N' keys the bag by word, anything else by ``fhash(word, M)``.
    """
    with open("stopwords.txt", "r") as file:
        mystp = word_not_tag(file.read()).split()
    newsen = [e for e in word_not_tag(sen).split() if e not in mystp]
    use_hash = fh not in ('n', 'N')
    counts = {}
    for e in newsen:
        key = fhash(e, M) if use_hash else e
        counts[key] = counts.get(key, 0) + 1
    return [[key, n] for key, n in counts.items()]


#----------------------------------------------------------------------------------------
def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    ansfh = ['y', 'Y', 'n', 'N']
    while fh not in ansfh:
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if fh in ansfh[:2] else None

    with open(file_name, "r") as file:
        lines = file.readlines()
    numline = len(lines)
    sen = ''.join(lines).lower()

    print('-------------------')
    # Count everything except newlines; len(sen) - (numline - 1) was one
    # too many whenever the file ended with a newline.
    count_char = sum(1 for ch in sen if ch != '\n')
    print('char count =', count_char)
    print('alphanumeric count =', alp_count(sen))
    print('line count =', numline)
    print('word count =', len(word_not_tag(sen).split()))
    print('BoW =', bow(sen, fh, M))


if __name__ == '__main__':
    main()
# 6330339221 (24.50) 186 (2021-03-21 22:39)
def is_alnum(ch):
    """Return True when ``ch`` is an ASCII letter or digit."""
    return 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9'


def extract_words(text):
    """Return the lowercase alphanumeric words of ``text`` in order.

    Every occurrence is kept.  The original dropped the last word of a file
    without a final newline when it repeated the previous word, and
    reported one word for an empty file.
    """
    return ''.join(ch if is_alnum(ch) else ' ' for ch in text).lower().split()


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    G = 37
    return sum(ord(ch) * G ** Pi for Pi, ch in enumerate(w)) % M


def count_items(items):
    """Return the sorted bag ``[[item, count], ...]`` of ``items``."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for a file and options, then print its statistics and BoW."""
    file_name = input('File name = ')
    with open(file_name, 'r') as op:
        text = op.read()
    with open('stopwords.txt', 'r') as sw:
        # split() keeps the last stop word whole even without a trailing
        # newline; line[:-1] used to cut its final character.
        stopwords = sw.read().split()

    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')

    lc = len(text.splitlines(True))
    cc = sum(1 for ch in text if ch != '\n')
    ac = sum(1 for ch in text if is_alnum(ch))
    words = extract_words(text)
    wc = len(words)
    kept = [w for w in words if w not in stopwords]

    if fh in ('y', 'Y'):
        M = int(input('M = '))
        kept = [fhash(w, M) for w in kept]
    print('-------------------')
    print('char count =', cc)
    print('alphanumeric count =', ac)
    print('line count =', lc)
    print('word count =', wc)
    print('BoW =', count_items(kept))


if __name__ == '__main__':
    main()
# 6330340821 (30.00) 187 (2021-03-21 02:21)
WORD_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


#######################################
def removepunc(x):
    """Return a copy of the word list *x* with punctuation deleted."""
    # Same character set as before, spelled without invalid escapes.
    punct = '\'"\\(),/.:;-><+*='
    return [''.join(e for e in word if e not in punct) for word in x]


#######################################
def fhash(w, M):
    """Return the feature hash of word *w* modulo *M*.

    *M* may be an int or a numeric string; the multiplier is 37.
    """
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(M)


#######################################
def scan_lines(lines):
    """Return ``(charcount, alphacount, linecount, words)`` for *lines*.

    Characters exclude line terminators; words are lower-cased runs of
    ASCII letters and digits, the same set the alphanumeric count uses.
    """
    charcount = alphacount = linecount = 0
    words = []
    for line in lines:
        linecount += 1
        text = line[:-1] if line.endswith('\n') else line
        charcount += len(text)
        alphacount += sum(1 for e in text if e in WORD_CHARS)
        spaced = ''.join(e if e in WORD_CHARS else ' ' for e in text)
        words.extend(spaced.lower().split())
    return charcount, alphacount, linecount, words


def count_sorted(items):
    """Return sorted ``[item, frequency]`` pairs for *items*."""
    freq = {}
    for item in items:
        freq[item] = freq.get(item, 0) + 1
    return sorted([item, n] for item, n in freq.items())


#######################################
def main():
    """Ask for the inputs, then print the statistics and the bag."""
    file_name = input('File name = ')
    op = input('Use feature hashing ? (y,Y,n,N) ')
    while op not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        op = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if op in ['y', 'Y'] else None
    print('-------------------')

    with open(file_name, 'r') as file:
        charcount, alphacount, linecount, words = scan_lines(file)
    with open('stopwords.txt', 'r') as stop:
        stopword = {w.lower() for w in stop.read().split()}

    print('char count =', charcount)
    print('alphanumeric count =', alphacount)
    print('line count =', linecount)
    print('word count =', len(words))

    ww = removepunc([w for w in words if w not in stopword])
    if M is not None:
        # Hash each word once and count the hash values.
        print('BoW =', count_sorted(fhash(w, M) for w in ww))
    else:
        print('BoW =', count_sorted(ww))


if __name__ == '__main__':
    main()
# 6330341421 (23.15) 188 (2021-03-21 17:10)
LETTERS_DIGITS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def fhash(word, M):
    """Return the feature hash of *word* (a str) modulo *M*."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(word)) % M


def tokens(line):
    """Return the lower-cased words of *line*.

    Every character that is not an ASCII letter or digit separates words.
    """
    spaced = ''.join(ch if ch in LETTERS_DIGITS else ' ' for ch in line)
    return spaced.lower().split()


def measure(lines):
    """Return ``(char, alpha, lineno, words)`` for an iterable of lines.

    ``char`` excludes newlines, ``alpha`` counts only letters and digits,
    and ``words`` is the list of all words in reading order.
    """
    char = alpha = lineno = 0
    words = []
    for line in lines:
        lineno += 1
        for ch in line:
            if ch != '\n':
                char += 1
            if ch in LETTERS_DIGITS:
                alpha += 1
        words.extend(tokens(line))
    return char, alpha, lineno, words


def bag(items):
    """Return sorted ``[item, count]`` pairs for *items*."""
    seen = {}
    for item in items:
        seen[item] = seen.get(item, 0) + 1
    return sorted([item, n] for item, n in seen.items())


def main():
    """Run the interactive report."""
    fn = input('File name = ')
    M = None
    while True:
        ynok = input('Use feature hashing ? (y,Y,n,N) ')
        if ynok in ('y', 'Y'):
            M = int(input('M = '))
            break
        if ynok in ('n', 'N'):
            break
        print('Try again.')
    print('-------------------')

    with open(fn, 'r') as fo:
        char, alpha, lineno, words = measure(fo)
    with open('stopwords.txt', 'r') as st:
        evst = set(st.read().split())

    print('char count =', char)
    print('alphanumeric count =', alpha)
    print('line count =', lineno)
    print('word count =', len(words))

    boww = [w for w in words if w not in evst]
    if M is None:
        print('BoW =', bag(boww))
    else:
        print('BoW =', bag(fhash(w, M) for w in boww))


if __name__ == '__main__':
    main()
# 6330342021 (21.40) 189 (2021-03-22 02:06)
ALNUM_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    a = 0
    G = 37
    for i, ch in enumerate(w):
        a += ord(ch) * (G ** i)
    return a % M


# -------------------------------------------------------------
def summarize(text):
    """Return ``(char_count, alphanumeric_count, words)`` for *text*.

    *text* is the whole file content. Newlines are not counted as
    characters; words are lower-cased runs of ASCII letters and digits.
    """
    char_count = sum(1 for ch in text if ch != '\n')
    alphanumeric_count = sum(1 for ch in text if ch in ALNUM_CHARS)
    spaced = ''.join(ch if ch in ALNUM_CHARS else ' ' for ch in text)
    return char_count, alphanumeric_count, spaced.lower().split()


def build_bow(items):
    """Return sorted ``[item, count]`` pairs; an empty input gives ``[]``."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, c] for item, c in counts.items())


# -------------------------------------------------------------
def main():
    """Read the inputs and print one report for either hashing choice."""
    with open(input('File name = '), 'r') as file_name:
        lines = file_name.readlines()
    with open('stopwords.txt', 'r') as stopwords:
        sw2 = set(stopwords.read().split())

    hashing = input('Use feature hashing ? (y,Y,n,N) ')
    while hashing not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        hashing = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if hashing.lower() == 'y' else None

    char_count, alphanumeric_count, wc1 = summarize(''.join(lines))
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', len(lines))
    print('word count =', len(wc1))

    bo = [w for w in wc1 if w not in sw2]
    if M is not None:
        bo = [fhash(w, M) for w in bo]
    print('BoW =', build_bow(bo))


if __name__ == '__main__':
    main()
# 6330343721 (23.10) 190 (2021-03-22 23:34)
xxx = ['Y', 'y', 'N', 'n']
# Written as one string so no letter can be lost to a missing comma
# (the previous list fused 'p' and 'q' into a single 'pq' entry).
AN = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def words_of_line(r):
    """Return the lower-cased alphanumeric words of the line *r*."""
    spaced = ''.join(ch if ch in AN else ' ' for ch in r)
    return [w.lower() for w in spaced.split()]


def read_words(lines):
    """Return ``(charf, numberofline, data1)`` for an iterable of lines.

    ``charf`` counts every character except newlines and ``data1`` holds
    all words in reading order.
    """
    charf = 0
    numberofline = 0
    data1 = []
    for r in lines:
        numberofline += 1
        charf += sum(1 for ch in r if ch != '\n')
        data1.extend(words_of_line(r))
    return charf, numberofline, data1


def Bagofwords(bag, M):
    """Return the feature hash (multiplier 37, modulo *M*) of each word."""
    bagofwords = []
    for word in bag:
        G = 0
        for j, ch in enumerate(str(word)):
            G += ord(ch) * 37 ** j
        bagofwords.append(int(G % M))
    return bagofwords


def count_pairs(items):
    """Return sorted ``[item, count]`` pairs; an empty input gives ``[]``."""
    totals = {}
    for item in items:
        totals[item] = totals.get(item, 0) + 1
    return sorted([item, t] for item, t in totals.items())


def main():
    """Prompt for the inputs and print the report."""
    filename = input('File name = ')
    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a not in xxx:
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    a2 = input('M = ') if a in ['y', 'Y'] else None
    print('-------------------')

    with open(filename, 'r', encoding='utf-8') as f:
        charf, numberofline, data1 = read_words(f)
    print('char count =', charf)
    # Words consist solely of alphanumeric characters, so their total
    # length is the alphanumeric count.
    print('alphanumeric count =', sum(len(w) for w in data1))
    print('line count =', numberofline)
    print('word count =', len(data1))

    with open('stopwords.txt', 'r', encoding='utf-8') as stopword:
        dataremoval = set(stopword.read().split())
    datawithoutstopword = [w for w in data1 if w not in dataremoval]

    if a2 is not None:
        hashed = Bagofwords(datawithoutstopword, int(a2))
        print('BoW =', count_pairs(hashed))
    else:
        print('BoW =', count_pairs(datawithoutstopword))


if __name__ == '__main__':
    main()
# 6330345021 (30.00) 191 (2021-03-21 15:46)
ALNUM_ANY_CASE = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789")


# --------------------------------
def fhash(w, M):
    """Return the feature hash of word *w*: sum(ord(e) * 37**i) mod *M*."""
    G = 37
    sum_f = 0
    for i, e in enumerate(w):
        sum_f += ord(e) * (G ** i)
    return sum_f % M


# --------------------------------
def line_stats(lines):
    """Return ``(char_count, alpha_count, line_count, words)``.

    One pass over *lines* yields both the counts and the word list, so the
    file does not have to be read a second time for the bag-of-words.
    """
    char_count = alpha_count = line_count = 0
    words = []
    for line in lines:
        line_count += 1
        lineout = line[:-1] if line.endswith("\n") else line
        char_count += len(lineout)
        alpha_count += sum(1 for e in lineout if e in ALNUM_ANY_CASE)
        new_text = "".join(
            e if e in ALNUM_ANY_CASE else " " for e in lineout)
        words.extend(new_text.lower().split())
    return char_count, alpha_count, line_count, words


def run_length_bow(items):
    """Return ``[value, count]`` pairs for *items*, ordered by value.

    Works for zero, one or many items without special-casing the tail.
    """
    BoW = []
    for value in sorted(items):
        if BoW and BoW[-1][0] == value:
            BoW[-1][1] += 1
        else:
            BoW.append([value, 1])
    return BoW


# --------------------------------
def main():
    """Ask for the inputs, then print the statistics and the bag."""
    file_name = input("File name = ")
    use = input("Use feature hashing ? (y,Y,n,N) ")
    while use not in ("y", "Y", "n", "N"):
        print("Try again.")
        use = input("Use feature hashing ? (y,Y,n,N) ")
    m = int(input("M = ")) if use in ("y", "Y") else None

    with open(file_name, "r") as file:
        char_count, alpha_count, line_count, words = line_stats(file)
    with open("stopwords.txt", "r") as file_stop:
        word_stop = set(file_stop.read().split())

    print("-------------------")
    print("char count =", char_count)
    print("alphanumeric count =", alpha_count)
    print("line count =", line_count)
    print("word count =", len(words))

    text_stop = [th for th in words if th not in word_stop]
    if m is not None:
        text_stop = [fhash(fh_word, m) for fh_word in text_stop]
    print("BoW =", run_length_bow(text_stop))


if __name__ == "__main__":
    main()
# 6330346621 (26.00) 192 (2021-03-21 23:58)
ALNUM_SET = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def keep_alnum(text):
    """Return *text* lower-cased with non-alphanumerics turned into spaces."""
    return ''.join(e if e in ALNUM_SET else ' ' for e in text).lower()


def file_counts(lines):
    """Return ``(count, count1, count2, words)`` for the list *lines*.

    ``count`` is characters without newlines, ``count1`` alphanumeric
    characters, ``count2`` the number of lines (an unterminated last line
    still counts) and ``words`` all words in reading order.
    """
    F = ''.join(lines)
    F1 = keep_alnum(F)
    count = sum(1 for e in F if e != '\n')
    count1 = sum(1 for e in F1 if e != ' ')
    return count, count1, len(lines), F1.split()


def word_hash(word, M):
    """Return ``sum(ord(word[j]) * 37**j) % M``."""
    o = 0
    for j, ch in enumerate(word):
        o += ord(ch) * (37 ** j)
    return o % M


def grouped(items):
    """Return ``[value, count]`` pairs for *items*, ordered by value."""
    out = []
    for value in sorted(items):
        if out and out[-1][0] == value:
            out[-1][1] += 1
        else:
            out.append([value, 1])
    return out


def main():
    """Prompt for the inputs and print the report."""
    with open(input('File name = '), 'r') as file_name:
        lines = file_name.readlines()

    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if a in ('y', 'Y') else None
    print('-------------------')

    with open('stopwords.txt', 'r') as S:
        s = set(S.read().split())

    count, count1, count2, F2 = file_counts(lines)
    print('char count =', count)
    print('alphanumeric count =', count1)
    print('line count =', count2)
    print('word count =', len(F2))

    F3 = [e for e in F2 if e not in s]
    if M is not None:
        F3 = [word_hash(e, M) for e in F3]
    print('BoW =', grouped(F3))


if __name__ == '__main__':
    main()
# 6330347221 (18.90) 193 (2021-03-21 21:52)
Y_N = ['y', 'Y', 'n', 'N']
ALNUM_KEEP = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) mod *M*."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch) * 37 ** i
    return total % M


def strip_punctuation(text):
    """Return *text* with punctuation replaced by spaces.

    Anything that is neither an ASCII letter or digit nor whitespace is
    treated as punctuation; whitespace (including newlines) is kept.
    """
    return ''.join(
        e if e in ALNUM_KEEP or e.isspace() else ' ' for e in text)


def remove_punctuation(file_name):
    """Return the content of *file_name* with punctuation blanked out."""
    with open(file_name, 'r') as file:
        return strip_punctuation(file.read())


def list_stopwords(file):
    """Return the whitespace-separated words stored in *file*."""
    with open(file, 'r') as stopwords:
        return stopwords.read().split()


def count_runs(items):
    """Return ``[value, count]`` pairs for *items*, ordered by value."""
    BoW = []
    for value in sorted(items):
        if BoW and BoW[-1][0] == value:
            BoW[-1][1] += 1
        else:
            BoW.append([value, 1])
    return BoW


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    fharshing = input('Use feature hashing ? (y,Y,n,N) ')
    while fharshing not in Y_N:
        print('Try again.')
        fharshing = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if fharshing in ('y', 'Y') else None
    print('-------------------')

    stopwords = set(list_stopwords('stopwords.txt'))
    with open(file_name, 'r') as file:
        lines = file.readlines()
    # Count every character except the newline itself; leading and
    # trailing blanks belong to the line.
    d = sum(len(line) - (1 if line.endswith('\n') else 0) for line in lines)
    print('char count =', d)

    file1 = strip_punctuation(''.join(lines))
    print('alphanumeric count =', sum(1 for e in file1 if e in ALNUM_KEEP))
    print('line count =', len(lines))
    file2 = file1.split()
    print('word count =', len(file2))

    all_words = [e.lower() for e in file2 if e.lower() not in stopwords]
    if M is not None:
        all_words = [fhash(e, M) for e in all_words]
    print('BoW =', count_runs(all_words))


if __name__ == '__main__':
    main()
# 6330348921 (24.00) 194 (2021-03-21 23:30)
#Prog-08: Bag-of-words
#
# 6330348921 (24.00) Name Palapol Suetrakoolpanich
ALNUM_OK = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def read_lines(file):
    """Return the lines of the text file *file*, newlines included."""
    with open(file) as a:
        return a.readlines()


def text_words(text):
    """Return the lower-cased alphanumeric words of *text*."""
    spaced = ''.join(e if e in ALNUM_OK else ' ' for e in text)
    return spaced.lower().split()


def line_count(file):
    """Return the number of lines in *file* (0 for an empty file)."""
    return len(read_lines(file))


def char_count(file):
    """Return the number of characters in *file*, newlines excluded."""
    return sum(1 for line in read_lines(file) for e in line if e != '\n')


def alpha_count(file):
    """Return the number of ASCII letters and digits in *file*."""
    return sum(1 for line in read_lines(file) for e in line if e in ALNUM_OK)


def word_count(file):
    """Return the number of alphanumeric words in *file*."""
    return len(text_words(''.join(read_lines(file))))


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    c = 0
    for i, ch in enumerate(w):
        c += ord(ch) * (37 ** i)
    return c % M


def count_bow(words, stop, fhashh, M):
    """Return sorted ``[key, count]`` pairs for the non-stop *words*.

    The key is the word itself, or ``fhash(word, M)`` when *fhashh* is
    true. Each word is hashed exactly once.
    """
    stop = set(stop)
    totals = {}
    for word in words:
        if word in stop:
            continue
        key = fhash(word, M) if fhashh else word
        totals[key] = totals.get(key, 0) + 1
    return sorted([key, n] for key, n in totals.items())


def bow(file, fhashh, M):
    """Return the bag-of-words of *file*, stop words from stopwords.txt."""
    with open('stopwords.txt') as b:
        g = b.read().lower().split()
    return count_bow(text_words(''.join(read_lines(file))), g, fhashh, M)


def main():
    """Prompt for the inputs (re-asking on bad answers) and print the report."""
    file_name = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    while feature.lower() not in ('y', 'n'):
        print("Try again.")
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = feature.lower() == 'y'
    M = int(input('M = ')) if use_hash else ''
    print('-' * 19)
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alpha_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', word_count(file_name))
    print('BoW =', bow(file_name, use_hash, M))


if __name__ == '__main__':
    main()
# 6330349521 (30.00) 195 (2021-03-22 23:53)
ALNUM_ALL = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def file_lines(file_name):
    """Return the lines of *file_name*, newlines included."""
    with open(file_name) as f:
        return f.readlines()


def line_words(line):
    """Return the lower-cased alphanumeric words of *line*."""
    spaced = ''.join(g if g in ALNUM_ALL else ' ' for g in line)
    return spaced.lower().split()


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    c = 0
    for line in file_lines(file_name):
        c += len(line) - (1 if line.endswith('\n') else 0)
    return c


def alnum_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    return sum(
        1 for line in file_lines(file_name) for g in line if g in ALNUM_ALL)


def line_count(file_name):
    """Return the number of lines in the file."""
    return len(file_lines(file_name))


def word_count(file_name):
    """Return the number of alphanumeric words in the file."""
    return sum(len(line_words(line)) for line in file_lines(file_name))


def tally_words(words, stop):
    """Return sorted ``[word, count]`` pairs for *words* not in *stop*.

    Whole words are counted, so "age" is never counted inside "cage".
    """
    stop = set(stop)
    totals = {}
    for r in words:
        if r not in stop:
            totals[r] = totals.get(r, 0) + 1
    return sorted([word, cnb] for word, cnb in totals.items())


def BoW(file_name, stopwords):
    """Return the bag-of-words of *file_name* without the stop words.

    *stopwords* is the path of a file holding the words to drop.
    """
    words = []
    for line in file_lines(file_name):
        words.extend(line_words(line))
    with open(stopwords) as f2:
        csw = f2.read().lower().split()
    return tally_words(words, csw)


def feature_harshing(l, M):
    """Fold a ``[[word, count], ...]`` bag into ``[[hash, count], ...]``.

    Each word hashes to ``sum(ord(c_i) * 37**i) % M``; counts of words that
    share a hash are added. The result is ordered by hash value.
    """
    totals = {}
    for word, amount in l:
        c = 0
        for c1, l3 in enumerate(word):
            c += ord(l3) * (37 ** c1)
        fhash = c % M
        totals[fhash] = totals.get(fhash, 0) + amount
    return sorted([fhash, c2] for fhash, c2 in totals.items())


def display(file_name, stopwords, x, M=None):
    """Print the report; *x* is '1' for the hashed bag, '0' for plain.

    *M* is the hash modulus. When it is omitted the module-level ``M`` is
    used, which keeps three-argument calls working.
    """
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alnum_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', word_count(file_name))
    if x == '0':
        print('BoW =', BoW(file_name, stopwords))
    if x == '1':
        if M is None:
            M = globals()['M']
        print('BoW =', feature_harshing(BoW(file_name, stopwords), M))


def main():
    """Prompt for the inputs and show the report."""
    file_name = input("File name = ")
    x = input("Use feature hashing ? (y,Y,n,N) ").strip()
    while x not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        x = input("Use feature hashing ? (y,Y,n,N) ").strip()
    if x == 'y' or x == 'Y':
        M = int(input("M = ").strip())
        display(file_name, 'stopwords.txt', '1', M)
    else:
        display(file_name, 'stopwords.txt', '0')


if __name__ == '__main__':
    main()
# 6330350021 (20.35) 196 (2021-03-22 21:21)
ALNUM_ONLY = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def feature_hashing():
    """Ask once whether to hash.

    Returns True for y/Y and False for n/N. Any other answer prints
    'Try again.' and returns None.
    """
    a = input('Use feature hashing ? (y,Y,n,N) ')
    if a in ('y', 'Y'):
        return True
    if a in ('n', 'N'):
        return False
    print('Try again.')
    return None


def remove_punc(t):
    """Return *t* with every non-alphanumeric character made a space.

    Membership is tested against the exact letter and digit set; a range
    test from 'A' to 'z' would also let ``[ \\ ] ^ _ ` `` through.
    """
    return ''.join(e if e in ALNUM_ONLY else ' ' for e in t)


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    a = 0
    for i, ch in enumerate(w):
        a += ord(ch) * (37 ** i)
    return a % M


def scan(lines):
    """Return ``(char_count, alphanumeric_count, line_count, words)``.

    ``char_count`` excludes newlines and ``words`` is lower-cased.
    """
    char_count = alphanumeric_count = line_count = 0
    words = []
    for line in lines:
        line_count += 1
        body = line[:-1] if line.endswith('\n') else line
        char_count += len(body)
        alphanumeric_count += sum(1 for ch in body if ch in ALNUM_ONLY)
        words.extend(i.lower() for i in remove_punc(body).split())
    return char_count, alphanumeric_count, line_count, words


def tally(items):
    """Return ``[value, count]`` pairs ordered by value; ``[]`` if empty."""
    BoW = []
    for value in sorted(items):
        if BoW and BoW[-1][0] == value:
            BoW[-1][1] += 1
        else:
            BoW.append([value, 1])
    return BoW


def main():
    """Prompt for the inputs and print a single report for both modes."""
    with open('stopwords.txt', 'r') as fs:
        stopwords = {i.lower() for i in fs.read().split()}

    file_name = input('File name = ')
    how = feature_hashing()
    while how is None:
        how = feature_hashing()
    M = int(input('M = ')) if how else None
    print('-' * 19)

    with open(file_name, 'r') as fn:
        char_count, alphanumeric_count, line_count, words = scan(fn)
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', len(words))

    word_list = [i for i in words if i not in stopwords]
    if how:
        word_list = [fhash(j, M) for j in word_list]
    print('BoW =', tally(word_list))


if __name__ == '__main__':
    main()
# 6330351721 (30.00) 197 (2021-03-21 15:53)
#=====================================================
ALNUM_318 = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


#-----------------------------------------------------
def fhash(w, M):
    """Return ``sum(37**i * ord(w[i])) % M``."""
    x = 0
    for i, ch in enumerate(w):
        x += (37 ** i) * ord(ch)
    return x % M


def nosum(w):
    """Return the items of *w* without duplicates, first occurrence first."""
    y = []
    for item in w:
        if item not in y:
            y.append(item)
    return y


def nosumfinal(real):
    """Merge ``[key, count]`` pairs that share a key.

    *real* is sorted in place; the result is ordered by key and each key
    carries the sum of its counts.
    """
    real.sort()
    ans = []
    for key, amount in real:
        # After sorting, equal keys are adjacent.
        if not ans or ans[-1][0] != key:
            ans.append([key, 0])
        ans[-1][1] += amount
    return ans


def read_stats(lines):
    """Return ``(chars, alnums, c, z)`` for an iterable of lines.

    ``chars`` counts everything except the newline (leading and trailing
    blanks included), ``alnums`` letters and digits, ``c`` lines, and ``z``
    is the list of lower-cased words.
    """
    chars = alnums = c = 0
    z = []
    for line in lines:
        c += 1
        body = line[:-1] if line.endswith('\n') else line
        chars += len(body)
        alnums += sum(1 for ch in body if ch in ALNUM_318)
        spaced = ''.join(ch if ch in ALNUM_318 else ' ' for ch in body)
        z.extend(spaced.lower().split())
    return chars, alnums, c, z


#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
def main():
    """Prompt for the inputs and print the report."""
    aa = input('File name = ')
    bb = input('Use feature hashing ? (y,Y,n,N) ')
    with open(aa, 'r') as fn:
        chars, alnums, c, z = read_stats(fn)
    with open('stopwords.txt', 'r') as fn2:
        z2 = set(fn2.read().lower().split())

    while bb not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        bb = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if bb in ['y', 'Y'] else None

    print('-------------------')
    print('char count =', chars)
    print('alphanumeric count =', alnums)
    print('line count =', c)
    print('word count =', len(z))

    real = [word for word in z if word not in z2]
    if M is not None:
        xx = [[fhash(word, M), 1] for word in real]
    else:
        xx = [[word, 1] for word in real]
    print('BoW =', nosumfinal(xx))


#-------------------------------------------------
if __name__ == '__main__':
    main()
# 6330352321 (24.00) 198 (2021-03-22 03:58)
ALP_NUM = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789")


def fhash(w, m):
    """Return ``sum(ord(w[i]) * 37**i) % m``."""
    s = 0
    for i, ch in enumerate(w):
        s += ord(ch) * (37 ** i)
    return s % m


def load_lines(filename):
    """Return the lines of *filename*, newlines included."""
    with open(filename) as f:
        return f.readlines()


def split_text(text):
    """Return the lower-cased alphanumeric words of *text*."""
    st = "".join(e if e in ALP_NUM else " " for e in text)
    return st.lower().split()


def count_sorted(items):
    """Return ``[value, count]`` pairs ordered by value; ``[]`` if empty."""
    bow = []
    for value in sorted(items):
        if bow and bow[-1][0] == value:
            bow[-1][1] += 1
        else:
            bow.append([value, 1])
    return bow


def char_count(filename):
    """Return the number of characters in the file, newlines excluded."""
    return sum(1 for line in load_lines(filename) for e in line if e != "\n")


def alp_count(filename):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for line in load_lines(filename) for e in line if e in ALP_NUM)


def line_c(filename):
    """Return the number of lines in the file."""
    return len(load_lines(filename))


def word_count(filename):
    """Return the number of alphanumeric words in the file."""
    return len(split_text("".join(load_lines(filename))))


def kept_words(filename, stopwords):
    """Return the words of *filename* not listed in the file *stopwords*."""
    with open(stopwords) as f2:
        l2 = set(f2.read().lower().split())
    return [e for e in split_text("".join(load_lines(filename))) if e not in l2]


def bow_n(filename, stopwords):
    """Return the plain bag-of-words as sorted ``[word, count]`` pairs."""
    return count_sorted(kept_words(filename, stopwords))


def bow_y(filename, stopwords, m):
    """Return the hashed bag-of-words as sorted ``[hash, count]`` pairs."""
    return count_sorted(fhash(e, m) for e in kept_words(filename, stopwords))


#---------------------------------------------------------------------
def main():
    """Prompt for the inputs and print the report."""
    yesno = ["y", "Y", "n", "N"]
    file_name = input("File name = ")
    ufh = input("Use feature hashing ? (y,Y,n,N) ")
    while ufh not in yesno:
        print("Try again.")
        ufh = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if ufh in ("y", "Y") else None

    print("-" * 19)
    print("char count = " + str(char_count(file_name)))
    print("alphanumeric count = " + str(alp_count(file_name)))
    print("line count = " + str(line_c(file_name)))
    print("word count = " + str(word_count(file_name)))
    # The stop-word list lives in stopwords.txt.
    if M is None:
        print("BoW = " + str(bow_n(file_name, "stopwords.txt")))
    else:
        print("BoW = " + str(bow_y(file_name, "stopwords.txt", M)))


if __name__ == "__main__":
    main()
# 6330353021 (29.80) 199 (2021-03-21 22:17)
an = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % int(M)``."""
    f = 0
    for i, ch in enumerate(w):
        f += ord(ch) * 37 ** i
    return f % int(M)


def tokenize(text):
    """Return the lower-cased words of *text*, split on non-alphanumerics."""
    sow = ''.join(i if i in an else ' ' for i in text)
    return sow.lower().split()


def totals(lines):
    """Return ``(chc, anc, lc, low)`` for an iterable of lines.

    ``chc`` counts characters without newlines, ``anc`` alphanumerics,
    ``lc`` lines, and ``low`` is the list of all words.
    """
    chc = anc = lc = 0
    low = []
    for line in lines:
        lc += 1
        body = line[:-1] if line.endswith('\n') else line
        chc += len(body)
        anc += sum(1 for i in body if i in an)
        low.extend(tokenize(body))
    return chc, anc, lc, low


def count_pairs(items):
    """Return sorted ``[item, n]`` pairs for *items*."""
    seen = {}
    for item in items:
        seen[item] = seen.get(item, 0) + 1
    return sorted([item, n] for item, n in seen.items())


def main():
    """Prompt for the inputs and print the report."""
    f = input('File name = ')
    h = input("Use feature hashing ? (y,Y,n,N) ")
    sl = '-' * 19
    with open(f, 'r') as o:
        chc, anc, lc, low = totals(o)
    with open('stopwords.txt', 'r') as o2:
        low2 = set(tokenize(o2.read()))

    while h not in ['n', 'N', 'y', 'Y']:
        print('Try again.')
        h = input("Use feature hashing ? (y,Y,n,N) ")
    m = int(input('M = ')) if h in ('y', 'Y') else None

    print(sl)
    print('char count = ' + str(chc))
    print('alphanumeric count = ' + str(anc))
    print('line count = ' + str(lc))
    print('word count = ' + str(len(low)))

    fw = [w for w in low if w not in low2]
    if m is not None:
        fw = [fhash(w, m) for w in fw]
    # Both modes print the bag in sorted order.
    print('BoW = ' + str(count_pairs(fw)))


if __name__ == '__main__':
    main()
# 6330354621 (26.67) 200 (2021-03-22 23:28)
#------------------------------------------------------------------------------
def stopwordtolist(path='stopwords.txt'):
    """Return the stop words stored in *path* (stopwords.txt by default)."""
    with open(path) as z:
        return z.read().split()


#------------------------------------------------------------------------------
def alphanum(word):
    """Return *word* with every character outside a-z and 0-9 made a space.

    Upper-case letters are replaced too, so callers pass lower-cased text.
    """
    return ''.join(
        i if i in 'abcdefghijklmnopqrstuvwxyz0123456789' else ' '
        for i in word)


#------------------------------------------------------------------------------
def linecount(k):
    """Return the number of lines in the file *k* (0 for an empty file)."""
    with open(k) as file:
        return sum(1 for _ in file)


#------------------------------------------------------------------------------
def texttosent(file):
    """Return the lower-cased non-blank lines of *file* joined by spaces."""
    a = ''
    with open(file) as handle:
        for line in handle:
            if line != "\n":
                a += line.lower().strip('\n') + ' '
    return a


#------------------------------------------------------------------------------
def charcount(file):
    """Return the number of characters in *file*, newlines excluded."""
    with open(file) as handle:
        return sum(len(line.strip('\n')) for line in handle)


#------------------------------------------------------------------------------
def allChar(l1):
    """Return *l1* with whitespace removed and non-alphanumerics blanked."""
    return alphanum(''.join(l1.split()))


#------------------------------------------------------------------------------
def counted(items):
    """Return sorted ``[item, num]`` pairs for *items*."""
    nums = {}
    for item in items:
        nums[item] = nums.get(item, 0) + 1
    return sorted([item, num] for item, num in nums.items())


def BoW(word):
    """Return the sorted bag-of-words of the space-separated string *word*."""
    return counted(word.split())


#feature hashing---------------------------------------------------------------
def BoWfe(w, m):
    """Return the sorted bag of feature hashes (modulo *m*) for string *w*."""
    return counted(fe(i, m) for i in w.split())


#------------------------------------------------------------------------------
def fe(w, m):
    """Return ``sum(ord(w[k]) * 37**k) % m``."""
    sum1 = 0
    for k, i in enumerate(w):
        sum1 += ord(i) * (37 ** k)
    return sum1 % m


#------------------------------------------------------------------------------
def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    choice = input('Use feature hashing ? (y,Y,n,N) ')
    while choice not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        choice = input('Use feature hashing ? (y,Y,n,N) ')
    m = input('M = ') if choice in ('y', 'Y') else None
    print('-------------------')

    a = texttosent(file_name)
    n = linecount(file_name)
    b = set(stopwordtolist())
    alpha = alphanum(a)
    alpha2 = ''.join(alpha.split())
    # Words left after the stop words are removed, used for the bag.
    cut_word = ' '.join([i for i in alpha.split() if i not in b])
    print('char count =', charcount(file_name))
    print('alphanumeric count =', len(alpha2))
    print('line count =', n)
    print('word count =', len(alpha.split()))
    if m is not None:
        print('BoW =', BoWfe(cut_word, int(m)))
    else:
        print('BoW =', BoW(cut_word))


if __name__ == '__main__':
    main()
# 6330355221 (29.00) 201 (2021-03-22 23:03)
#---------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M
#---------------------------------------
def _is_alnum(ch):
    """Tell whether *ch* is a lower-case ASCII letter or a digit."""
    return 'a' <= ch <= 'z' or '0' <= ch <= '9'


def _split_words(text):
    """Split *text* into maximal runs of lower-case letters and digits."""
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in text).split()
#---------------------------------------
def main():
    """Prompt for the file and hashing mode, then print counts and the BoW."""
    list_Fileimport = input('File name = ')
    thename_char = input('Use feature hashing ? (y,Y,n,N) ')
    # A tuple is required here: `x in 'yYnN'` is a substring test that also
    # accepts '' and 'yY'.
    while thename_char not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        thename_char = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = thename_char in ('y', 'Y')
    if hashing:
        case_one = int(input("M = "))
    print('-------------------')
    with open('stopwords.txt') as handle:
        stop_text = handle.read().lower()
    with open(list_Fileimport) as handle:
        # Newlines at either end are dropped before anything is counted.
        open_file = handle.read().lower().strip('\n')
    newlines = open_file.count('\n')
    vocab_one = _split_words(open_file)
    print('char count =', len(open_file) - newlines)
    print('alphanumeric count =', sum(1 for ch in open_file if _is_alnum(ch)))
    print('line count =', newlines + 1)
    print('word count =', len(vocab_one))
    stop = set(_split_words(stop_text))
    kept = [word for word in vocab_one if word not in stop]
    if hashing:
        kept = [fhash(word, case_one) for word in kept]
    counts = {}
    for item in kept:
        counts[item] = counts.get(item, 0) + 1
    finalBoW = sorted([item, total] for item, total in counts.items())
    print('BoW =', finalBoW)


if __name__ == '__main__':
    main()
# 6330356921 (22.80) 202 (2021-03-21 22:58)
#---------------------------------------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)
#---------------------------------------------------------------------
def cutpunc(N):
    """Delete the characters " ' / \\ , . : ; from N and map newlines to spaces."""
    drop = "\"'/\\,.:;"
    return ''.join(' ' if c == '\n' else c for c in N if c not in drop)
#---------------------------------------------------------------------
def cutword(N):
    """Return the lower-cased words of N that are not stop words.

    Each kept word is followed by one space, so the result ends with a space
    whenever it is non-empty.
    """
    words = cutpunc(N).lower().split()
    with open("stopwords.txt", "r") as handle:
        stop = set(cutpunc(handle.read()).split())
    return ''.join(word + ' ' for word in words if word not in stop)
#---------------------------------------------------------------------
def _run_lengths(items):
    """Collapse an already-sorted sequence into [[item, count], ...]."""
    runs = []
    for item in items:
        if runs and runs[-1][0] == item:
            runs[-1][1] += 1
        else:
            runs.append([item, 1])
    return runs
#---------------------------------------------------------------------
def BOW1(N):
    """Return the sorted bag of words of text N as [[word, count], ...]."""
    return _run_lengths(sorted(cutword(N).split()))
#---------------------------------------------------------------------
def BOW2(N, m=None):
    """Return the sorted bag of hashed words of text N as [[hash, count], ...].

    *m* is the number of hash buckets; when omitted the module-level ``M``
    is used, which is how the original script called this function.
    """
    if m is None:
        m = M  # legacy fallback to the global set by the caller
    return _run_lengths(sorted(fhash(word, m) for word in cutword(N).split()))
#---------------------------------------------------------------------
def main():
    """Read the file, ask for the hashing mode, then print the report."""
    d = input("File name = ")
    x = ""
    character_count = 0
    line_count = 0
    with open(d, "r") as a:
        for lines in a:
            x += lines
            # Exclude the line terminator itself; the old "total minus
            # (lines - 1)" rule was off by one for files ending in a newline.
            character_count += len(lines.rstrip("\n"))
            line_count += 1
    z = cutpunc(x)
    word_count = len(z.split())
    alphanumeric_count = len(z) - z.count(" ")
#---------------------------------------------------------------------
    b = input("Use feature hashing ? (y,Y,n,N) ")
    while b not in ("y", "Y", "n", "N"):
        print("Try again.")
        b = input("Use feature hashing ? (y,Y,n,N) ")
    m = int(input("M = ")) if b in ("y", "Y") else None
    print("-------------------")
    print("char count =", character_count)
    print("alphanumeric count =", alphanumeric_count)
    print("line count =", line_count)
    print("word count =", word_count)
    print("BoW =", BOW1(x) if m is None else BOW2(x, m))


if __name__ == "__main__":
    main()
# 6330357521 (30.00) 203 (2021-03-22 01:23)
def stopwords(file_dir):
    """Return all whitespace-separated words of the file at *file_dir*."""
    with open(file_dir, 'r') as file:
        return file.read().split()


def do_hash():
    """Ask until the user answers y/Y (returns True) or n/N (returns False)."""
    while True:
        do_ten = input("Use feature hashing ? (y,Y,n,N) ")
        if do_ten in ('y', 'Y'):
            return True
        if do_ten in ('n', 'N'):
            return False
        print("Try again.")


def hasher(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo *m*."""
    return sum(ord(c) * 37 ** i for i, c in enumerate(w)) % m


def count_bow(bow):
    """Return [[item, count], ...] for *bow* in first-seen order."""
    # A dict keeps insertion order and avoids the quadratic list.index scan.
    totals = {}
    for w in bow:
        totals[w] = totals.get(w, 0) + 1
    return [[w, n] for w, n in totals.items()]


def print_results(num_chars, num_alpha_numeric, num_lines, num_words, bow_count):
    """Print the four counters and the bag of words in the expected format."""
    print("char count =", num_chars)
    print("alphanumeric count =", num_alpha_numeric)
    print("line count =", num_lines)
    print("word count =", num_words)
    print("BoW =", bow_count)


def main():
    """Collect statistics for the chosen file and print them."""
    stop_words = set(stopwords("stopwords.txt"))
    num_lines = 0
    num_words = 0
    num_chars = 0
    num_alpha_numeric = 0
    converted_words = []
    filename = input("File name = ")
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip('\n')
            num_lines += 1
            num_chars += len(line)
            num_alpha_numeric += sum(char.isalnum() for char in line)
            # Non-alphanumerics become separators; the rest is lower-cased.
            cleaned = ''.join(
                char.lower() if char.isalnum() else ' ' for char in line
            )
            words = cleaned.split()
            converted_words.extend(words)
            num_words += len(words)
    bag_of_words = [w for w in converted_words if w not in stop_words]
    if do_hash():
        m = int(input("M = "))
        bag_of_words = [hasher(w, m) for w in bag_of_words]
    bow_count = sorted(count_bow(bag_of_words), key=lambda pair: pair[0])
    print_results(num_chars, num_alpha_numeric, num_lines, num_words, bow_count)


if __name__ == "__main__":
    main()
# 6330358121 (30.00) 204 (2021-03-22 23:46)
def condit():
    """Ask for the hashing mode; return M as an int, or 'x' when hashing is off."""
    while True:
        s = input('Use feature hashing ? (y,Y,n,N) ').lower()
        if s == 'y':
            return int(input('M = '))
        if s == 'n':
            return 'x'
        print('Try again.')


def fhash(w, M):
    """Return sum(ord(w[j]) * 37**j) modulo M."""
    return sum(ord(ch) * 37 ** j for j, ch in enumerate(w)) % M


def check_char(t):
    """Tell whether *t* is a lower-case ASCII letter or a digit."""
    return 'a' <= t <= 'z' or '0' <= t <= '9'


def show_BoW(g):
    """Sort *g* in place and return its [[item, count], ...] pairs in order."""
    g.sort()
    pairs = []
    for item in g:
        if pairs and pairs[-1][0] == item:
            pairs[-1][1] += 1
        else:
            pairs.append([item, 1])
    return pairs


def data_line(l):
    """Return [stripped length, alphanumeric count, word list] for line *l*."""
    alph_line = sum(1 for e in l if check_char(e))
    cleaned = ''.join(e if check_char(e) else ' ' for e in l)
    return [len(l.strip()), alph_line, cleaned.split()]
#-------------------------------------------------------------
def main():
    """Read the file, gather the statistics and print the report."""
    with open(input('File name = '), 'r') as fn:
        lines = fn.readlines()
    with open('stopwords.txt', 'r') as handle:
        # A set of whole words: testing against the raw file text would be a
        # substring match and would drop e.g. "he" because of "the".
        stop = set(handle.read().split())
    m = condit()
    print('-------------------')
    char = 0
    alph = 0
    words = 0
    line = 0
    sonjai = []
    # Iterating the lines directly also handles a blank first line, which
    # used to terminate the readline loop immediately.
    for raw in lines:
        p, q, h = data_line(raw.lower())
        line += 1
        char += p
        alph += q
        words += len(h)
        sonjai.extend(e for e in h if e not in stop)
#------------------------------------------------------------
    print('char count =', char)
    print('alphanumeric count =', alph)
    print('line count =', line)
    print('word count =', words)
    if m != 'x':
        sonjai = [fhash(e, m) for e in sonjai]
    print('BoW =', show_BoW(sonjai))
#------------------------------------------------------------


if __name__ == '__main__':
    main()
# 6330360321 (30.00) 205 (2021-03-22 19:57)
eng_word = list('abcdefghijklmnopqrstuvwxyz')
num = list('0123456789')


def cut_noodle(n1):
    """Lower-case *n1* and replace every non a-z/0-9 character with a space."""
    return ''.join(
        ch if ch in eng_word or ch in num else ' ' for ch in n1.lower()
    )


def stop_words():
    """Return the cleaned lines of stopwords.txt joined by single spaces."""
    with open('stopwords.txt', 'r') as a:
        lines = a.readlines()
    # Only the trailing newline is removed before cleaning each line.
    return ' '.join(
        cut_noodle(n[:-1] if '\n' in n else n) for n in lines
    )


def cut_stopword(word, stop_list=None):
    """Return the items of *word* that are not stop words.

    *stop_list* defaults to the words of stopwords.txt, loaded on demand, so
    the function no longer depends on a global filled in at import time.
    """
    if stop_list is None:
        stop_list = stop_words().split()
    blocked = set(stop_list)
    return [w for w in word if w not in blocked]


def cout_line(file_name):
    """Return the number of lines in *file_name*."""
    with open(file_name, 'r') as f:
        return len(f.readlines())


def cout_charater(file_name):
    """Return the number of characters in *file_name*, line ends excluded."""
    with open(file_name, 'r') as f:
        return sum(len(e[:-1]) if '\n' in e else len(e) for e in f)


def fhash(word, M):
    """Return sum(ord(word[n]) * 37**n) modulo M."""
    return sum(ord(ch) * 37 ** n for n, ch in enumerate(word)) % M


def list_fhash(list_word, M):
    """Return the fhash value of every word in *list_word*."""
    return [fhash(n, M) for n in list_word]


def fhash_way():
    """Ask for the hashing mode; return ('No', 0) or ('Yes', <M as typed>)."""
    while True:
        a = input('Use feature hashing ? (y,Y,n,N) ')
        if a in ['n', 'N']:
            return 'No', 0
        if a in ['y', 'Y']:
            return 'Yes', input('M = ')
        print('Try again.')


def list_of_words(file_name):
    """Return the lower-cased alphanumeric words of *file_name* in order."""
    with open(file_name) as f:
        lines = [n[:-1] if '\n' in n else n for n in f]
    return cut_noodle(' '.join(lines)).split()


def cout_alpha(file_name):
    """Return the total length of all words in *file_name*."""
    return sum(len(n) for n in list_of_words(file_name))


def BoW(way, M, flie_name, stop_list=None):
    """Print the sorted bag of words of *flie_name*.

    *way* is 'Yes' to hash the words into M buckets or 'No' to keep them as
    text; any other value prints nothing, as before.
    """
    f = cut_stopword(list_of_words(flie_name), stop_list)
    if way == 'Yes':
        f = list_fhash(f, M)
    elif way != 'No':
        return
    totals = {}
    for n in f:
        totals[n] = totals.get(n, 0) + 1
    print('BoW =', sorted([n, c] for n, c in totals.items()))


def main():
    """Load the stop words, prompt the user and print the report."""
    stop_word = stop_words().split()
    file_name = input('File name = ')
    way, M = fhash_way()
    print('-------------------')
    print('char count =', cout_charater(file_name))
    print('alphanumeric count =', cout_alpha(file_name))
    print('line count =', cout_line(file_name))
    print('word count =', len(list_of_words(file_name)))
    BoW(way, int(M), file_name, stop_word)


if __name__ == '__main__':
    main()
# 6330361021 (27.28) 206 (2021-03-22 23:15)
# Characters kept inside words. The original constant was missing 'W', which
# split every word containing an upper-case W.
_WORD_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo *m*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def stopword():
    """Return the distinct lower-cased words of stopwords.txt in file order."""
    with open('stopwords.txt', 'r') as stop:
        return list(dict.fromkeys(stop.read().lower().split()))


def _clean_lines(file):
    """Return each stripped line lower-cased with non-alphanumerics blanked."""
    with open(file, 'r') as readfile:
        return [
            ''.join(j.lower() if j in _WORD_CHARS else ' ' for j in i.strip())
            for i in readfile
        ]


def makewords(file):
    """Return the words of *file* that are not stop words, in order."""
    # Load the stop words once instead of re-reading the file per word.
    blocked = set(stopword())
    return [j for line in _clean_lines(file) for j in line.split()
            if j not in blocked]


def makebow(words):
    """Return [[word, count], ...] for *words* in first-seen order."""
    totals = {}
    for i in words:
        totals[i] = totals.get(i, 0) + 1
    return [[i, n] for i, n in totals.items()]


def changebow(bow, m):
    """Hash the words of *bow* into *m* buckets; return pairs sorted by bucket."""
    # Only occupied buckets are stored, so a large m costs nothing extra.
    totals = {}
    for word, n in bow:
        bucket = fhash(word, m)
        totals[bucket] = totals.get(bucket, 0) + n
    return [[bucket, totals[bucket]] for bucket in sorted(totals)]


def charcount(file):
    """Print the number of characters in the stripped lines of *file*."""
    with open(file, 'r') as readfile:
        n = sum(len(i.strip()) for i in readfile)
    print('char count =', n)


def alphanumeric(file):
    """Print the number of alphanumeric characters in *file*."""
    n = sum(len(j) for line in _clean_lines(file) for j in line.split())
    print('alphanumeric count =', n)


def linecount(file):
    """Print the number of lines in *file*."""
    with open(file, 'r') as readfile:
        n = sum(1 for _ in readfile)
    print('line count =', n)


def wordcount(file):
    """Print the number of words in *file*."""
    n = sum(len(line.split()) for line in _clean_lines(file))
    print('word count =', n)


def show(file):
    """Print the four counters for *file*."""
    charcount(file)
    alphanumeric(file)
    linecount(file)
    wordcount(file)


def main():
    """Prompt for the file and hashing mode, then print the report."""
    file = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    while feature not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    if feature in ['y', 'Y']:
        m = int(input('M = '))
        show(file)
        print('BoW =', changebow(makebow(makewords(file)), m))
    else:
        show(file)
        print('BoW =', makebow(makewords(file)))
#---------------------------------------------------------------------------------------


if __name__ == '__main__':
    main()
# 6330362621 (29.00) 207 (2021-03-22 00:39)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _alnum(ch):
    """Tell whether *ch* is a lower-case ASCII letter or a digit."""
    return 'a' <= ch <= 'z' or '0' <= ch <= '9'


def _tokens(text):
    """Split *text* into maximal runs of lower-case letters and digits."""
    return ''.join(ch if _alnum(ch) else ' ' for ch in text).split()


def main():
    """Prompt for the file and hashing mode, then print counts and the BoW."""
    filename = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    # Compare against whole answers; `feature in 'yYnN'` would also accept
    # the empty string and multi-letter substrings such as 'Yn'.
    while feature not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = feature in ('y', 'Y')
    if hashing:
        M = int(input("M = "))
    print('-------------------')
    with open(filename) as handle:
        # Newlines at either end are dropped before anything is counted.
        text = handle.read().lower().strip('\n')
    with open('stopwords.txt') as handle:
        stop = set(_tokens(handle.read().lower()))
    linebreaks = text.count('\n')
    word = _tokens(text)
    print('char count =', len(text) - linebreaks)
    print('alphanumeric count =', sum(1 for e in text if _alnum(e)))
    print('line count =', linebreaks + 1)
    print('word count =', len(word))
    word = [e for e in word if e not in stop]
    if hashing:
        word = [fhash(e, M) for e in word]
    totals = {}
    for e in word:
        totals[e] = totals.get(e, 0) + 1
    BoW = sorted([e, n] for e, n in totals.items())
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330365521 (19.15) 208 (2021-03-21 17:23)
def fhash(w, m):
    """Return sum(ord(w[a]) * 37**a) modulo int(m)."""
    return sum(ord(ch) * 37 ** a for a, ch in enumerate(w)) % int(m)


def _is_alnum(ch):
    """Tell whether *ch* is a lower-case ASCII letter or a digit."""
    return 'a' <= ch <= 'z' or '0' <= ch <= '9'


def _count_runs(items):
    """Collapse an already-sorted list into [[item, count], ...]."""
    runs = []
    for item in items:
        if runs and runs[-1][0] == item:
            runs[-1][1] += 1
        else:
            runs.append([item, 1])
    return runs


def main():
    """Prompt for the file and hashing mode, then print counts and the BoW."""
    fname = input("File name = ")
    ha = input("Use feature hashing ? (y,Y,n,N) ")
    while ha not in ["y", "Y", "n", "N"]:
        print("Try again.")
        ha = input("Use feature hashing ? (y,Y,n,N) ")
    with open("stopwords.txt") as fstw:
        stw = set(fstw.read().split())
    with open(fname) as f:
        # Blank lines at the very end are not counted, matching the original
        # line-count bookkeeping.
        body = f.read().rstrip('\n')
    lines = [lin.strip().lower() for lin in body.split('\n')] if body else []
    linecount = len(lines)
    charcout = sum(len(lin) for lin in lines)
    # An explicit alphanumeric test replaces the punctuation blacklist, which
    # counted any symbol missing from the list as part of a word.
    alpcount = sum(1 for lin in lines for x in lin if _is_alnum(x))
    word2 = []
    for lin in lines:
        word2 += ''.join(x if _is_alnum(x) else ' ' for x in lin).split()
    wordcount = len(word2)
    word3 = sorted(x for x in word2 if x not in stw)
    if ha in ["y", "Y"]:
        m = int(input("M = "))
        bow = _count_runs(sorted(fhash(wo, m) for wo in word3))
    else:
        bow = _count_runs(word3)
    print('-' * 19)
    print("char count =", charcout)
    print("alphanumeric count =", alpcount)
    print("line count =", linecount)
    print("word count =", wordcount)
    print("BoW =", bow)


if __name__ == "__main__":
    main()
# 6330366121 (23.40) 209 (2021-03-21 17:03)
#-----------------------------------------------------------------
def char_count(H):
    """Return the number of characters in H."""
    return len(H)


def alphanumeric_count(I):
    """Return how many characters of I are lower-case letters or digits."""
    A = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return sum(1 for ch in I if ch in A)


def word_count(J):
    """Return (number of words, list of words) for the text J.

    A word is a maximal run of lower-case letters and digits.
    """
    alphanum = 'abcdefghijklmnopqrstuvwxyz0123456789'
    Sakura = ''.join(ch if ch in alphanum else ' ' for ch in J).split()
    return len(Sakura), Sakura


def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M


def BOW_Y(Sasuke, M):
    """Return [[hash, count], ...] for the words in first-seen order."""
    totals = {}
    for word in Sasuke:
        key = fhash(word, M)
        totals[key] = totals.get(key, 0) + 1
    return [[key, n] for key, n in totals.items()]


def BOW_N(Sasuke):
    """Return [[word, count], ...] for the words in first-seen order."""
    totals = {}
    for word in Sasuke:
        totals[word] = totals.get(word, 0) + 1
    return [[word, n] for word, n in totals.items()]
#-----------------------------------------------------------------
def main():
    """Load the inputs, prompt for the hashing mode and print the report."""
    # stop-word file
    with open("stopwords.txt", 'r') as stop_words:
        Madara = set(stop_words.read().lower().split())
    # convert the text to lower case
    Hokage1 = ''   # used for char count and alphanumeric count
    Hokage2 = ''   # used for word count (lines kept apart by a space)
    z = 0
    with open(input("File name = "), "r") as file_name:
        for line in file_name:
            Hokage1 += line.strip("\n").lower()
            Hokage2 += " " + line.strip("\n").lower()
            z += 1
    # input
    Naruto = input("Use feature hashing ? (y,Y,n,N) ")
    # Whole-answer comparison: `in 'yYnN'` also accepted '' and 'yY'.
    while Naruto not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        Naruto = input("Use feature hashing ? (y,Y,n,N) ")
    hashing = Naruto in ('y', 'Y')
    if hashing:
        M = int(input("M = "))
    x = char_count(Hokage1)
    y = alphanumeric_count(Hokage1)
    w, Sakura = word_count(Hokage2)
    D = [word for word in Sakura if word not in Madara]
    bow = BOW_Y(D, M) if hashing else BOW_N(D)
    print('-------------------')
    print('char count = ' + str(x))
    print('alphanumeric count = ' + str(y))
    print('line count = ' + str(z))
    print('word count = ' + str(w))
    print("BoW =", sorted(bow))


if __name__ == "__main__":
    main()
# 6330367821 (26.00) 210 (2021-03-21 23:06)
# Characters that may appear inside a word.
q = list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def lis2(li1, li2):
    """Pair the items of li1 and li2 positionally as [[a, b], ...]."""
    return [[left, right] for left, right in zip(li1, li2)]


def _stripped_lines(name):
    """Return (raw line count, list of whitespace-stripped lines) of a file."""
    with open(name, 'r') as f:
        lines = f.readlines()
    return len(lines), [line.strip() for line in lines]


def _blank(text):
    """Replace every character of *text* that is not in q with a space."""
    return ''.join(ch if ch in q else ' ' for ch in text)


def _kept_words(name):
    """Return the lower-cased words of a file that are not stop words."""
    _, stripped = _stripped_lines(name)
    with open('stopwords.txt', 'r') as f:
        blocked = set(f.read().split())
    words = []
    for line in stripped:
        words += _blank(line).lower().split()
    return [word for word in words if word not in blocked]


def _sorted_counts(items):
    """Return [[item, count], ...] sorted by item; [] for an empty input."""
    totals = {}
    for item in items:
        totals[item] = totals.get(item, 0) + 1
    return lis2(sorted(totals), [totals[key] for key in sorted(totals)])


def first(name=None):
    """Print the separator and the four counters.

    *name* defaults to the module-level ``file_name`` the script used to set.
    """
    name = file_name if name is None else name
    total_lines, stripped = _stripped_lines(name)
    print('-------------------')
    print('char count =', sum(len(line) for line in stripped))
    print('alphanumeric count =',
          sum(1 for line in stripped for ch in line if ch in q))
    print('line count =', total_lines)
    print('word count =', sum(len(_blank(line).split()) for line in stripped))


def feature_n(name=None):
    """Print the counters followed by the sorted bag of words."""
    name = file_name if name is None else name
    first(name)
    # The dict-based count also copes with zero or one remaining word, which
    # used to raise IndexError.
    print('BoW =', _sorted_counts(_kept_words(name)))


def feature_y(name=None, m=None):
    """Print the counters followed by the sorted bag of hashed words.

    *m* defaults to the module-level ``a`` the script used to set.
    """
    name = file_name if name is None else name
    m = int(a if m is None else m)
    first(name)
    print('BoW =', _sorted_counts(fhash(word, m) for word in _kept_words(name)))


def main():
    """Prompt for the file and hashing mode, then print the report."""
    name = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    while feature not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    if feature in ['n', 'N']:
        feature_n(name)
    else:
        feature_y(name, input('M = '))


if __name__ == '__main__':
    main()
# 6330370621 (20.00) 211 (2021-03-21 16:19)
def char_count(x):
    """Return, as a string, the characters in the lines of x minus newlines."""
    c = 0
    for line in x:
        c += len(line) - 1 if "\n" in line else len(line)
    return str(c)


def alphnum_count(x):
    """Return, as a string, how many ASCII letters and digits the lines hold."""
    c = 0
    for line in x:
        c += sum(1 for e in line
                 if 'A' <= e <= 'Z' or 'a' <= e <= 'z' or '0' <= e <= '9')
    return str(c)


def line_count(x):
    """Return, as a string, the number of lines in x."""
    return str(sum(1 for _ in x))


def blank(t):
    """Return t with every non-alphanumeric character replaced by a space."""
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    return ''.join(e if e in letters else ' ' for e in t)


def word_count(x):
    """Return, as a string, the number of words in the lines of x."""
    return str(sum(len(blank(line).split()) for line in x))


def remove_stopwords(w, stop_list=None):
    """Return the items of w that are not stop words.

    *stop_list* defaults to the module-level ``stopwords`` used by the
    original script.
    """
    if stop_list is None:
        stop_list = stopwords  # legacy fallback to the global list
    blocked = set(stop_list)
    return [e for e in w if e not in blocked]


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def main():
    """Prompt for the file and hashing mode, then print counts and the BoW."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
    while fh not in ('N', 'Y'):
        print('Try again.')
        # The retry must be upper-cased too, otherwise 'y'/'n' never match.
        fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
    use_fh = fh == 'Y'
    if use_fh:
        M = int(input('M = '))
    with open("stopwords.txt", "r") as fstop:
        stop_list = fstop.read().split()
    print('-------------------')
    with open(file_name, "r") as fin:
        lines = fin.readlines()
    print("char count =", char_count(lines))
    print('alphanumeric count =', alphnum_count(lines))
    print('line count =', line_count(lines))
    print('word count =', word_count(lines))
    # The bag of words comes from the chosen file, not a fixed "sample.txt".
    words = []
    for line in lines:
        words += blank(line.lower()).split()
    words = remove_stopwords(words, stop_list)
    if use_fh:
        # Sort the hashes as numbers; as strings '10' would sort before '9'.
        words = sorted(fhash(e, M) for e in words)
    totals = {}
    for e in words:
        totals[e] = totals.get(e, 0) + 1
    BoW = [[e, n] for e, n in totals.items()]
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330371221 (13.00) 212 (2021-03-22 22:56)
q = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def main():
    """Prompt for the file and hashing mode, then print counts and the BoW."""
    file_name = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    while b not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    # Keep M: the original echoed the answer with print() and discarded it.
    M = int(input('M = ')) if b in ['y', 'Y'] else None
    with open("stopwords.txt", "r") as c:
        stop = set(c.read().split())
    with open(file_name, "r") as a:
        lines = a.readlines()
    line = len(lines)
    # Strip the terminator per line instead of assuming every line has one.
    characters = sum(len(f.rstrip('\n')) for f in lines)
    alpha = sum(1 for f in lines for ch in f if ch in q)
    wordslist = []
    for f in lines:
        # Words are runs of alphanumerics, so punctuation separates them too.
        wordslist += ''.join(ch.lower() if ch in q else ' ' for ch in f).split()
    word = len(wordslist)
    print('-' * 19)
    # No trailing space in the labels: print() already inserts one.
    print('char count =', characters)
    print('alphanumeric count =', alpha)
    print('line count =', line)
    print('word count =', word)
    # NOTE(review): the original never printed a bag of words although fhash
    # and the stop-word file were prepared; this completes it in the same
    # "BoW = [[item, count], ...]" form the neighbouring submissions use.
    kept = [w for w in wordslist if w not in stop]
    if M is not None:
        kept = [fhash(w, M) for w in kept]
    totals = {}
    for w in kept:
        totals[w] = totals.get(w, 0) + 1
    print('BoW =', sorted([w, n] for w, n in totals.items()))


if __name__ == '__main__':
    main()
# 6330372921 (26.00) 213 (2021-03-19 13:54)
def fh(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(c) * G ** power for power, c in enumerate(w)) % M


def remove_stopwords(string):
    """Return *string* without the words listed in stopwords.txt."""
    with open("stopwords.txt") as file:
        stopwords = set(file.read().split())
    return " ".join(word for word in string.split() if word not in stopwords)


def BoW_noHash(string):
    """Return [[word, count], ...] for *string* in first-seen order."""
    totals = {}
    for word in string.split():
        totals[word] = totals.get(word, 0) + 1
    return [[word, n] for word, n in totals.items()]


def BoW_Hash(string, M):
    """Return [[hash, count], ...] for *string* in first-seen order."""
    totals = {}
    for word in string.split():
        key = fh(word, M)
        totals[key] = totals.get(key, 0) + 1
    return [[key, n] for key, n in totals.items()]


def process_file(file):
    """Scan an iterable of lines.

    Returns [char count, alphanumeric count, line count, word count, text],
    where *text* is the lower-cased input with every non-alphanumeric
    character replaced by a space and one space appended per line.
    """
    characters = 0
    alphanumeric = 0
    line_count = 0
    word_count = 0
    pieces = []
    for line in file:
        line = line.strip()
        characters += len(line)
        alphanumeric += sum(1 for char in line if char.isalnum())
        cleaned = "".join(
            char.lower() if char.isalnum() else " " for char in line
        )
        # Counting the split words also counts a word that ends the line,
        # which the separator-triggered counter used to miss.
        word_count += len(cleaned.split())
        pieces.append(cleaned + " ")
        line_count += 1
    return [characters, alphanumeric, line_count, word_count, "".join(pieces)]


def main():
    """Prompt for the file and hashing mode, then print the report."""
    file_name = input("File name = ")
    with open(file_name) as file:
        lines = file.readlines()
    h = input("Use feature hashing ? (y,Y,n,N) ")
    while h not in ("y", "Y", "n", "N"):
        print("Try again.")
        h = input("Use feature hashing ? (y,Y,n,N) ")
    hashing = h in ("y", "Y")
    M = int(input("M = ")) if hashing else 0
    print("-" * 19)
    count_result = process_file(lines)
    print("char count = " + str(count_result[0]))
    print("alphanumeric count = " + str(count_result[1]))
    print("line count = " + str(count_result[2]))
    print("word count = " + str(count_result[3]))
    process_string = remove_stopwords(count_result[4])
    print("BoW = ", end="")
    if hashing:
        print(BoW_Hash(process_string, M))
    else:
        print(BoW_noHash(process_string))


if __name__ == "__main__":
    main()
# 6330374121 (29.00) 214 (2021-03-18 15:00)
#input string output list - special characters & whitespace
def list_word(s):
    """Split *s* into words, treating every non-alphanumeric as a separator."""
    return ''.join(e if e.isalnum() else ' ' for e in s).split()


def fHash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


#input string output list --> 'BoW ='
def BoW(s, stopword_list=None, m=None):
    """Return the bag of words of text *s*.

    stopword_list: words to ignore; read from stopwords.txt when omitted.
    m: number of hash buckets; when None the words are returned unhashed as
       [[word, count], ...] in first-seen order, otherwise the result is
       [[bucket, count], ...] sorted by bucket with empty buckets left out.
    """
    if stopword_list is None:
        with open('stopwords.txt', 'r') as f_stopword:
            stopword_list = f_stopword.read().split()
    blocked = set(stopword_list)
    totals = {}
    for e in list_word(s.lower()):
        if e not in blocked:
            totals[e] = totals.get(e, 0) + 1
    if m is None:
        return [[e, n] for e, n in totals.items()]
    # Only occupied buckets are stored instead of a list of all m buckets.
    buckets = {}
    for e, n in totals.items():
        key = fHash(e, m)
        buckets[key] = buckets.get(key, 0) + n
    return [[key, buckets[key]] for key in sorted(buckets)]
#------------------------------------------------
def main():
    """Prompt for the file and hashing mode, then print the report."""
    file_name = input('File name = ')
    modeHashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
    # Whole-answer comparison: `not in 'ny'` let '' and 'ny' through.
    while modeHashing not in ('n', 'y'):
        print('Try again.')
        modeHashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
    M = int(input('M = ')) if modeHashing == 'y' else None
    print('-------------------')
    with open('stopwords.txt', 'r') as f_stopword:
        stopword_list = f_stopword.read().split()
    char_count = 0
    alnum_count = 0
    line_count = 0
    word_count = 0
    word_string = ''
    with open(file_name, 'r') as f_work:
        for line in f_work:
            line_count += 1
            char_count += len(line.strip())
            alnum_count += sum(
                1 for e in line.lower() if 'a' <= e <= 'z' or '0' <= e <= '9'
            )
            word_count += len(list_word(line))
            word_string += line
    print('char count =', char_count)
    print('alphanumeric count =', alnum_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', BoW(word_string, stopword_list, M))


if __name__ == '__main__':
    main()
# 6330375821 (24.35) 215 (2021-03-22 22:25)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def PrBow(word):
    """Print and return the sorted [[item, count], ...] pairs of *word*."""
    totals = {}
    for e in word:
        totals[e] = totals.get(e, 0) + 1
    Bow = sorted([e, n] for e, n in totals.items())
    print('BoW =', Bow)
    return Bow


def _alnum(ch):
    """Tell whether *ch* is a lower-case ASCII letter or a digit.

    The original test 'A' <= ch <= 'z' also accepted the six ASCII characters
    that sit between 'Z' and 'a' (such as '_' and '^').
    """
    return 'a' <= ch <= 'z' or '0' <= ch <= '9'


def _tokens(text):
    """Split *text* into maximal runs of lower-case letters and digits."""
    return ''.join(ch if _alnum(ch) else ' ' for ch in text).split()


def main():
    """Prompt for the file and hashing mode, then print counts and the BoW."""
    file_name = input('File name = ')
    ft = input('Use feature hashing ? (y,Y,n,N) ')
    # Whole-answer comparison: `ft in 'yYnN'` also accepted '' and 'yY'.
    while ft not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        ft = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = ft in ('y', 'Y')
    if hashing:
        M = int(input("M = "))
    print('-------------------')
    with open(file_name) as handle:
        # Newlines at either end are dropped before anything is counted.
        text = handle.read().lower().strip('\n')
    with open('stopwords.txt') as handle:
        stop = set(_tokens(handle.read().lower()))
    L_c = text.count('\n') + 1
    word = _tokens(text)
    print('char count =', len(text) - text.count('\n'))
    print('alphanumeric count =', sum(1 for e in text if _alnum(e)))
    print('line count =', L_c)
    print('word count =', len(word))
    word = [e for e in word if e not in stop]
    if hashing:
        word = [fhash(e, M) for e in word]
    PrBow(word)


if __name__ == '__main__':
    main()
# 6330376421 (10.90) 216 (2021-03-22 14:00)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def fhash_all(a, M):
    """Return the fhash value of every word in *a*."""
    return [fhash(i, M) for i in a]


def allwords(a):
    """Return the distinct items of *a* in first-seen order."""
    return list(dict.fromkeys(a))


def countwords(inside, allwords):
    """Return [[item, occurrences in inside], ...] for each item of allwords."""
    totals = {}
    for i in inside:
        totals[i] = totals.get(i, 0) + 1
    return [[i, totals.get(i, 0)] for i in allwords]


def _is_alnum(ch):
    """Tell whether *ch* is a lower-case ASCII letter or a digit."""
    return 'a' <= ch <= 'z' or '0' <= ch <= '9'


def main():
    """Prompt for the file and hashing mode, then print counts and the BoW."""
    #input------------------
    file_name = input('File name = ')
    while True:
        feture = input("Use feature hashing ? (y,Y,n,N) ")
        if feture in ('Y', 'y', 'N', 'n'):
            break
        print('Try again.')
    # M is only needed, and only asked for, when hashing was chosen.
    M = int(input('M = ')) if feture in ('Y', 'y') else None
    print('-------------------')
    #stopword--------------
    with open('stopwords.txt', 'r') as infile:
        stopword = set(infile.read().lower().split())
    #infile-----------------
    charcount = 0
    line_count = 0
    e = []
    with open(file_name, 'r') as infile:
        for line in infile:
            line_count += 1
            d = line.lower().strip()
            charcount += len(d)
            # Anything that is not a letter or digit separates words; the
            # former punctuation blacklist let unknown symbols into words.
            e += ''.join(i if _is_alnum(i) else ' ' for i in d).split()
    alphanumeric_count = sum(len(i) for i in e)
    inside = [i for i in e if i not in stopword]
    print('char count =', charcount)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', len(e))
    if M is not None:
        inside = fhash_all(inside, M)
    #Output---------------------------------
    print('BoW =', sorted(countwords(inside, allwords(inside))))


if __name__ == '__main__':
    main()
# 6330377021 (17.90) 217 (2021-03-21 17:18)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``int(M)`` (``M`` may be a string)."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(M)


def BoW(f1):
    """Run-length encode the sorted list ``f1`` into ``[[item, count], ...]``.

    Works for empty and single-element lists as well.
    """
    new = []
    for item in f1:
        if new and new[-1][0] == item:
            new[-1][1] += 1
        else:
            new.append([item, 1])
    return new


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    File_name = input("File name = ")
    choose_BoW = input("Use feature hashing ? (y,Y,n,N) ")
    while choose_BoW not in ("y", "Y", "n", "N"):
        print("Try again.")
        choose_BoW = input("Use feature hashing ? (y,Y,n,N) ")
    k = choose_BoW in ("y", "Y")
    if k:
        M = int(input("M = "))
    print("-------------------")

    # Iterate over the file so blank lines do not end the scan early.
    with open(File_name, "r") as name:
        lines = [line.rstrip('\n') for line in name]
    with open("stopwords.txt", "r") as stop_w:
        stop_words = set(stop_w.read().split())

    # Lines are joined with a space so words never merge across a line break.
    b = ''.join(ch if ch in ALNUM else ' ' for ch in ' '.join(lines))
    c = b.split()
    print("char count =", sum(len(line) for line in lines))
    print("alphanumeric count =", sum(len(w) for w in c))
    print("line count =", len(lines))
    print("word count =", len(c))

    pre = sorted(w for w in b.lower().split() if w not in stop_words)
    if k:
        bag = BoW(sorted(fhash(w, M) for w in pre))
    else:
        bag = BoW(pre)
    print("BoW =", bag)


if __name__ == "__main__":
    main()
# 6330378721 (17.45) 218 (2021-03-21 01:09)
#-------------------------------------------------#
ALNUM = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')


def get_unique(words):
    """Return the distinct items of ``words`` in first-occurrence order."""
    unique_words = []
    seen = set()
    for item in words:
        if item not in seen:
            seen.add(item)
            unique_words.append(item)
    return unique_words


def _count_in_order(items):
    """Return ``[[item, occurrences], ...]`` in first-occurrence order."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def BoW0(ness_word):
    """Return the bag of words of the word list ``ness_word``.

    Whole words are counted (never substrings of other words).
    """
    return _count_in_order(ness_word)


def fhash(w, M):
    """Return sum(code(w[i]) * 37**i) modulo ``M``.

    ``w`` may be a string or a sequence of integer code points.
    """
    G = 37
    total = 0
    for i, ch in enumerate(w):
        code = ord(ch) if isinstance(ch, str) else ch
        total += code * G ** i
    return total % M


def change_2_num(ness_word, M=None):
    """Return the feature hash of every word in ``ness_word``.

    ``M`` falls back to the module-level ``M`` when omitted, so existing
    one-argument calls keep working.
    """
    if M is None:
        M = globals()['M']
    return [fhash(word, M) for word in ness_word]


def BoW1(c):
    """Return the bag of words of the hashed values ``c`` (whole values counted)."""
    return _count_in_order(c)
#-------------------------------------------------#


def main():
    """Read the file, prompt for the hashing mode and print the report."""
    file_name0 = input('File name = ')
    with open(file_name0, 'r') as file_name:
        lines = [line.rstrip('\n') for line in file_name]
    line_count = len(lines)
    char_count = sum(len(line) for line in lines)

    # Join with a space so the last word of a line never merges with the
    # first word of the next one.
    a = ' '.join(lines).lower()
    alpha_count = sum(1 for ch in a if ch in ALNUM)
    normal_txt = ''.join(ch if ch in ALNUM else ' ' for ch in a)
    list_o_str_words = normal_txt.split()
    word_count = len(list_o_str_words)

    with open('stopwords.txt', 'r') as file_stp_word:
        b = set(file_stp_word.read().split())
    ness_word = [w for w in list_o_str_words if w not in b]

    while True:
        choice = input('Use feature hashing ? (y,Y,n,N) ')
        if choice in ['Y', 'y', 'N', 'n']:
            break
        print('Try again.')
    use_hash = choice in ['Y', 'y']
    if use_hash:
        M = int(input('M = '))

    print('-' * 19)
    print('char count =', char_count)
    print('alphanumeric count =', alpha_count)
    print('line count =', line_count)
    print('word count =', word_count)
    if use_hash:
        print('BoW =', BoW1(change_2_num(ness_word, M)))
    else:
        print('BoW =', BoW0(ness_word))


if __name__ == '__main__':
    main()
# 6330379321 (24.00) 219 (2021-03-21 17:03)
alphabet = 'abcdefghijklmnopqrstuvwxyz'
number = '0123456789'
#================================================================


def stopword(list_of_word, stopword_file):
    """Return ``list_of_word`` without the words listed in ``stopword_file``.

    ``stopword_file`` is any iterable of text lines (e.g. an open file).
    """
    stopwords = set()
    for line in stopword_file:
        stopwords.update(line.strip().lower().split())
    return [w for w in list_of_word if w not in stopwords]


def easy_stop_word(line):
    """Replace every character of ``line`` that is not a-z / 0-9 with a space."""
    keep = alphabet + number
    return ''.join(i if i in keep else ' ' for i in line)


def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo ``M``."""
    return sum(ord(ch) * 37 ** e for e, ch in enumerate(word)) % M


def Yes(list_of_word, M):
    """Return the hashed bag of words ``[[hash, count], ...]`` sorted by hash.

    Returns ``[]`` for an empty word list.
    """
    BoW = []
    for value in sorted(fhash(w, M) for w in list_of_word):
        if BoW and BoW[-1][0] == value:
            BoW[-1][1] += 1
        else:
            BoW.append([value, 1])
    return BoW


def countword(s):
    """Return ``[[word, count], ...]`` for ``s`` in first-occurrence order."""
    counts = {}
    for t in s:
        counts[t] = counts.get(t, 0) + 1
    return [[t, c] for t, c in counts.items()]
#================================================================


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    File_name = input('File name = ')
    Y = 0
    while True:
        answer = input('Use feature hashing ? (y,Y,n,N) ')
        if answer == 'y' or answer == 'Y':
            M = int(input('M = '))
            Y = 1
            break
        elif answer == 'n' or answer == 'N':
            break
        else:
            print('Try again.')
    #==================================================================
    char = 0
    alphanumeric_count = 0
    line_count = 0
    list_of_word = []
    keep = alphabet + number
    with open(File_name, 'r') as file_name:
        for line in file_name:
            line = line.strip().lower()
            char += len(line)
            alphanumeric_count += sum(1 for i in line if i in keep)
            list_of_word.extend(easy_stop_word(line).split())
            line_count += 1
    word_count = len(list_of_word)
    with open('stopwords.txt', 'r') as stopword_file:
        list_of_word = stopword(list_of_word, stopword_file)
    if Y == 0:
        BoW = countword(list_of_word)
    else:
        BoW = Yes(list_of_word, M)
    #==================================================================
    print('-------------------')
    print('char count =', char)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330380921 (30.00) 220 (2021-03-21 17:14)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def fhash(w, M):
    """Return sum(ord(c_i) * 37**i) modulo ``M`` over ``str(w).strip()``."""
    G = 37
    ws = str(w).strip()
    return sum(ord(ch) * G ** i for i, ch in enumerate(ws)) % M


def pack_bow(text, stopwords_list):
    """Return the words of ``text`` that are not in ``stopwords_list``."""
    return [e for e in text if e not in stopwords_list]


def char(text):
    """Return the length of one line, not counting its newline if present."""
    return len(text) - 1 if '\n' in text else len(text)


def alpha(text):
    """Return how many characters of ``text`` are ASCII letters or digits."""
    return sum(1 for i in text if i in ALNUM)


def word(text):
    """Return the lowercased words of ``text`` (runs of ASCII letters/digits)."""
    cleaned = ''.join(e if e in ALNUM else ' ' for e in text)
    return [token.lower() for token in cleaned.split()]


def bow(L_list):
    """Flatten the list of lists ``L_list`` into a single list."""
    a = []
    for e in L_list:
        a += e
    return a


def no_of_bow(word, allword):
    """Return, for each item of ``word``, its number of occurrences in ``allword``."""
    counts = {}
    for item in allword:
        counts[item] = counts.get(item, 0) + 1
    return [counts.get(e, 0) for e in word]


def remove_rep(word):
    """Return the distinct items of ``word``, ordered by their last occurrence.

    The argument is left untouched.
    """
    seen = set()
    c = []
    for x in reversed(word):
        if x not in seen:
            seen.add(x)
            c.append(x)
    c.reverse()
    return c


def add_no_to_bow(word, no):
    """Pair ``word[k]`` with ``no[k]`` as ``[word, count]`` entries."""
    return [[word[k], no[k]] for k in range(len(word))]


def add_no_to_bow_hash(word, no):
    """Pair ``int(word[k])`` with ``no[k]`` as ``[hash, count]`` entries."""
    return [[int(word[k]), no[k]] for k in range(len(word))]


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    stopwords_list = []
    with open('stopwords.txt', "r") as stopword_in:
        for line in stopword_in:
            stopwords_list.extend(line.strip().split())

    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    active_fhash = fh in ('y', 'Y')
    if active_fhash:
        M_no = int(input('M = '))

    char_count = 0
    alpha_count = 0
    line_count = 0
    word_count = 0
    bow_list = []
    with open(file_name, "r") as f_in:
        for line in f_in:
            tokens = word(line)
            char_count += char(line)
            alpha_count += alpha(line)
            word_count += len(tokens)
            line_count += 1
            bow_list.append(pack_bow(tokens, stopwords_list))
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alpha_count)
    print('line count =', line_count)
    print('word count =', word_count)

    kept = bow(bow_list)
    if active_fhash:
        # Hashes stay whole integers; flattening their string form would
        # split any value >= 10 into single digits.
        hashes = [fhash(w, M_no) for w in kept]
        unique = remove_rep(hashes)
        BoW = sorted(add_no_to_bow_hash(unique, no_of_bow(unique, hashes)))
    else:
        unique = remove_rep(kept)
        BoW = sorted(add_no_to_bow(unique, no_of_bow(unique, kept)))
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330381521 (20.20) 221 (2021-03-22 23:26)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')


def feature_hashing(word, M_solution):
    """Return sum(ord(word[i]) * 37**i) modulo ``M_solution``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M_solution


def split_words(text):
    """Split already-lowercased ``text`` into maximal runs of a-z / 0-9."""
    cleaned = ''.join(ch if ch in ALNUM else ' ' for ch in text)
    return cleaned.split()


def main():
    """Prompt for the inputs, then print the statistics and the bag of words.

    Both modes share one tokenizer, so the counts do not depend on whether
    hashing is enabled.
    """
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
    while fh != 'N' and fh != 'Y':
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
    use_hash = fh == 'Y'
    if use_hash:
        M_solution = int(input("M = "))
    print('-------------------')

    with open(file_name) as source:
        file1 = source.read().lower().strip('\n')
    with open('stopwords.txt') as source:
        word_stop = set(split_words(source.read().lower()))

    word1 = split_words(file1)
    print('char count =', len(file1) - file1.count('\n'))
    print('alphanumeric count =', sum(1 for ch in file1 if ch in ALNUM))
    print('line count =', file1.count('\n') + 1)
    print('word count =', len(word1))

    kept = [w for w in word1 if w not in word_stop]
    if use_hash:
        kept = [feature_hashing(w, M_solution) for w in kept]
    counts = {}
    for item in kept:
        counts[item] = counts.get(item, 0) + 1
    BoW = sorted([item, n] for item, n in counts.items())
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330382121 (14.00) 222 (2021-03-22 23:23)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')


def flash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _tokens(text):
    """Return the lowercased words of ``text`` (runs of a-z / 0-9)."""
    lowered = text.lower()
    return ''.join(ch if ch in ALNUM else ' ' for ch in lowered).split()


def repeat(stop, file_name):
    """Return the bag of words of ``file_name`` without the words in ``stop``.

    Both arguments are file paths. The result is ``[[word, count], ...]`` in
    first-occurrence order.
    """
    with open(stop, 'r') as stop_file:
        stop_words = set(_tokens(stop_file.read()))
    with open(file_name, 'r') as text_file:
        words = _tokens(text_file.read())
    counts = {}
    for word in words:
        if word not in stop_words:
            counts[word] = counts.get(word, 0) + 1
    return [[word, n] for word, n in counts.items()]


def feature_hashing(BoW, M):
    """Merge a word bag into a hashed bag ``[[hash, count], ...]`` sorted by hash."""
    totals = {}
    for word, n in BoW:
        key = flash(word, M)
        totals[key] = totals.get(key, 0) + n
    return [[key, totals[key]] for key in sorted(totals) if totals[key] != 0]


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = a in ('y', 'Y')
    if use_hash:
        M = int(input('M = '))
    print('-------------------')
    with open(file_name, "r") as file:
        texts = file.read()
    print('char count =', len(texts) - texts.count('\n'))
    print('alphanumeric count =',
          sum(1 for ch in texts if ch.lower() in ALNUM))
    print('line count =', texts.count('\n') + 1)
    print('word count =', len(_tokens(texts)))
    bag = repeat('stopwords.txt', file_name)
    if use_hash:
        bag = feature_hashing(bag, M)
    print('BoW =', bag)


if __name__ == '__main__':
    main()
# 6330384421 (30.00) 223 (2021-03-22 20:27)
# Text statistics and bag of words with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def count_in_order(items):
    """Return ``[[item, occurrences], ...]`` in first-occurrence order."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]


def is_alnum(ch):
    """Return True when ``ch`` is an ASCII letter or digit."""
    return '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    M = 0
    file_name = input('File name = ')
    nb = input('Use feature hashing ? (y,Y,n,N) ')
    while nb not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        nb = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = nb in ['y', 'Y']
    if use_hash:
        M = int(input('M = '))
    print('-------------------')

    stop_words = set()
    with open('stopwords.txt', 'r') as stopwords:
        for line in stopwords:
            stop_words.update(line.strip().lower().split())

    charactercount = 0
    ab12 = 0
    linecount = 0
    words = []
    with open(file_name, 'r') as openfile:
        for line in openfile:
            linecount += 1
            charactercount += sum(1 for ch in line if ch != '\n')
            ab12 += sum(1 for ch in line if is_alnum(ch))
            cleaned = ''.join(ch if is_alnum(ch) else ' ' for ch in line)
            words += cleaned.strip().lower().split()
    wordcount = len(words)

    kept = [w for w in words if w not in stop_words]
    if use_hash:
        kept = [fhash(w, M) for w in kept]
    t = count_in_order(kept)

    print('char count =', charactercount)
    print('alphanumeric count =', ab12)
    print('line count =', linecount)
    print('word count =', wordcount)
    print('BoW =', t)


if __name__ == '__main__':
    main()
# 6330387321 (21.40) 224 (2021-03-22 19:38)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def alphanumeric_count(w):
    """Return how many characters of ``w`` are ASCII letters or digits."""
    return sum(1 for ch in w if ch in ALNUM)


def word_count(W):
    """Return the number of words (runs of ASCII letters/digits) in ``W``."""
    return len(''.join(ch if ch in ALNUM else ' ' for ch in W).split())


def BoW(w, u, m):
    """Return the sorted bag of words of text ``w`` as ``[[item, count], ...]``.

    Stop words are read from ``stopwords.txt``. When ``u`` is ``'y'`` or
    ``'Y'`` the words are replaced by ``fhash(word, m)`` before counting.
    """
    with open("stopwords.txt", "r") as f:
        stop_words = set(f.read().split())
    cleaned = ''.join(ch if ch in ALNUM else ' ' for ch in w.lower())
    items = sorted(t for t in cleaned.split() if t not in stop_words)
    if u in ('y', 'Y'):
        items = sorted(fhash(t, m) for t in items)
    bow = []
    for item in items:
        if bow and bow[-1][0] == item:
            bow[-1][1] += 1
        else:
            bow.append([item, 1])
    return bow


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
    while ufh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        ufh = input('Use feature hashing ? (y,Y,n,N) ')
    if ufh in ('y', 'Y'):
        M = int(input('M = '))
    else:
        M = 1
    with open(file_name, "r") as fn:
        lines = fn.readlines()
    lc = len(lines)
    raw = "".join(lines)
    cc = len(raw.replace('\n', ''))
    # Newlines become spaces (not nothing) so that words on adjacent lines
    # are not glued together.
    nf = raw.replace('\n', ' ')
    ac = alphanumeric_count(nf)
    wc = word_count(nf)
    b = BoW(nf, ufh, M)
    print('-------------------')
    print('char count = ' + str(cc))
    print('alphanumeric count = ' + str(ac))
    print('line count = ' + str(lc))
    print('word count = ' + str(wc))
    print('BoW = ' + str(b))


if __name__ == '__main__':
    main()
# 6330388021 (22.95) 225 (2021-03-22 20:38)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def count_line(file_name):
    """Return the number of lines in the file ``file_name``."""
    with open(file_name) as fn:
        return sum(1 for _ in fn)


def fn2st(file_name):
    """Return the file's content as one string with newlines turned into spaces."""
    with open(file_name) as fn:
        return ' '.join(fn.read().split('\n'))


def replace_punctuation(file_name):
    """Return the file's text with every non-alphanumeric character as a space."""
    return ''.join(e if e in ALNUM else ' ' for e in fn2st(file_name))


def remove_stopwords(file_name, stop_words):
    """Return the lowercased words of ``file_name`` not listed in file ``stop_words``."""
    s = set(fn2st(stop_words).split())
    return [e.lower() for e in replace_punctuation(file_name).split()
            if e.lower() not in s]


def BoW(t):
    """Return ``[[item, count], ...]`` for ``t`` in first-occurrence order."""
    counts = {}
    for i in t:
        counts[i] = counts.get(i, 0) + 1
    return [[i, c] for i, c in counts.items()]


def count_char(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name) as fn:
        return sum(len(i.rstrip('\n')) for i in fn)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``int(M)`` (``M`` may be a string)."""
    return sum(ord(ch) * 37 ** c for c, ch in enumerate(w)) % int(M)


def fhash_all(t, M):
    """Return ``fhash(word, M)`` for every word in ``t``."""
    return [fhash(w, M) for w in t]


def count_alb(file_name):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for i in replace_punctuation(file_name) if i != ' ')


def count_words(file_name):
    """Return the number of words in the file."""
    return len(replace_punctuation(file_name).split())
#------------------------------------------------


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    fn = None
    if fh in ['y', 'Y']:
        M = int(input('M = '))
        fn = fhash_all(remove_stopwords(file_name, 'stopwords.txt'), M)
    else:
        fn = remove_stopwords(file_name, 'stopwords.txt')
    print('-------------------')
    print('char count =', count_char(file_name))
    print('alphanumeric count =', count_alb(file_name))
    print('line count =', count_line(file_name))
    print('word count =', count_words(file_name))
    print('BoW =', BoW(fn))


if __name__ == '__main__':
    main()
# 6330389621 (22.15) 226 (2021-03-22 19:45)
#------------------------------------------------------------------#
ALNUM = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')
G = 37  # base of the polynomial feature hash


def blank1(t1):
    """Return lowercased ``t1`` reduced to its ASCII letters and digits."""
    return ''.join(k for k in t1.lower() if k in ALNUM)


def blank2(t2):
    """Return lowercased ``t2`` with newline characters removed."""
    return t2.lower().replace('\n', '')


def blank3(t3):
    """Return lowercased ``t3`` with every non-alphanumeric character as a space."""
    return ''.join(m if m in ALNUM else ' ' for m in t3.lower())


def Bow(a):
    """Return the sorted bag ``[[item, count], ...]`` of the items in ``a``."""
    message = []
    for item in sorted(a):
        if message and message[-1][0] == item:
            message[-1][1] += 1
        else:
            message.append([item, 1])
    return message


def bow(dt, sw):
    """Return the sorted bag of the words in ``dt`` that are not in ``sw``."""
    return Bow([w for w in dt if w not in sw])


def delete(dt, sw):
    """Return the distinct words of ``dt`` not in ``sw``, in first-occurrence order."""
    BoW = []
    for w in dt:
        if w not in sw and w not in BoW:
            BoW += [w]
    return BoW


def fahash(w, M):
    """Return sum(ord(w[i]) * G**i) modulo ``M``."""
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M
#------------------------------------------------------------------#


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    name = input("File name = ")
    fhash = input("Use feature hashing ? (y,Y,n,N) ")
    # Tuple membership: a substring test against "y,Y,n,N" would also accept
    # '', ',' and 'y,Y'.
    while fhash not in ("y", "Y", "n", "N"):
        print("Try again.")
        fhash = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = fhash in ("y", "Y")
    if use_hash:
        M = int(input("M = "))

    char = 0
    alphanumeric = 0
    count_line = 0
    data = []
    with open(name, "r") as file_name:
        for line in file_name:
            char += len(blank2(line))
            alphanumeric += len(blank1(line))
            count_line += 1
            data += blank3(line).split()
    with open("stopwords.txt", "r") as fn:
        stop_word = set(blank3(fn.read()).split())

    if use_hash:
        # Hash every remaining occurrence (not just distinct words) so the
        # counts reflect how often each bucket is hit.
        show = Bow([fahash(w, M) for w in data if w not in stop_word])
    else:
        show = bow(data, stop_word)
    print("-------------------")
    print("char count =", char)
    print("alphanumeric count =", alphanumeric)
    print("line count =", count_line)
    print("word count =", len(data))
    print("BoW =", show)


if __name__ == "__main__":
    main()
# 6330391821 (21.90) 227 (2021-03-21 21:01)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def alphanumeric_count(x):
    """Return how many characters of ``x`` are ASCII letters or digits."""
    return sum(1 for ch in x if ch in ALNUM)


def char_count(x):
    """Return the number of characters of ``x`` other than newlines."""
    return sum(1 for ch in x if ch != "\n")


def line_count(file_name):
    """Return the number of lines in the file, ignoring trailing blank lines."""
    with open(file_name, 'r') as a:
        z = a.readlines()
    while z and z[-1] == "\n":
        z.pop()
    return len(z)


def split_word(x):
    """Return the words (runs of ASCII letters/digits) of ``x``.

    Newline characters are dropped rather than treated as separators, so
    callers replace them with spaces first when lines must stay apart.
    """
    y = ""
    for ch in x:
        if ch in ALNUM:
            y += ch
        elif ch != "\n":
            y += " "
    return y.split()


def word_count(x):
    """Return the number of words in ``x``."""
    return len(split_word(x))


def list_word_count(x):
    """Return ``[[item, count], ...]`` for ``x`` in first-occurrence order."""
    counts = {}
    for item in x:
        counts[item] = counts.get(item, 0) + 1
    return [[item, c] for item, c in counts.items()]


def _content_words(x):
    """Return the words of ``x`` that are not listed in ``stopwords.txt``."""
    y = set()
    with open("stopwords.txt", 'r') as b:
        for line in b:
            y.update(split_word(line))
    return [w for w in split_word(x) if w not in y]


def Bag_of_word(x):
    """Return the bag of words of text ``x`` with stop words removed."""
    return list_word_count(_content_words(x))


def feature_flashing(x, m):
    """Return the bag of ``hash(word) mod int(m)`` values of text ``x``."""
    m = int(m)
    u = [sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m
         for w in _content_words(x)]
    return list_word_count(u)
#__________________________________________________


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    file_name = input("File name = ")
    feature = input("Use feature hashing ? (y,Y,n,N) ")
    while feature not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        feature = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = feature in ['y', 'Y']
    if use_hash:
        M = int(input("M = "))
    print("-------------------")
    char = 0
    alphanumeric = 0
    word = 0
    y = ""
    with open(file_name, 'r') as a:
        for line in a:
            line = line.lower()
            y += line.replace("\n", " ")
            char += char_count(line)
            alphanumeric += alphanumeric_count(line)
            word += word_count(line)
    print("char count = " + str(char))
    print("alphanumeric count = " + str(alphanumeric))
    print("line count = " + str(line_count(file_name)))
    print("word count = " + str(word))
    # Only the requested bag is computed.
    bag = feature_flashing(y, M) if use_hash else Bag_of_word(y)
    bag.sort()
    print("BoW = " + str(bag))


if __name__ == "__main__":
    main()
# 6330392421 (20.50) 228 (2021-03-22 23:05)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def blank(t):
    """Return ``t`` with non-alphanumeric characters as spaces, then stripped."""
    return ''.join(c if c in ALNUM else ' ' for c in t).strip()


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``int(M)`` (``M`` may be a string)."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(M)


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    # input from user (file_name, feature, (M?))
    file_name = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    while feature not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = feature in ['y', 'Y']
    if use_hash:
        M = int(input('M = '))

    # finding list of stop words
    stop_words = set()
    with open('stopwords.txt', 'r') as stopwin:
        for line in stopwin:
            stop_words.update(line.strip().split())

    # read info from file_name
    char_count = 0
    alphanumeric_count = 0
    line_count = 0
    word_count = 0
    lowered_words = []
    with open(file_name, 'r') as fin:
        for line in fin:
            list_of_words = blank(line).split()
            # count every character except the newline
            char_count += len(line.rstrip('\n'))
            # count lines
            line_count += 1
            # count letters and digits
            for e in list_of_words:
                lowered_words.append(e.lower())
                alphanumeric_count += len(e)
            # count words
            word_count += len(list_of_words)
    print('-------------------')
    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alphanumeric_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(word_count))

    screened = [e for e in lowered_words if e not in stop_words]
    counts = {}
    if use_hash:
        # Bag of Words with feature hashing, sorted by hash value
        for e in screened:
            key = fhash(e, M)
            counts[key] = counts.get(key, 0) + 1
        bag = sorted([key, c] for key, c in counts.items())
    else:
        # Bag of Words without feature hashing, in first-occurrence order
        for e in screened:
            counts[e] = counts.get(e, 0) + 1
        bag = [[key, c] for key, c in counts.items()]
    print('BoW = ' + str(bag))


if __name__ == '__main__':
    main()
# 6330395321 (30.00) 229 (2021-03-22 23:39)
# Text statistics and bag of words with optional feature hashing.
ALPHA = frozenset('abcdefghijklmnopqrstuvwxyz1234567890')
# Filled by main() from stopwords.txt; read by Allcount().
stopwords = []
#----------------------------------------------------------------------------


def Allcount(Filename):
    """Print the four counts for the lines of ``Filename`` and return its words.

    ``Filename`` is any iterable of text lines (e.g. an open file). The
    returned list holds the lowercased words that are not in the module-level
    ``stopwords`` list.
    """
    countline = 0
    alphabe = 0
    chartnum = 0
    tokens = []
    for line in Filename:
        chartnum += len(line.strip('\n'))
        line = line.lower()
        countline += 1
        alphabe += sum(1 for e in line if e in ALPHA)
        tokens.extend(words(line).split())
    sentence = [e for e in tokens if e not in stopwords]
    print('char count =', chartnum)
    print('alphanumeric count =', alphabe)
    print('line count =', countline)
    print('word count =', len(tokens))
    return sentence
#----------------------------------------------------------------------------


def flash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M
#----------------------------------------------------------------------------


def words(line):
    """Return ``line`` with every character outside a-z / 0-9 replaced by a space."""
    return ''.join(e if e in ALPHA else ' ' for e in line)
#----------------------------------------------------------------------------


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    with open('stopwords.txt', 'r') as st:
        for line in st:
            stopwords.extend(line.split())
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    # Tuple membership: a substring test against 'y,Y,n,N' would also accept
    # '' and ','.
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = fh.lower() == 'y'
    if use_hash:
        M = int(input('M = '))
    # The separator is printed in both modes.
    print('-------------------')
    with open(file_name, 'r') as Filename:
        sentence = Allcount(Filename)
    items = [flash(word, M) for word in sentence] if use_hash else sentence
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    print('BoW =', sorted([item, n] for item, n in counts.items()))


if __name__ == '__main__':
    main()
# 6330396021 (18.00) 230 (2021-03-22 21:21)
# Text statistics and bag of words with optional feature hashing.
ALNUM = frozenset(
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')


def _read_text(file_name):
    """Return the whole content of ``file_name`` as one string."""
    with open(file_name, "r") as fin:
        return fin.read()


def _count_sorted(items):
    """Run-length encode the sorted list ``items`` into ``[[item, count], ...]``."""
    bag = []
    for item in items:
        if bag and bag[-1][0] == item:
            bag[-1][1] += 1
        else:
            bag.append([item, 1])
    return bag


def _content_words(file_name):
    """Return the sorted words of ``file_name`` not listed in ``stopwords.txt``."""
    list_stopwords = set(_read_text("stopwords.txt").strip().lower().split())
    return sorted(q for q in list_only_word(_read_text(file_name)).split()
                  if q not in list_stopwords)


def list_only_word(str_data):
    """Return ``str_data`` lowercased with non-alphanumerics as spaces, stripped."""
    a = ''.join(ch if ch in ALNUM else ' ' for ch in str_data)
    return a.lower().strip()


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name, "r") as fin:
        return sum(len(line.rstrip('\n')) for line in fin)


def alphanumeric_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    return sum(1 for ch in _read_text(file_name) if ch in ALNUM)


def line_count(file_name):
    """Return the number of lines in the file (0 for an empty file)."""
    with open(file_name, "r") as fin:
        return sum(1 for _ in fin)


def word_count(file_name):
    """Return the number of words in the file."""
    return len(list_only_word(_read_text(file_name)).split())


def normal_BoW(file_name):
    """Return the sorted bag of words of the file with stop words removed."""
    return _count_sorted(_content_words(file_name))


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def num_BoW(file_name, M):
    """Return the sorted bag of ``fhash(word, M)`` values of the file."""
    return _count_sorted(sorted(fhash(w, M) for w in _content_words(file_name)))


def main():
    """Prompt for the inputs, then print the statistics and the bag of words."""
    file_name = input("File name = ")
    yes_no = input("Use feature hashing ? (y,Y,n,N) ")
    while yes_no not in ("n", "N", "y", "Y"):
        print("Try again.")
        yes_no = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = yes_no in ("y", "Y")
    if use_hash:
        M = int(input("M = "))
    print("-------------------")
    print("char count =", char_count(file_name))
    print("alphanumeric count =", alphanumeric_count(file_name))
    print("line count =", line_count(file_name))
    print("word count =", word_count(file_name))
    if use_hash:
        print("BoW =", num_BoW(file_name, M))
    else:
        print("BoW =", normal_BoW(file_name))


if __name__ == "__main__":
    main()
# 6330397621 (21.40) 231 (2021-03-21 21:13)


# fhash function
def fhash(string, M):
    """Return sum(ord(string[i]) * 37**i) modulo M.

    M == 0 means "hashing disabled": the string is returned unchanged.
    """
    if M == 0:
        return string
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(string)) % M


def load_stop_words(path='stopwords.txt'):
    """Return the whitespace-separated stop words stored in path."""
    with open(path) as f:
        return f.read().split()


def analyze(lines, stop_words, M):
    """Compute the report values for an iterable of text lines.

    Returns (char, alp, line, words, bow) where bow is the sorted list
    [[key, count], ...]; key is the word itself when M == 0 and
    fhash(word, M) otherwise.  Lines are tokenised one by one so that the
    last word of a line is never glued to the first word of the next one.
    """
    stop = set(stop_words)
    char = alp = line = words = 0
    counts = {}
    for raw in lines:
        line += 1
        text = raw.rstrip('\n').lower()
        char += len(text)
        alp += sum(1 for c in text if c.isalpha() or c.isdigit())
        tokens = "".join(c if c.isalnum() else " " for c in text).split()
        words += len(tokens)
        for w in tokens:
            if w not in stop:
                key = fhash(w, M)
                counts[key] = counts.get(key, 0) + 1
    bow = [[key, counts[key]] for key in sorted(counts)]
    return char, alp, line, words, bow


# Calculate parameter and Bag of word function
def BoW(file_name, M, stop_words=None):
    """Print the statistics and bag of words of file_name.

    stop_words defaults to the content of stopwords.txt.
    """
    if stop_words is None:
        stop_words = load_stop_words()
    with open(file_name) as f:
        char, alp, line, words, bow = analyze(f, stop_words, M)
    print('-------------------')
    print('char count =', char)
    print('alphanumeric count =', alp)
    print('line count =', line)
    print('word count =', words)
    print('BoW =', bow)


def main():
    """Read the user's choices and print the report."""
    # Open stop words
    stop_words = load_stop_words()
    # User input
    file_name = input("File name = ")
    while True:
        use_M = input('Use feature hashing ? (y,Y,n,N) ')
        if use_M in ['y', 'Y']:
            M = int(input('M = '))
            break
        elif use_M in ['n', 'N']:
            M = 0
            break
        else:
            print('Try again.')
    BoW(file_name, M, stop_words)


if __name__ == "__main__":
    main()
# 6330398221 (30.00) 232 (2021-03-21 19:16)

COLLECT = '0123456789abcdefghijklmnopqrstuvwxyz'


def flash(w, M):
    """Return the feature hash of w: sum(ord(w[n]) * 37**n) modulo M."""
    return sum(ord(c) * 37 ** n for n, c in enumerate(w)) % M


def trim_trailing_blank_lines(lines):
    """Return a copy of lines without trailing '' or '\\n' entries."""
    kept = list(lines)
    while kept and kept[-1] in ('', '\n'):
        kept.pop()
    return kept


def tokenize(text):
    """Return the lower-cased runs of 0-9 / a-z found in text."""
    lowered = text.lower()
    return ''.join(ch if ch in COLLECT else ' ' for ch in lowered).split()


def count_sorted(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input('File name = ')
    use = input('Use feature hashing ? (y,Y,n,N) ')
    while use != 'Y' and use != 'y' and use != 'N' and use != 'n':
        print('Try again.')
        use = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = use == 'Y' or use == 'y'
    if hashing:
        M = int(input('M = '))
    # -----------------------------------
    with open('stopwords.txt', 'r') as stop:
        stopwords = stop.read().split()
    # -----------------------------------
    with open(file_name, 'r') as file:
        all_lines = file.readlines()
    # Trailing blank lines count neither as lines nor as characters.
    lines = trim_trailing_blank_lines(all_lines)
    n_char = sum(len(ln) - 1 if ln[-1] == '\n' else len(ln) for ln in lines)
    print('-' * 19)
    print('char count = ' + str(n_char))
    # -----------------------------------
    list_word = tokenize(''.join(all_lines))
    print('alphanumeric count = ' + str(sum(len(w) for w in list_word)))
    print('line count = ' + str(len(lines)))
    print('word count = ' + str(len(list_word)))
    # -----------------------------------
    word_cut = [w for w in list_word if w not in stopwords]
    if hashing:
        BoW = count_sorted(flash(w, M) for w in word_cut)
    else:
        BoW = count_sorted(word_cut)
    print('BoW = ' + str(BoW))


if __name__ == "__main__":
    main()
# 6330399921 (22.34) 233 (2021-03-22 01:23)

WORD_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def file_to_lowerstr(file):
    """Join the lines of file into one lower-cased string.

    A trailing newline of a line is replaced by a space.  Returns
    [text, number_of_lines].
    """
    txt = ''
    c = 0
    for i in file:
        if i[-1] == '\n':
            txt += i[:-1] + ' '
        else:
            txt += i
        c += 1
    return [txt.lower(), c]


def del_punc(txt):
    """Split txt into words made only of letters and digits.

    Every other character (all punctuation, not just a fixed short list)
    acts as a separator.
    """
    return ''.join(ch if ch in WORD_CHARS else ' ' for ch in txt).split()


def del_stp(lis, stp):
    """Return the words of lis that are not in stp, order preserved."""
    return [word for word in lis if word not in stp]


def count_wrd(lis):
    """Return [[word, count], ...] in order of first occurrence."""
    order = []
    counts = {}
    for word in lis:
        if word not in counts:
            order.append(word)
            counts[word] = 0
        counts[word] += 1
    return [[word, counts[word]] for word in order]


def fhash(wrd, m):
    """Return sum(ord(wrd[i]) * 37**i) modulo int(m)."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(wrd)) % int(m)


def act_fh(lis, m):
    """Hash the words of a [[word, count], ...] list and merge equal hashes.

    Returns [[hash, summed_count], ...] in order of first occurrence.
    """
    order = []
    totals = {}
    for word, amount in lis:
        key = fhash(word, m)
        if key not in totals:
            order.append(key)
            totals[key] = 0
        totals[key] += amount
    return [[key, totals[key]] for key in order]


def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input('File name = ')
    with open(file_name, 'r') as file:
        lines = file.readlines()
    with open('stopwords.txt', 'r') as stp:
        stp_lis = file_to_lowerstr(stp)[0].split()
    txt, line_total = file_to_lowerstr(lines)
    words = del_punc(txt)
    Bow = count_wrd(del_stp(words, stp_lis))
    while True:
        check = input('Use feature hashing ? (y,Y,n,N) ')
        # Whole-answer comparison: `check in 'yY'` is a substring test and
        # would accept an empty answer.
        if check in ('y', 'Y'):
            m = input('M = ')
            Bow = act_fh(Bow, m)
            break
        if check in ('n', 'N'):
            break
        print('Try again.')
    print('-------------------')
    # Characters are counted on the raw lines, without their newline.
    print('char count =', sum(len(ln.rstrip('\n')) for ln in lines))
    print('alphanumeric count =', sum(len(word) for word in words))
    print('line count =', line_total)
    print('word count =', len(words))
    print('BoW =', sorted(Bow))


if __name__ == "__main__":
    main()
# 6330400821 (19.65) 234 (2021-03-21 12:40)

ALNUM = "qwertyuiopasdfghjklzxcvbnm0123456789"
G = 37


def flash(string):
    """Return the code point of a single character."""
    return ord(string)


def hlash(su_m, M_putin):
    """Return su_m modulo int(M_putin)."""
    return su_m % int(M_putin)


def tokenize(line):
    """Return the lower-cased alphanumeric words found in line."""
    return "".join(ch if ch in ALNUM else " " for ch in line.lower()).split()


def word_hash(word, M_putin):
    """Return the feature hash sum(ord(word[p]) * G**p) modulo M_putin."""
    return hlash(sum(flash(ch) * G ** p for p, ch in enumerate(word)), M_putin)


def bag_of_words(words, stop_words, M_putin=None):
    """Return the sorted [[key, count], ...] list for words.

    Stop words are dropped by exact match (not by substring).  Keys are
    the words themselves, or their feature hashes when M_putin is given.
    """
    stop = set(stop_words)
    counts = {}
    for word in words:
        if word in stop:
            continue
        key = word if M_putin is None else word_hash(word, M_putin)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input("File name = ")
    choice = input("Use feature hashing ? (y,Y,n,N) ")
    while choice not in ("y", "Y", "n", "N"):
        print("Try again.")
        choice = input("Use feature hashing ? (y,Y,n,N) ")
    M_putin = input("M = ") if choice in ("y", "Y") else None
    print("-------------------")

    with open("stopwords.txt", "r") as stop_word:
        stop_word_list = stop_word.read().lower().split()
    with open(file_name, "r") as my_file:
        lines = my_file.readlines()

    # Only the newline is excluded from the character count.
    sum_len = sum(len(line.rstrip("\n")) for line in lines)
    alphanumeric_count = sum(
        1 for line in lines for ch in line if ch.lower() in ALNUM
    )
    list_ti = [word for line in lines for word in tokenize(line)]

    print("char count = " + str(sum_len))
    print("alphanumeric count = " + str(alphanumeric_count))
    print("line count = " + str(len(lines)))
    print("word count = " + str(len(list_ti)))
    print("BoW = " + str(bag_of_words(list_ti, stop_word_list, M_putin)))


if __name__ == "__main__":
    main()
# 6330401421 (22.00) 235 (2021-03-22 23:28)
# -------------------------------------------------------
def inp():
    """Prompt for file name, hashing choice and M.

    Returns (file_name, 'y' or 'n', M); M stays 1 when hashing is off.
    """
    M = 1
    file_name = input("File name = ")
    enable_fhash = input("Use feature hashing ? (y,Y,n,N) ")
    # Whole-answer comparison: `not in 'yn'` is a substring test and would
    # accept "" or "yn".
    while enable_fhash.lower() not in ('y', 'n'):
        print("Try again.")
        enable_fhash = input("Use feature hashing ? (y,Y,n,N) ")
    if enable_fhash.lower() == 'y':
        M = int(input("M = "))
    return file_name, enable_fhash.lower(), M


# -------------------------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


# -------------------------------------------------------
def stopwords():
    """Return the lower-cased words listed in stopwords.txt."""
    with open('stopwords.txt', 'r') as file:
        return [word.lower() for line in file for word in line.split()]


# -------------------------------------------------------
def is_alnum(ch):
    """Return True for an ASCII letter or digit."""
    return ('a' <= ch <= 'z') or ('A' <= ch <= 'Z') or ('0' <= ch <= '9')


def scan_lines(lines):
    """Return (charcount, alnucount, linecount, wordcount, words) for lines.

    Characters are counted without the newline; words are lower-cased runs
    of ASCII letters/digits, including a word that ends its line.
    """
    charcount = alnucount = linecount = 0
    words = []
    for line in lines:
        linecount += 1
        body = line.rstrip('\n')
        charcount += len(body)
        alnucount += sum(1 for ch in body if is_alnum(ch))
        cleaned = ''.join(ch.lower() if is_alnum(ch) else ' ' for ch in body)
        words.extend(cleaned.split())
    return charcount, alnucount, linecount, len(words), words


def filereader(fname, fhash, M):
    """Read fname and return the statistics computed by scan_lines.

    fhash and M are accepted for call compatibility and are not used.
    """
    with open(fname) as file:
        return scan_lines(file)


# -------------------------------------------------------
def count_sorted(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def bow(words, stopwords, M, fh):
    """Return the sorted bag of words of words without stopwords.

    fh == 'n' keys the bag by word, fh == 'y' by fhash(word, M); any other
    fh yields an empty list.
    """
    kept = [word for word in words if word not in stopwords]
    if fh == 'y':
        return count_sorted(fhash(word, M) for word in kept)
    if fh == 'n':
        return count_sorted(kept)
    return []


# -------------------------------------------------------
def main():
    """Run the prompts and print the report."""
    fname, fhsh, M = inp()
    cc, ac, lc, wc, words = filereader(fname, fhash, M)
    print('-' * 19)
    print('char count =', cc)
    print('alphanumeric count =', ac)
    print('line count =', lc)
    print('word count =', wc)
    print('BoW =', bow(words, stopwords(), M, fhsh))


if __name__ == "__main__":
    main()
# 6330402021 (23.13) 236 (2021-03-21 23:25)
def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % m


def alpha_words(line):
    """Return the lower-cased runs of alphabetic characters in line."""
    return "".join(a.lower() if a.isalpha() else " " for a in line).split()


def alnum_words(line):
    """Return the lower-cased runs of alphanumeric characters in line."""
    return "".join(a.lower() if a.isalnum() else " " for a in line).split()


def count_sorted(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


# -------------------------------------------------------------------------------------
def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input("File name = ")
    hashing = input("Use feature hashing ? (y,Y,n,N) ")
    while hashing not in ["Y", "y", "n", "N"]:
        print("Try again.")
        hashing = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = hashing in ["Y", "y"]
    if use_hash:
        m = int(input("M = "))
    print("-------------------")

    # The stop-word list lives in "stopwords.txt" (plural).
    with open("stopwords.txt", "r") as stopword:
        sw = set(word for line in stopword for word in alpha_words(line))

    line = 0
    char = 0
    alnum = 0
    word_char = []
    with open(file_name, "r") as file:
        for i in file:
            line += 1
            # A line's own newline is not a character of the text.
            char += len(i) - 1 if i.endswith("\n") else len(i)
            alnum += sum(1 for a in i if a.isalnum())
            word_char.extend(alnum_words(i))

    # Passing label and value separately already inserts one space.
    print("char count =", char)
    print("alphanumeric count =", alnum)
    print("line count =", line)
    print("word count =", len(word_char))

    word_clear = [w for w in word_char if w not in sw]
    if use_hash:
        print("BoW =", count_sorted(fhash(w, m) for w in word_clear))
    else:
        print("BoW =", count_sorted(word_clear))


if __name__ == "__main__":
    main()
# 6330403721 (25.00) 237 (2021-03-20 19:15)
# ---------------------------------------------------------------
def file_to_calw(file_name):
    """Print the text statistics of an open file and return its words.

    file_name is an iterable of lines (e.g. an open file).  The returned
    list holds the lower-cased runs of a-z / 0-9 in reading order.
    """
    words = ''.join(file_name).lower()
    newlines = words.count('\n')
    text = ''.join(
        e if 'a' <= e <= 'z' or '0' <= e <= '9' else ' ' for e in words
    ).split()
    # A final line without a newline still counts as a line; a trailing
    # newline does not open an extra, empty one.
    lines = newlines + (1 if words and not words.endswith('\n') else 0)
    print('char count =', len(words) - newlines)
    print('alphanumeric count =', len(''.join(text)))
    print('line count =', lines)
    print('word count =', len(text))
    return text


def _stopwords(text):
    """Return the words of text that are not listed in stopwords.txt."""
    with open('stopwords.txt', 'r') as stop:
        stop_list = set(stop.read().split())
    return [word for word in text if word not in stop_list]


def get_unique(text):
    """Sort text in place and return its distinct items in sorted order."""
    text.sort()
    unique_text = []
    for item in text:
        # Compare with the last item kept, never with text[-1]: the old
        # wrap-around check lost the word when all items were equal.
        if not unique_text or unique_text[-1] != item:
            unique_text.append(item)
    return unique_text


def bow_1(unique_text, fi_text):
    """Return [[word, occurrences in fi_text], ...] for each unique word."""
    return [[word, fi_text.count(word)] for word in unique_text]


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def bow_fhash(bow, modulus=None):
    """Hash the words of bow and merge the counts of equal hashes.

    modulus defaults to the module-level variable m when it is omitted.
    Returns [[hash, count], ...] sorted by hash.
    """
    if modulus is None:
        modulus = m
    totals = {}
    for word, amount in bow:
        key = fhash(word, modulus)
        totals[key] = totals.get(key, 0) + amount
    return [[key, totals[key]] for key in sorted(totals)]


# ---------------------------------------------------------------
def main():
    """Ask for the file and hashing mode, then print the report."""
    file = input("File name = ")
    yn = input("Use feature hashing ? (y,Y,n,N) ")
    while yn != 'y' and yn != 'Y' and yn != 'n' and yn != 'N':
        print("Try again.")
        yn = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = yn == 'y' or yn == 'Y'
    if use_hash:
        modulus = int(input('M = '))
    print('-------------------')
    with open(file, 'r') as file_name:
        text = file_to_calw(file_name)
    fi_text = _stopwords(text)
    unique_text = get_unique(fi_text)
    bow1 = bow_1(unique_text, fi_text)
    if use_hash:
        print('BoW =', bow_fhash(bow1, modulus))
    else:
        print('BoW =', bow1)


if __name__ == "__main__":
    main()
# 6330404321 (23.00) 238 (2021-03-22 00:48)
def load_stop_words(path="stopwords.txt"):
    """Return the whitespace-separated words stored in path."""
    with open(path, 'r') as stop_file:
        return stop_file.read().split()


def split_words(line):
    """Return the lower-cased alphanumeric words of line.

    Lower-casing makes "The" and "the" the same bag-of-words entry and
    lets capitalised stop words match the stop-word list.
    """
    return "".join(ch.lower() if ch.isalnum() else " " for ch in line).split()


def hash_word(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(c) * (37 ** i) for i, c in enumerate(w)) % m


def count_sorted(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Ask for the file and hashing mode, then print the report."""
    stop_words = set(load_stop_words())

    num_lines = 0
    num_words = 0
    num_chars = 0
    num_alpha_numeric = 0

    filename = input("File name = ")
    bag_of_words = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip('\n')
            num_lines += 1
            num_chars += len(line)
            num_alpha_numeric += sum(char.isalnum() for char in line)
            words = split_words(line)
            num_words += len(words)
            bag_of_words.extend(w for w in words if w not in stop_words)

    while True:
        do_hash = input("Use feature hashing ? (y,Y,n,N) ")
        if do_hash == 'y' or do_hash == 'Y':
            m = int(input("M = "))
            bag_of_words = [hash_word(w, m) for w in bag_of_words]
            break
        elif do_hash == 'n' or do_hash == 'N':
            break
        else:
            print("Try again.")

    print("-------------------")
    print("char count =", num_chars)
    print("alphanumeric count =", num_alpha_numeric)
    print("line count =", num_lines)
    print("word count =", num_words)
    print("BoW =", count_sorted(bag_of_words))


if __name__ == "__main__":
    main()
# 6330405021 (29.00) 239 (2021-03-22 22:17)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
KEPT = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '


def fhash(x, y):
    """Return sum(ord(x[i]) * 37**i) modulo y."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(x)) % y


def tokenize(line):
    """Return the lower-cased words of line; non-alphanumerics separate."""
    return ''.join(ch.lower() if ch in KEPT else ' ' for ch in line).split()


def count_sorted(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def trim_trailing_blank_lines(lines):
    """Return a copy of lines without trailing '\\n'-only entries."""
    kept = list(lines)
    while kept and kept[-1] == '\n':
        kept.pop()
    return kept


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input('File name = ')
    with open(file_name, 'r', encoding='utf-8') as f1:
        lines = f1.readlines()
    with open('stopwords.txt', 'r') as sw:
        swl = sw.read().split()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    while True:
        use = input('Use feature hashing ? (y,Y,n,N) ')
        # Whole-answer comparison: `use in 'yYnN'` is a substring test and
        # would accept "yY", "Yn", ...
        if use in ('y', 'Y', 'n', 'N'):
            break
        print('Try again.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    cc, ac, wc, a4w = 0, 0, 0, []
    for line in lines:
        cc += len(line) - int(line[-1] == '\n')
        words = tokenize(line)
        ac += sum(len(w) for w in words)
        wc += len(words)
        a4w += [w for w in words if w not in swl]
    line_total = len(trim_trailing_blank_lines(lines))
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if use in ('y', 'Y'):
        M = int(input('M = '))
        BoW = count_sorted(fhash(w, M) for w in a4w)
    else:
        BoW = count_sorted(a4w)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    print('-' * 19)
    print('char count =', cc)
    print('alphanumeric count =', ac)
    print('line count =', line_total)
    print('word count =', wc)
    print('BoW =', BoW)


if __name__ == "__main__":
    main()
# 6330406621 (21.40) 240 (2021-03-22 01:28)

ALNUM = "1234567890abcdefghijklmnopqrstuvwxyz"


def tokenize(line):
    """Return the lower-cased alphanumeric words found in line."""
    return "".join(ch if ch in ALNUM else " " for ch in line.lower()).split()


def collect_words(lines):
    """Return the words of all lines, tokenising each line on its own.

    Working line by line keeps the last word of a line apart from the
    first word of the following line.
    """
    return [word for line in lines for word in tokenize(line)]


def fhash(word, M):
    """Return sum(ord(word[t]) * 37**t) modulo M."""
    g = 37
    return sum(ord(ch) * (g ** t) for t, ch in enumerate(word)) % M


def count_sorted(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Ask for the file and hashing mode, then print the report."""
    a = input("File name = ")
    b = input("Use feature hashing ? (y,Y,n,N) ")
    with open("stopwords.txt", "r") as read:
        sw = set(read.read().split())
    while b != "n" and b != "N" and b != "y" and b != "Y":
        print("Try again.")
        b = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = b == "y" or b == "Y"
    if use_hash:
        M = int(input("M = "))

    with open(a, "r") as file_name:
        lines = file_name.readlines()
    # Only the newline is left out of the character count.
    ccc = sum(len(line.rstrip("\n")) for line in lines)
    wn = collect_words(lines)
    wf = [word for word in wn if word not in sw]
    if use_hash:
        xl = count_sorted(fhash(word, M) for word in wf)
    else:
        xl = count_sorted(wf)

    print("-------------------")
    print("char count = " + str(ccc))
    print("alphanumeric count = " + str(sum(len(word) for word in wn)))
    print("line count = " + str(len(lines)))
    print("word count = " + str(len(wn)))
    print("BoW =", xl)


if __name__ == "__main__":
    main()
# 6330407221 (30.00) 241 (2021-03-21 21:14)

# feature hashing
def fhash(a, M):
    """Return sum(ord(a[i]) * 37**i) modulo M."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(a)) % M


def is_alnum(ch):
    """Return True for an ASCII letter or digit."""
    return ('a' <= ch <= 'z') or ('A' <= ch <= 'Z') or ('0' <= ch <= '9')


def split_words(line):
    """Return the alphanumeric words of line, original case kept."""
    return ''.join(ch if is_alnum(ch) else ' ' for ch in line).split()


def count_sorted(items):
    """Return [[item, occurrences], ...] sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input('File name = ')
    with open(file_name, 'r') as text_file:
        lines = text_file.readlines()
    usehashing = input('Use feature hashing ? (y,Y,n,N) ')
    while usehashing not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        usehashing = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = usehashing in ['y', 'Y']
    if use_hash:
        Numberofbow = int(input('M = '))

    # stopwords
    with open('stopwords.txt', 'r') as stop_file:
        stop_words = stop_file.read().split()
    print('-------------------')

    # character_count
    charrr_count = sum(len(i) - 1 if '\n' in i else len(i) for i in lines)
    print('char count =', charrr_count)

    # number and eng_character
    count = sum(1 for line in lines for ch in line if is_alnum(ch))
    print('alphanumeric count', '=', count)

    # line_count
    print('line count', '=', len(lines))

    # wordcount
    words = [word for line in lines for word in split_words(line)]
    print('word count', '=', len(words))

    # bow
    kept = [word.lower() for word in words]
    kept = [word for word in kept if word not in stop_words]
    if use_hash:
        print('BoW', '=', count_sorted(fhash(w, Numberofbow) for w in kept))
    else:
        print('BoW =', count_sorted(kept))


if __name__ == "__main__":
    main()
# 6330408921 (30.00) 242 (2021-03-21 19:29)
def BoW(list_str):
    """Return [[item, occurrences], ...] sorted by item.

    list_str itself is left untouched.
    """
    counts = {}
    for s in list_str:
        counts[s] = counts.get(s, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def fhash(w, M):
    """Return sum(ord(w[n]) * 37**n) modulo int(M)."""
    return sum(ord(s) * (37 ** n) for n, s in enumerate(w)) % int(M)


def tokenize(text):
    """Return the lower-cased runs of a-z / 0-9 found in text."""
    lowered = text.lower()
    return ''.join(
        s if '0' <= s <= '9' or 'a' <= s <= 'z' else ' ' for s in lowered
    ).split()


def trim_trailing_blank_lines(lines):
    """Return a copy of lines without trailing '\\n'-only entries."""
    kept = list(lines)
    while kept and kept[-1] == '\n':
        kept.pop()
    return kept


def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input('File name = ')
    with open(file_name, 'r') as file:
        lines = file.readlines()
    with open('stopwords.txt', 'r') as st:
        StopFile = st.read().split()

    char_count = sum(1 for line in lines for s in line if s != '\n')
    line_count = len(trim_trailing_blank_lines(lines))
    StrFileNoEtc = [word for line in lines for word in tokenize(line)]
    StrFileNoStop = [word for word in StrFileNoEtc if word not in StopFile]

    x = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while x != 'y' and x != 'n':
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if x == 'y':
        m = input('M = ')
        bag = BoW([fhash(s, m) for s in StrFileNoStop])
    else:
        bag = BoW(StrFileNoStop)

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', len(''.join(StrFileNoEtc)))
    print('line count =', line_count)
    print('word count =', len(StrFileNoEtc))
    print('BoW =', bag)


if __name__ == "__main__":
    main()
# 6330409521 (0.00) 243 (2021-03-22 22:08)

ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % int(M)


def count(data, element):
    """Return how many items of data equal element."""
    return sum(1 for e in data if e == element)


def split_words(line):
    """Return the lower-cased alphanumeric words found in line."""
    return ''.join(ch if ch in ALNUM else ' ' for ch in line.lower()).split()


def build_bow(items):
    """Return [[item, occurrences], ...] sorted by item."""
    items = list(items)
    return [[item, count(items, item)] for item in sorted(set(items))]


def main():
    """Ask for the file and hashing mode, then print the report."""
    file_name = input('File name = ')
    while True:
        mode = input('Use feature hashing ? (y,Y,n,N) ')
        if mode not in ['y', 'Y', 'n', 'N']:
            print('Try again.')
        elif mode == 'y' or mode == 'Y':
            M = input('M = ')
            break
        else:
            M = -1
            break
    print('-------------------')

    with open('stopwords.txt', 'r') as fle:
        stw2 = set(fle.read().lower().split())
    # The statistics are taken from the file the user named.
    with open(file_name, 'r') as fle:
        lines = fle.readlines()

    y = sum(len(line.rstrip('\n')) for line in lines)
    p = sum(1 for line in lines for ch in line.lower() if ch in ALNUM)
    q = len(lines)
    copy1 = [word for line in lines for word in split_words(line)]
    v = [word for word in copy1 if word not in stw2]

    if mode == 'y' or mode == 'Y':
        bow = build_bow(fhash(word, M) for word in v)
    else:
        bow = build_bow(v)

    print('char count = ' + str(y))
    print('alphanumeric count = ' + str(p))
    print('line count = ' + str(q))
    print('word count = ' + str(len(copy1)))
    print('BoW = ' + str(bow))


if __name__ == "__main__":
    main()
# 6330410021 (22.58) 244 (2021-03-21 22:55)

WORD_CHARS = 'abcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def clear(x):
    """Return x lower-cased with every non-alphanumeric char as a space."""
    return ''.join(ch if ch in WORD_CHARS else ' ' for ch in x.lower())


def yakword(x):
    """Read file x and return [words, chars, alnum_chars, n_words, n_lines].

    chars excludes the newline of each line, and nothing is subtracted
    for a final line that has no newline.
    """
    e = []
    allc = 0
    alphac = 0
    nline = 0
    with open(x, 'r') as r:
        for l in r:
            nline += 1
            allc += len(l) - (1 if l.endswith('\n') else 0)
            for word in clear(l).split():
                e.append(word)
                alphac += len(word)
    return [e, allc, alphac, len(e), nline]


def bow1(word, stpw=None):
    """Return the sorted [[word, count], ...] list without stop words.

    stpw defaults to the words read from stopwords.txt.
    """
    if stpw is None:
        stpw = yakword('stopwords.txt')[0]
    stop = set(stpw)
    counts = {}
    for i in word:
        if i not in stop:
            counts[i] = counts.get(i, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def bow2(word, m, stpw=None):
    """Return the sorted [[hash, count], ...] list; equal hashes merge."""
    totals = {}
    for text, amount in bow1(word, stpw):
        key = fhash(text, m)
        totals[key] = totals.get(key, 0) + amount
    return [[key, totals[key]] for key in sorted(totals)]


def main():
    """Ask for the file and hashing mode, then print the report."""
    yn = ['y', 'Y', 'n', 'N']
    x = input('File name = ')
    y = input('Use feature hashing ? (y,Y,n,N) ')
    while y not in yn:
        print('Try again.')
        y = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = y == 'y' or y == 'Y'
    if use_hash:
        m = int(input('M = '))
    e = yakword(x)
    print('-------------------')
    print('char count =', e[1])
    print('alphanumeric count =', e[2])
    print('line count =', e[4])
    print('word count =', e[3])
    if use_hash:
        print('BoW =', bow2(e[0], m))
    else:
        print('BoW =', bow1(e[0]))


if __name__ == "__main__":
    main()
# 6330411721 (27.00) 245 (2021-03-21 16:15) def makelist(file): a = lowerb(file) a = a.split() return a def lowerb(file): a = '' infile = open(file,'r') for line in infile: for e in line: if 'A' <= e <= 'Z' or 'a' <= e <= 'z' or '0' <= e <= '9': a += e elif e == ' ': a += ' ' elif e == '\n': a += ' ' else: a += ' ' a = a.lower() return a def removestop(words,stop): output = [] for e in words: if e in stop: pass else: output.append(e) return output def charc(file): isBlankFile = True charC = 0 for line in file: isBlankFile = False charC += len(line) - 1 if not isBlankFile: charC += 1 return charC def alph(words): w = ''.join(words) c = len(w) return c def line(file): c=0 for k in file: c += 1 return c def wordc(words): l = len(words) return l def bow(data): temp = [] BoW = [] for word in data: if word not in temp: BoW.append([word,data.count(word)]) temp.append(word) return BoW def fhash(data,M): M = int(M) fhNum = [] for word in data: n = 0 i = 0 for e in word: n += ord(e)*((37)**i) i += 1 fhNum.append(n%M) temp = [] BoW = [] for num in fhNum: if num not in temp: BoW.append([num,fhNum.count(num)]) temp.append(num) BoW.sort() return BoW #--------------------------------------------------------------------------------- filename = input('File name = ') fh = input('Use feature hashing ? (y,Y,n,N) ') while fh not in 'yYnN': print('Try again.') fh = input('Use feature hashing ? 
(y,Y,n,N) ') data = makelist(filename) stop = makelist('stopwords.txt') dataNoStop = removestop(data,stop) if fh == 'n' or fh == 'N' : print('-------------------') filen = open(filename,'r') print('char count =', charc(filen)) filen.close() print('alphanumeric count =',alph(data)) filen = open(filename,'r') print('line count =',line(filen)) filen.close() print('word count =',wordc(data)) print('BoW =',bow(dataNoStop)) elif fh == 'y' or fh == 'Y' : m = input('M = ') print('-------------------') filen = open(filename,'r') print('char count =', charc(filen)) filen.close() print('alphanumeric count =',alph(data)) filen = open(filename,'r') print('line count =',line(filen)) filen.close() print('word count =',wordc(data)) print('BoW =',fhash(dataNoStop,m))
# 6330412321 (30.00) 246 (2021-03-22 21:13) file_name = input('File name = ') def char_count(a): a = open(file_name,'r') cc = 0 for line in a: cc += int(len(line.strip())) a.close() return cc def alphanumeric_count(a): a = open(file_name,'r') alp = 0 for line in a: for e in line: if '0' <= e <= '9' or 'A' <= e <= 'Z' or 'a' <= e <= 'z': alp += 1 a.close() return alp def line_count(a): a = open(file_name,'r') lc = 0 for line in a: lc += 1 a.close() return lc def word_count(a): a = open(file_name,'r') k = '' wc = 0 for line in a: for e in line: if (e not in 'abcdefghijklmnopqrstuvwxyz') and (e not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') and(e not in '0123456789'): k += ' ' else: k += e words = k.split() wc += len(words) a.close() return wc def BoW(a): a = open(file_name, "r") stop_words = open("stopwords.txt","r") new = '' for line in a: line = line.lower() for e in line: if (e in 'abcdefghijklmnopqrstuvwxyz') or (e in '0123456789'): new += e else: new += ' ' new1 = new.split(' ') sw = '' for line in stop_words: line = line.lower() for e in line: if (e in 'abcdefghijklmnopqrstuvwxyz') or (e in '0123456789'): sw += e else: sw += ' ' sw1 = sw.split(' ') new2 = [] for e in new1: if e in sw1: new2.append('') else: new2.append(e) new3 = [] for e in new2: if e != '': new3.append(e) u = [] v= [] for e in new3: if e not in u: u.append(e) v.append([e,1]) else: t = u.index(e) v[t] = [e,v[t][1]+1] a.close() stop_words.close() return v def fhash(w,M): s = 0 for i in range(len(w)): s += int(ord(w[i])*((37)**i)) fhash = s%M return fhash def new_bow(a): s = BoW(a) u = [] v = [] for e in s: i = fhash(e[0],M) j = e[1] if i not in u: u.append(i) v.append([i,j]) else: k = u.index(i) v[k] = [i,v[k][1]+j] v.sort() return v fh = input('Use feature hashing ? (y,Y,n,N) ') while fh != 'n' and fh != 'N' and fh != 'y' and fh != 'Y': print('Try again.') fh = input('Use feature hashing ? 
(y,Y,n,N) ') if fh == 'n' or fh == 'N': print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(word_count(file_name))) print('BoW = '+str(BoW(file_name))) elif fh == 'y' or fh == 'Y': M = int(input('M = ')) print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(word_count(file_name))) print('BoW = '+str(new_bow(file_name)))
# 6330413021 (30.00) 247 (2021-03-21 22:09) '''def senstrip(sentence) : a = '' for e in sentence : if e.isalnum(): a += e else : a += ' ' return a''' def senstrip(sentence) : a = '' for e in sentence : if '0' <= e <= '9' or 'a' <= e <= 'z' : a += e elif e == '\n' : pass else : a += ' ' return a def read_file(file): f = open(file) wordslist = [senstrip(line.lower()) for line in f.readlines()] f.close() return wordslist def fhash(words,m) : num = 0 for i in range(len(words)) : num += ord(words[i:i+1])*37**(i) return num % m def spstrip(word): a = '' for e in word : if e.isalnum() : a += e return a file_name = input('File name = ') hashornot = input('Use feature hashing ? (y,Y,n,N) ') while hashornot not in ['Y','y','N','n'] : print('Try again.') hashornot = input('Use feature hashing ? (y,Y,n,N) ') stopwords = ' '.join(read_file('stopwords.txt')).split() + [''] datalist = read_file(file_name) chain = ' '.join(datalist).split() words = [] for e in chain: if e not in stopwords : words.append(e) bow = [] if hashornot in ['Y','y'] : m = int(input('M = ')) hashed = [fhash(x,m) for x in words] for i in range(m) : a = hashed.count(i) if a > 0 : bow.append([i,a]) elif hashornot in ['N','n'] : words.sort() ind = 0 while ind != len(words) : bow.append([words[ind],words.count(words[ind])]) ind += words.count(words[ind]) alphacount = len(''.join(chain)) linecount = len(datalist) charcount = len(''.join(datalist)) wordcount = len(chain) print('-------------------') print('char count =',charcount) print('alphanumeric count =',alphacount) print('line count =',linecount) print('word count =',wordcount) print('BoW =',sorted(bow))
# 6330415221 (27.65) 248 (2021-03-22 23:37) Alp = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' alp = 'abcdefghijklmnopqrstuvwxyz' num = '1234567890' def remove(t) : #หาจน.ตัวอังกฤษกับตัวเลข out = '' for e in t : if e in Alp or e in alp or e in num : out += e else : out +='' return out def remove_punc(t) : #หาจน.คำ ex = '' for i in t : if i in "\"\'/\\().,;:!$%&*+-<=>?@[]^_`{|}~'" : ex += ' ' else : ex += i exs = ex.split() return exs def fhash(w,M) : G = 37 a = 0 for i in range(len(w)): a += ord(w[i])*(G**i) a = a % int(M) return a def find(s, t) : pp = [] for i in range(len(s)) : if s[i] == t : pp.append(s[i]) return pp file_name = input('File name = ') choose = input('Use feature hashing ? (y,Y,n,N) ') while choose not in 'yYnN' : print('Try again.') choose = input('Use feature hashing ? (y,Y,n,N) ') if choose == 'y' or choose == 'Y' : M = input('M = ') print('-------------------') else : print('-------------------') char_count = 0 alp_count = 0 word_count = 0 line_count = 0 infile = open(file_name, "r") temp = infile.read().splitlines() for line in temp : #หาจน.อักขระ char_count += len(line) alp_count += len(remove(line)) word_count += len(remove_punc(line)) line_count += 1 infile.close() print('char count =', char_count) print('alphanumeric count =', alp_count) print('line count =', line_count) print('word count =', word_count) infile2 = open("stopwords.txt", "r") sw = [] while True : line = infile2.readline() if line == '' : break else : sw += line.split() infile2.close() text = ' '.join(temp) text = remove_punc(text.lower()) text2 = [] for i in range(len(text)) : if not text[i] in sw : text2.append(text[i]) if choose == 'n' or choose == 'N' : BoW = [] words = [] for i in range(len(text2)) : if not text2[i] in words : words.append(text2[i]) frequency = text2.count(text2[i]) BoW.append([text2[i], frequency]) print('BoW =', BoW) if choose == 'y' or choose == 'Y' : fhash_num = [] for i in range(len(text2)) : fhash_num.append(fhash(text2[i],M)) p = [] f = [] k = [] BoW_Y = [] for 
i in range(len(fhash_num)) : u = find(fhash_num, fhash_num[i]) if u not in p : p.append(u) else : pass for i in range(len(p)) : a = p[i] f.append(len(a)) k.append(a[0]) for i in range(len(k)) : BoW_Y.append([k[i], f[i]]) BoW_Y.sort() print('BoW =', BoW_Y)
# 6330416921 (3.00) 249 (2021-03-21 23:28) #----------------------------------------------------- def line_count(a) : fin = open(a,'r') line = fin.readline() linecount=0 while len(line) > 0: linecount += 1 line = fin.readline() fin.close() return str(linecount) #----------------------------------------------------- def char_count(a): fin = open(a,'r') line=fin.readline().strip() charcount=0 while len(line) > 0: for i in line: charcount += 1 line=fin.readline().strip() fin.close() return str(charcount) #----------------------------------------------------- def alp_count(a): fin = open('sample.txt','r') line=fin.readline().strip() alpcount=0 while len(line) > 0: for i in line: if 'a'<=i<='z' or 'A'<=i<='Z' or '0'<=i<='9': alpcount += 1 line=fin.readline().strip() fin.close() return str(alpcount) #----------------------------------------------------- def remove_punc(t): out = "" for e in t : if 'a'<=e<='z' or 'A'<=e<='Z' or '0'<=e<='9' : out += e else : out += ' ' return out #----------------------------------------------------- def word_count(a): fin = open(a,'r') line=fin.readline().strip() wordcount=0 while len(line) > 0: wordcount += int(len(remove_punc(line).split())) line=fin.readline().strip() fin.close() return str(wordcount) #----------------------------------------------------- def remove_stopword(t): fin = open('stopwords.txt','r') stopword=[] line = fin.readline() while len(line) > 0: x=line.strip().split() for p in x: stopword.append(p) line = fin.readline() newtext=[] for e in t : if e in stopword : pass else : newtext.append(e) return newtext #----------------------------------------- def bownothash(a): fin = open(a,'r') line=fin.readline().strip() bow=[] newline=[] while len(line) > 0: low= line.lower() new=remove_stopword(remove_punc(low.strip()).split()) for f in new: newline.append(f) line=fin.readline().strip() d=newline.copy() for i in range(len(d)): if newline != []: x=newline.pop(0) if x in newline: n=1 aa=newline.copy() for w in aa: if w == x: 
n+=1 newline.remove(x) bow.append([x,n]) else: bow.append([x,1]) bow.sort() fin.close() return bow #----------------------------------------------------- def fhash(a,m): summ=0 for i in range (len(a)): summ += ord(a[i])*(37)**i c = summ % int(m) return c #----------------------------------------- def bowhash(a,m): fin = open(a,'r') line=fin.readline().strip() bow=[] newline=[] while len(line) > 0: low= line.lower() new=remove_stopword(remove_punc(low.strip()).split()) for f in new: newline.append(fhash(f,m)) line=fin.readline().strip() d=newline.copy() for i in range(len(d)): if newline != []: x=newline.pop(0) if x in newline: n=1 aa=newline.copy() for w in aa: if w == x: n+=1 newline.remove(x) bow.append([x,n]) else: bow.append([x,1]) bow.sort() fin.close() return bow #----------------------------------------------------- a = input('File name = ') b = input('Use feature hashing ? (y,Y,n,N) ') #----------------------------------------------------- while True: if b == 'y' or b=='Y': m = input('M = ') print ('-------------------') print ('char count = '+char_count(a)) print ('alphanumeric count = '+alp_count(a)) print ('line count = '+line_count(a)) print ('word count = '+word_count(a)) print ('BoW =',bowhash(a,m)) break #----------------------------------------------------- elif b == 'n' or b=='N': print ('-------------------') print ('char count = '+char_count(a)) print ('alphanumeric count = '+alp_count(a)) print ('line count = '+line_count(a)) print ('word count = '+word_count(a)) print ('BoW =',bownothash(a)) break #----------------------------------------------------- else: print('Try again.') b = input('Use feature hashing ? (y,Y,n,N) ')
# 6330417521 (30.00) 250 (2021-03-22 16:14) def fhash(text, M): i = 0 plu = 0 for e in text: plu += ord(e)*(37**i) i += 1 return plu %M file_name = input('File name = ') ch = input('Use feature hashing ? (y,Y,n,N) ') while ch not in ['y','Y','n','N']: print('Try again.') ch = input('Use feature hashing ? (y,Y,n,N) ') if ch == 'y' or ch == 'Y': m = int(input('M = ')) f = open(file_name, 'r') st_f = open('stopwords.txt', 'r') fd = [] for e in st_f: if e[-1] == '\n': fd += e[:-1].split() else: fd += e.split() f_u = '' cc = 0 ac = 0 lc = 1 for e in f: for k in e.lower(): if k != '\n': cc += 1 if k in 'abcdefghijklmnopqrstuvwxyz1234567890': ac += 1 if k == '\n': lc += 1 if k not in 'abcdefghijklmnopqrstuvwxyz1234567890': f_u += ' ' else: f_u += k li_f = f_u.split() li_fu = [] for e in li_f: if e not in fd: li_fu.append(e) print('-------------------') print('char count =', cc) print('alphanumeric count =', ac) print('line count =', lc) print('word count =', len(li_f)) if ch == 'y' or ch == 'Y': b = [] for_ch = [] for e in li_fu: a = fhash(e, m) if a not in for_ch: for_ch.append(a) b.append([a, 1]) else: c = for_ch.index(a) d = b[c][1] b[c] = [a, d+1] b.sort() else: b = [] for_ch = [] for e in li_fu: if e not in for_ch: for_ch.append(e) b.append([e, 1]) else: a = for_ch.index(e) c = b[a][1] b[a] = [e, c+1] print('BoW =', b) f.close() st_f.close()
# 6330418121 (20.88) 251 (2021-03-22 18:23) def remove_punc(t): out = "" for e in t: if e not in "\"\'/\\().,;:": out += e return out def fhash(w,M): c = 0 for i in range(len(w)): c += ord(w[i])*(37**i) out = c%M return out def word_count(word, wordslist): wc = 0 for w in wordslist: if w == word: wc += 1 return wc def cut_words(words, stopwords): cw = [] for i in range(len(words)): words[i] = words[i].lower() for e in words: if e not in stopwords: cw.append(e) return cw def Bow(wordslist): bow = [] wordslist.sort() for e in wordslist: if e not in bow : bow.append(e) for i in range(len(bow)): bow[i] = [bow[i],word_count(bow[i],wordslist)] return bow file_name = input("File name = ") feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower() while feature_hashing not in "yn": print("Try again.") feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower() if feature_hashing == "y": M = int(input("M = ")) stopwords = [] s_file = open("stopwords.txt","r") for l in s_file: if len(l) > 0 : for i in l.split(): stopwords.append(i) s_file.close() alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" character_count = 0 alphabet_count = 0 line_count = 0 words = [] file = open(file_name,"r") for l in file: character_count += len(l.strip()) if len(l) > 0: line_count += 1 w = "" for i in range(len(l)): if l[i] in alphabet: w += l[i] if l[i] not in alphabet and w != "": words.append(w) w = "" for x in l: if x in alphabet: alphabet_count += 1 final = cut_words(words, stopwords) if feature_hashing == "n": bow = Bow(final) else: for i in range(len(final)): final[i] = fhash(final[i],M) bow = Bow(final) print("-------------------") print("char count =",character_count) print("alphanumeric count =",alphabet_count) print("line count =",line_count) print("word count =",len(words)) print("BoW =",bow) file.close()
# 6330420321 (21.40) 252 (2021-03-22 23:33) def fhash(w,M) : sumss = 0 G = 37 for c in range(len(w)) : sumss += ord(w[c])*(G**c) return sumss % M def get_stopwords(stopwordss) : stopwords = [] with open(stopwordss,'r') as f : for line in f : for word in line.split() : stopwords.append(word.lower()) return stopwords def char_count(filen) : c = 0 aln = 0 line_c = 0 alllins = [] with open(filen,'r') as readfil: # while True : # line = readfil.readline().strip() # if not line : # break # line_c += 1 # c += len(line) # for each_chr in line : # if each_chr.isalnum(): # aln += 1 Alliine = readfil.readlines() for i in Alliine: if i != '\n' and not i.isspace(): alllins.append(i.strip('\n')) line_c = len(Alliine) for i in alllins: c += len(i) for eachchar in i : if eachchar.isalnum(): aln+=1 print('char count =',c) print('alphanumeric count =',aln) print('line count =',line_c) # return c , aln , line_c def word_count(filen,BowFlag,M) : file_str = '' alllins = [] with open(filen,'r') as readfil : # while True : # line = readfil.readline().strip() # if not line : # break # for c in line : # if c.isalnum(): # file_str += c.lower() # else : # file_str += ' ' Alliine = readfil.readlines() for i in Alliine: if i != '\n' and not i.isspace(): alllins.append(i.strip('\n')) for eachline in alllins : for c in eachline : if c.isalnum() : file_str += c.lower() else : file_str += ' ' if BowFlag : BOww = BoWwhash(file_str,M) else : BOww = BoW(file_str) words_c = len(file_str.split()) print('word count =',words_c) print('BoW =',BOww) # return words_c , BOww def BoW(sentence) : Bows = [] ss_l = [] stopword = get_stopwords('stopwords.txt') f_str = sentence f_str = f_str.split() for f in f_str : if f not in stopword : ss_l.append(f) for f in ss_l: if not any(f == subB[0] for subB in Bows) : #Checking in Sub-list Bows.append([f,f_str.count(f)]) Bows.sort() return Bows def BoWwhash(sentence,M) : Bows = [] hash_value = [] ss_l = [] stopword = get_stopwords('stopwords.txt') f_str = sentence f_str 
= f_str.split() for f in f_str : if f not in stopword : ss_l.append(f) for f in ss_l: hashf = fhash(f,M) hash_value.append(hashf) for i in hash_value : if not any(i == subBow[0] for subBow in Bows) : Bows.append([i,hash_value.count(i)]) Bows.sort() return Bows def main() : file_name = input('File name = ') while True: choice = input('Use feature hashing ? (y,Y,n,N) ').lower() if choice == 'y' : hash = True break elif choice == 'n' : hash = False break else : print('Try again.') if hash : M = int(input('M = ')) print('-'*19) char_count(file_name) word_count(file_name,True,M) else : print('-'*19) char_count(file_name) word_count(file_name,False,None) #------------------------------------------ main()
# 6330422621 (14.60) 253 (2021-03-21 20:05)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    mixfhash = 0
    for i, ch in enumerate(w):
        mixfhash += ord(ch) * 37 ** i
    return mixfhash % M


def _is_alnum(e):
    """Return True when ``e`` is an ASCII letter or digit."""
    return 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9'


def _bag(items):
    """Return [[item, frequency], ...] in order of first appearance."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, n] for item, n in tally.items()]


def freq1(str1):
    """Return the printable bag of words for the word list ``str1``,
    e.g. "[['best', 2], ['age', 1]]", in first-seen order."""
    return str(_bag(list(str1)))


def freq(str1):
    """Return the printable bag for the hash values ``str1`` (ints or numeric
    strings), e.g. "[[2, 1], [10, 2]]", sorted numerically."""
    return str(sorted(_bag([int(x) for x in str1])))


def remove(word):
    """Return ``word`` with every non ASCII-alphanumeric char replaced by a space."""
    return ''.join(e if _is_alnum(e) else ' ' for e in word)


def choose(word):
    """Clean every string of ``word`` with ``remove`` and return them joined,
    each followed by a single space."""
    string = ""
    for item in word:
        string += remove(item).strip() + " "
    return string


def for_M(M, words=None):
    """Return the hash (as a string) of every word in ``words``.

    When ``words`` is omitted the module-level ``strink`` list is used, as
    the original code did.
    """
    if words is None:
        words = strink  # noqa: F821 - legacy fallback to a global
    return [str(fhash(d, M)) for d in words]


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file = input("File name = ")
    while True:
        hon = input("Use feature hashing ? (y,Y,n,N) ")
        if hon in ("y", "Y", "n", "N"):
            break
        print("Try again.")
    use_hash = hon in ("y", "Y")
    if use_hash:
        M = int(input("M = "))

    with open("stopwords.txt", "r") as sp:
        stop = sp.read().split()
    with open(file, "r") as fn:
        lines = [line.rstrip("\n") for line in fn]

    count = sum(len(line) for line in lines)
    alphacount = sum(1 for line in lines for e in line if _is_alnum(e))
    # Split on punctuation BEFORE filtering, so "the," is seen as "the".
    allword = choose([line.lower() for line in lines]).split()
    word = [w for w in allword if w not in stop]

    print("-------------------")
    print("char count =", count)
    print("alphanumeric count =", alphacount)
    print("line count =", len(lines))
    print("word count =", len(allword))
    if use_hash:
        print("BoW =", freq(for_M(M, word)))
    else:
        print("BoW =", freq1(word))


if __name__ == '__main__':
    main()
# 6330423221 (19.95) 254 (2021-03-22 22:57)
def _is_alnum(e):
    """Return True when ``e`` is an ASCII letter or digit."""
    return "0" <= e <= "9" or "a" <= e <= "z" or "A" <= e <= "Z"


def _tokens(line):
    """Return the lower-cased alphanumeric words of ``line``."""
    return ''.join(e if _is_alnum(e) else " " for e in line).lower().split()


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    h = 0
    for i, ch in enumerate(w):
        h += ord(ch) * (37 ** i)
    return h % M


def line_count(n):
    """Return the number of lines in ``n`` (an iterable of lines)."""
    return sum(1 for _ in n)


def char_count(n):
    """Return the number of characters in ``n`` (an iterable of lines),
    newlines included."""
    return sum(len(line) for line in n)


def alphanumeric_count(n):
    """Return the number of ASCII letters and digits in ``n``."""
    return sum(1 for line in n for e in line if _is_alnum(e))


def word_count(n):
    """Return the number of alphanumeric words in ``n``."""
    return sum(len(_tokens(line)) for line in n)


def bow(fh, n, stopwords=(), m=None):
    """Return the bag of words of ``n`` (an iterable of lines).

    Words in ``stopwords`` are skipped (whole-word match).  With ``fh`` in
    'n'/'N' the result is [[word, frequency], ...] in first-seen order;
    otherwise words are hashed with modulus ``m`` and the result is
    [[hash, frequency], ...] sorted by hash, listing only hashes that occur.
    """
    kept = [w for line in n for w in _tokens(line) if w not in stopwords]
    tally = {}
    if fh in ("n", "N"):
        for w in kept:
            tally[w] = tally.get(w, 0) + 1
        return [[w, c] for w, c in tally.items()]
    for w in kept:
        h = fhash(w, m)
        tally[h] = tally.get(h, 0) + 1
    return sorted([h, c] for h, c in tally.items())


def main():
    """Prompt for the file and mode, then print the statistics and the BoW."""
    file_name = input("File name = ")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
    # Compare against the four answers; a substring test would accept ''.
    while fh not in ("y", "Y", "n", "N"):
        print("Try again.")
        fh = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if fh in ("y", "Y") else None

    with open("stopwords.txt", "r") as file_delete:
        x = file_delete.read().lower().split()
    # Read the text once instead of opening it once per statistic.
    with open(file_name, "r") as f:
        lines = f.readlines()

    print("-------------------")
    # Strip real newlines rather than assuming one per line.
    print("char count =", char_count(line.rstrip("\n") for line in lines))
    print("alphanumeric count =", alphanumeric_count(lines))
    print("line count =", line_count(lines))
    print("word count =", word_count(lines))
    print("BoW =", bow(fh, lines, x, M))


if __name__ == '__main__':
    main()
# 6330424921 (16.87) 255 (2021-03-22 22:00) def removepunc(t): out = '' for e in t: if e not in [ '(', ')', '-', '', '[', ']' ,'"' ,"'" ,';', ':', '>', '<','.','' ]: out += e return out def fhashword(w,M): a = [] show =[] for e in w: fhword = 0 n=0 for i in range(len(e)): fhword += ord(e[i])*37**n n += 1 a.append(fhword % int(M)) for i in range(min(a),max(a)+1): if a.count(i)!=0: show.append([i,a.count(i)]) return show file_name = input('File name = ' ) feature_hashing = input('Use feature hashing ? (y,Y,n,N) ').lower() while feature_hashing not in "yn" and len(feature_hashing)!=1 : print("Try again") feature_hashing = input('Use feature hashing ? (y,Y,n,N) ').lower() if feature_hashing == 'y': m = input('M = ') print('-------------------') filebow = open(file_name,'r') linebow = filebow.readlines() filebow.close() box = [] for i in linebow: line = i.replace('\n','') box.append(line) sample = str(box) sample2 = removepunc(sample) file = open('stopwords.txt','r') stopword = file.readlines() file.close() reallinebow = '' for i in sample2: reallinebow += i reallinebow = reallinebow.lower().split() lstopword = '' for i in stopword: lstopword += i lstopword.split() newbow = '' for i in range(len(reallinebow)): if reallinebow[i] not in lstopword: newbow += reallinebow[i]+',' else: newbow += '' finalbow = newbow.split(',')[:-1] finalbow2 = [] for i in finalbow: if i != '': finalbow2.append(i) wordfreq= [] for w in finalbow2: wordfreq.append([w,finalbow2.count(w)]) last = [] for i in wordfreq: if i not in last: last.append(i) last2 = [] for i in range(len(last)): if last[i][0] != '': last2.append(last[i]) finallast = [] for i in range(len(last2)): if last2[i][0] != '\\n': finallast.append(last2[i]) wordcount=0 for words in box: splitedwords=words.split() wordcount+=(len(splitedwords)) alphabetcount=0 for words in box: for alphabet in words: if alphabet in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789": alphabetcount+=1 charcount=0 for words in box: for 
char in words: charcount+=1 print('char count =',charcount) print('alphanumeric count =',alphabetcount) print('line count =',len(box)) print('word count =',wordcount) print('BoW =',fhashword(finalbow2,m)) elif feature_hashing == 'n': print('-------------------') filebow = open(file_name,'r') linebow = filebow.readlines() filebow.close() box = [] for i in linebow: line = i.replace('\n','') box.append(line) sample = str(box) sample2 = removepunc(sample) file = open('stopword.txt','r') stopword = file.readlines() file.close() reallinebow = '' for i in sample2: reallinebow += i reallinebow = reallinebow.lower().split() lstopword = '' for i in stopword: lstopword += i lstopword.split() newbow = '' for i in range(len(reallinebow)): if reallinebow[i] not in lstopword: newbow += reallinebow[i]+',' else: newbow += '' finalbow = newbow.split(',')[:-1] finalbow2 = [] for i in finalbow: if i != '': finalbow2.append(i) wordfreq= [] for w in finalbow2: wordfreq.append([w,finalbow2.count(w)]) last = [] for i in wordfreq: if i not in last: last.append(i) last2 = [] for i in range(len(last)): if last[i][0] != '': last2.append(last[i]) finallast = [] for i in range(len(last2)): if last2[i][0] != '\\n': finallast.append(last2[i]) wordcount=0 for words in box: splitedwords=words.split() wordcount+=(len(splitedwords)) alphabetcount=0 for words in box: for alphabet in words: if alphabet in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789": alphabetcount+=1 charcount=0 for words in box: for char in words: charcount+=1 print('char count =',charcount) print('alphanumeric count =',alphabetcount) print('line count =',len(box)) print('word count =',wordcount) print('BoW =',finallast)
# 6330425521 (30.00) 256 (2021-03-22 23:11) def fhash(word, M): Fhash = 0 M = int(M) for i in range(len(word)): Fhash += ord(word[i])*(37**i) return Fhash % M def BoW(clause): bow = [] clause = clause.split() clause.sort() n = 1 b_word = None for word in clause: if word == b_word: n += 1 b_word = word else: bow.append([b_word, n]) b_word = word n = 1 bow.append([b_word, n]) bow = bow[1::] return bow def cut_symbol(clause): cut = '' clause = clause.lower() for e in clause: if 'a' <= e <='z' or '0' <= e <= '9': cut += e else: cut += ' ' cut = ' '.join(cut.strip().split()) return cut def clear_all(clause, list_of_stopwords): clause = cut_symbol(clause).split() clear = '' for word in clause: if word not in list_of_stopwords: clear += word + ' ' return clear.strip() file_name = input('File name = ') open_file = open(file_name, 'r') file = open_file.read() open_file.close() read_file = '' for e in file: if e != '\n': read_file += e line_file = open(file_name, 'r') n_line_file = 0 for line in line_file: n_line_file += 1 line_file.close() char_file = open(file_name, 'r') n_char_file = 0 for line in char_file: n_char_file += len(line) char_file.close() stopwords = open('stopwords.txt', 'r') list_of_stopwords = stopwords.read().split() stopwords.close() cut_file = cut_symbol(file) clear_file = clear_all(file, list_of_stopwords) n_char = len(read_file) n_alphanumeric = len(''.join(cut_file.split())) n_word = len(cut_file.split()) FH = input('Use feature hashing ? (y,Y,n,N) ') while True: if FH not in ['y','Y','n','N']: print('Try again.') FH = input('Use feature hashing ? 
(y,Y,n,N) ') else: break if FH in ['y','Y']: M = input('M = ') print('-------------------') print('char count =', n_char) print('alphanumeric count =', n_alphanumeric) print('line count =', n_line_file) print('word count =', n_word) if FH in ['y','Y']: Fhash = '' for word in clear_file.split(): Fhash += str(fhash(word, M)) + ' ' bow_ = BoW(Fhash.strip()) bow = [] for fh, n in bow_: bow.append([int(fh), n]) else: bow = BoW(clear_file) print('BoW =', bow)
# 6330426121 (30.00) 257 (2021-03-22 18:01) def fhash(w,M): x = 0 for i in range(len(w)): x += ord(w[i])*37**i return x%M def to_alpha(s): i = 0 for c in s: if c.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': i += 1 return i def check_word(s): x = [] w = '' for c in s: if c.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': w += c else: if w != '': x.append(w) w = '' if w != '': x.append(w) return x file_name = input('File name = ') checkfh = input('Use feature hashing ? (y,Y,n,N) ') while True: if checkfh == 'y' or checkfh == 'Y': M = int(input('M = ')) checkfh = True break elif checkfh == 'n' or checkfh == 'N': checkfh = False break else: print('Try again.') checkfh = input('Use feature hashing ? (y,Y,n,N) ') print('-------------------') stopword = open('stopwords.txt', 'r') t = open(file_name, 'r') stw = [] for line in stopword: for e in line.strip().split(): stw.append(e.lower()) stopword.close() count_line = 0 count_c = 0 count_alp = 0 count_word = 0 for line in t: count_line += 1 count_c += len(line) count_alp += to_alpha(line) count_word += len(check_word(line)) count_c -= count_line-1 t.close() t = open(file_name, 'r') BoW = [] if checkfh == True: W_in_BoW = [] for line in t: for e in check_word(line): if e.lower() not in stw: if fhash(e.lower(),M) not in W_in_BoW: W_in_BoW.append(fhash(e.lower(),M)) BoW.append([fhash(e.lower(),M),1]) else: for i in range(len(BoW)): if BoW[i][0] == fhash(e.lower(),M): BoW[i][1] += 1 BoW.sort() else: W_in_BoW = [] for line in t: for e in check_word(line): if e.lower() not in stw: if e.lower() not in W_in_BoW: W_in_BoW.append(e.lower()) BoW.append([e.lower(),1]) else: for i in range(len(BoW)): if BoW[i][0] == e.lower(): BoW[i][1] += 1 t.close() print('char count =',count_c) print('alphanumeric count =',count_alp) print('line count =',count_line) print('word count =',count_word) print('BoW =',BoW)
# 6330427821 (24.80) 258 (2021-03-22 14:04)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    result = 0
    G = 37
    for i, ch in enumerate(w):
        result += ord(ch) * (G ** i)
    return result % M


def stop_words():
    """Return the lower-cased words listed in 'stopwords.txt'."""
    with open("stopwords.txt", "r") as f:
        return [item for line in f for item in line.lower().split()]


def _is_alnum(c):
    """Return True when ``c`` is a letter or a digit."""
    return c.isalpha() or c.isdigit()


def BOW(f, stop_words):
    """Return (text, word count) for ``f`` (an iterable of lines).

    Every non-alphanumeric char separates words.  ``text`` is the
    lower-cased non-stop words joined by spaces; the word count includes
    stop words.
    """
    resultwords = []
    total = 0
    for sentence in f:
        tokens = ''.join(c if _is_alnum(c) else ' ' for c in sentence).split()
        total += len(tokens)
        resultwords.extend(
            word.lower() for word in tokens if word.lower() not in stop_words
        )
    return ' '.join(resultwords), total


def _sorted_bag(items):
    """Return [[item, frequency], ...] sorted by item."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, n] for item, n in tally.items())


def isNot_fhash(bag_of_words):
    """Return the sorted bag [[word, frequency], ...] of the space-separated
    words in ``bag_of_words``."""
    return _sorted_bag(bag_of_words.split())


def is_fhash(bag_of_words, M):
    """Return the sorted bag [[hash, frequency], ...] of the space-separated
    words in ``bag_of_words``, hashed with modulus ``M``."""
    return _sorted_bag(fhash(w, M) for w in bag_of_words.split())


def letter_count(read_file):
    """Return (char count, alphanumeric count, line count) for ``read_file``
    (an iterable of lines); newlines are not counted as characters."""
    length_char = 0
    alphanumeric_count = 0
    line_count = 0
    for line in read_file:
        body = line.rstrip('\n')
        alphanumeric_count += sum(1 for c in body if _is_alnum(c))
        if len(line) != 0:
            line_count += 1
        length_char += len(body)
    return length_char, alphanumeric_count, line_count


def _print(file, M=-1):
    """Print the statistics and the bag of words of ``file``.

    ``M`` is the hash modulus; -1 selects the plain (unhashed) bag.
    """
    with open(file, "r") as f:
        lines = f.readlines()
    length_char, alphanumeric_count, line_count = letter_count(lines)
    print("-------------------")
    print("char count =", length_char)
    print("alphanumeric count =", alphanumeric_count)
    print("line count =", line_count)
    bag_of_words, len_BOW = BOW(lines, stop_words())
    print("word count =", len_BOW)
    if M != -1:
        print("BoW =", is_fhash(bag_of_words, M))
    else:
        print("BoW =", isNot_fhash(bag_of_words))


if __name__ == '__main__':
    # Errors are no longer swallowed, and the trailing input() pauses are
    # gone, so the script ends cleanly when stdin is exhausted.
    file = input('File name = ')
    while True:
        choice = input("Use feature hashing ? (y,Y,n,N) ")
        if choice in ('y', 'Y', 'N', 'n'):
            break
        print("Try again.")
    M = -1
    if choice in ('y', 'Y'):
        M = input("M = ")
    _print(file, int(M))
# 6330428421 (30.00) 259 (2021-03-22 00:37)
def cha_count(line):
    """Count the characters of ``line`` that are not newlines."""
    return sum(1 for symbol in line if symbol != '\n')


def alnum_count(line):
    """Count the alphanumeric characters of ``line``."""
    return sum(1 for symbol in line if symbol.isalnum())


def word_list(line):
    """Split ``line`` into lower-case words; non-alphanumerics act as separators."""
    spaced = ''.join(
        symbol.lower() if symbol.isalnum() else ' ' for symbol in line
    )
    return spaced.split()


def BoW(s):
    """Add every item of ``s`` to the module-level ``words``/``freq`` tally."""
    for item in s:
        if item in words:
            freq[words.index(item)] += 1
        else:
            words.append(item)
            freq.append(1)


def fhash(word, M):
    """Return the base-37 polynomial hash of ``word`` modulo ``int(M)``."""
    total = 0
    for position, symbol in enumerate(word):
        total += ord(symbol) * 37 ** position
    return total % int(M)


def remove_stop_words(s, stop_words):
    """Return the items of ``s`` that do not occur in ``stop_words``."""
    return [item for item in s if item not in stop_words]


# ---------------------------------------- input stage
file_name = input('File name = ') + '.txt'
Fstop_words = open('stopwords.txt', 'r')
fin = open(file_name, 'r')
while True:
    use_feature = input('Use feature hashing ? (y,Y,n,N) ')
    if use_feature in ('y', 'Y'):
        M = input('M = ')
        break
    if use_feature in ('n', 'N'):
        break
    print('Try again.')
print('-' * 19)

# ---------------------------------------- state
ch_count = 0
alp_num_count = 0
line_count = 0
word_count = 0
fhashed = []
stop_words = []
words = []
freq = []
bow = []

# ---------------------------------------- processing
for line in Fstop_words:
    stop_words.extend(word_list(line))

plain_mode = use_feature in ('n', 'N')
for line in fin:
    tokens = word_list(line)
    line_count += 1
    ch_count += cha_count(line)
    alp_num_count += alnum_count(line)
    word_count += len(tokens)
    fhashed = remove_stop_words(tokens, stop_words)
    if not plain_mode:
        # Replace every surviving word by its hash bucket.
        fhashed = [fhash(token, M) for token in fhashed]
    BoW(fhashed)

bow = [[key, amount] for key, amount in zip(words, freq)]
bow.sort()

# ---------------------------------------- report
print('char count =', ch_count)
print('alphanumeric count =', alp_num_count)
print('line count =', line_count)
print('word count =', word_count)
print('BoW =', bow)
fin.close()
Fstop_words.close()
# 6330429021 (21.70) 260 (2021-03-20 23:27)
from collections import Counter

# Characters deleted from the text before words and symbols are counted.
_PUNCTUATION = '.\'\"\\:;/,'
_PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``int(M)``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def _strip_punctuation(text):
    """Return ``text`` without the characters listed in ``_PUNCTUATION``."""
    return ''.join(ch for ch in text if ch not in _PUNCTUATION)


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    n = input('File name = ')
    dh = input(_PROMPT)

    with open('stopwords.txt', 'r') as stop_words:
        stop_w = stop_words.read().split()
    # Read the input once; the original re-opened it five times and left
    # the last handle open.
    with open(n, 'r') as file_name:
        lines = file_name.readlines()

    line_count = len(lines)
    char_count = sum(len(line.strip()) for line in lines)
    # Non-blank characters that are not in the punctuation set.
    symbol_count = sum(
        len(_strip_punctuation(token))
        for line in lines
        for token in line.split()
    )
    word_count = sum(len(_strip_punctuation(line).split()) for line in lines)

    # NOTE(review): stop words are matched before punctuation is removed,
    # so a token such as "the," is kept. Behaviour left as in the original.
    bag = []
    for line in lines:
        for token in line.split():
            token = token.lower()
            if token in stop_w:
                continue
            cleaned = _strip_punctuation(token)
            if cleaned:
                bag.append(cleaned)

    # Tuple membership: the original substring test ``dh not in 'YynN'``
    # also accepted '' and 'Yy'.
    while dh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        dh = input(_PROMPT)

    if dh in ('y', 'Y'):
        M = int(input('M = '))
        tally = Counter(fhash(word, M) for word in bag)
    else:
        tally = Counter(bag)

    # The separator is printed in both modes (it was missing for "n").
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', symbol_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', sorted([key, amount] for key, amount in tally.items()))


if __name__ == '__main__':
    main()
# 6330430621 (19.27) 261 (2021-03-22 23:52)
from collections import Counter

_ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'
_PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def fhash(x, M):
    """Return the base-37 polynomial hash of ``x`` modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(x)) % M


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    with open(input('File name = '), 'r') as file_name:
        k1 = file_name.read().lower()

    # Every non-alphanumeric character becomes a separator.
    k2 = ''.join(e if e in _ALNUM else ' ' for e in k1)
    kk = k2.split()

    # NOTE(review): this reads 'stopword.txt' (singular); confirm that this
    # is the intended file name.
    with open('stopword.txt', 'r') as stop:
        bbb = stop.read().split()
    c = [e for e in kk if e not in bbb]

    word = len(kk)
    char = len(k1) - k1.count('\n')
    alpha = len(k2) - k2.count(' ')
    # Counting only '\n' missed a final line without a trailing newline.
    line = k1.count('\n')
    if k1 and not k1.endswith('\n'):
        line += 1

    b = input(_PROMPT)
    while b not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        b = input(_PROMPT)

    if b in ('y', 'Y'):
        M = int(input('M = '))
        tally = Counter(fhash(e, M) for e in c)
    else:
        # The original run-length pass only merged adjacent duplicates of
        # an unsorted list, so repeated words were miscounted.
        tally = Counter(c)

    print('-------------------')
    print('char count = ' + str(char))
    print('alphanumeric count = ' + str(alpha))
    print('line count = ' + str(line))
    print('word count = ' + str(word))
    # Same label and ordering in both modes.
    print('BoW =', sorted([key, amount] for key, amount in tally.items()))


if __name__ == '__main__':
    main()
# 6330431221 (15.00) 262 (2021-03-22 23:11)
from collections import Counter

_ALNUM = 'abcdefghijklmnopqrstuvwxyz1234567890'

# Module-level state shared with cutword() and BOW(); filled in by main().
stopword = []
pp = []


def line_count(file_name):
    """Return the number of lines in the file ``file_name``."""
    with open(file_name, 'r') as a:
        return sum(1 for _ in a)


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name, 'r') as a:
        return sum(len(line) - line.count('\n') for line in a)


def alphanum_count(file_name):
    """Return the number of letters a-z (any case) and digits in the file."""
    with open(file_name, 'r') as a:
        return sum(1 for ch in a.read().lower() if ch in _ALNUM)


def word(file_name):
    """Return the lower-cased words of the file.

    Every character that is not a letter a-z or a digit acts as a separator.
    """
    with open(file_name, 'r') as a:
        text = a.read().lower()
    return ''.join(ch if ch in _ALNUM else ' ' for ch in text).split()


def word_count(file_name):
    """Return the number of words in the file (stop words included)."""
    return len(word(file_name))


def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def cutword(n):
    """Return the items of ``n`` that are not stop words, sorted."""
    return sorted(i for i in n if i not in stopword)


def BOW(f, M):
    """Return the sorted bag of words built from the module-level ``pp``.

    With ``f == 'y'`` every word is replaced by ``fhash(word, M)`` first;
    otherwise ``M`` is ignored.
    """
    kept = cutword(pp)
    if f == 'y':
        kept = [fhash(w, M) for w in kept]
    return sorted([key, amount] for key, amount in Counter(kept).items())


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    file_name = input("File name = ")
    with open("stopwords.txt", "r") as k:
        stopword[:] = k.read().lower().split()

    while True:
        g = input("Use feature hashing ? (y,Y,n,N) ").lower()
        if g not in ('y', 'n'):
            print("Try again.")
            continue
        # ``None`` replaces the original ``M = none``, which raised NameError.
        M = int(input("M = ")) if g == 'y' else None
        print('-------------------')
        print('char count = ', char_count(file_name))
        print('alphanumeric count = ', alphanum_count(file_name))
        print('line count = ', line_count(file_name))
        print('word count = ', word_count(file_name))
        pp[:] = word(file_name)
        # Printed directly: the original rebound the name BOW to the result.
        print("BoW = ", BOW(g, M))
        break


if __name__ == '__main__':
    main()
# 6330432921 (18.90) 263 (2021-03-22 12:29)
from collections import Counter

# Stop words consulted by Bow() and Bow2(); filled in by main().
stop = []


def read_txt():
    """Prompt for a file name and return the whole text of that file."""
    with open(input("File name = ")) as f:
        return f.read()


def read_stop():
    """Return the whitespace-separated tokens of the stop-word file."""
    # NOTE(review): this reads "stopword.txt" (singular); confirm that this
    # is the intended file name.
    with open("stopword.txt") as s:
        return s.read().split()


def _is_alnum(ch):
    """Tell whether ``ch`` is an ASCII letter or digit.

    The original range test ``65 <= ord(ch) <= 122`` also accepted the six
    punctuation characters between 'Z' and 'a'.
    """
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _tokens(txt):
    """Return the lower-cased words of ``txt``; other characters separate."""
    return ''.join(ch.lower() if _is_alnum(ch) else ' ' for ch in txt).split()


def fhash(txt, M):
    """Return the base-37 polynomial hash of ``txt`` modulo ``M``."""
    return sum(ord(ch) * 37 ** j for j, ch in enumerate(txt)) % M


def line_count(txt):
    """Return the number of lines in ``txt``.

    A trailing newline does not start an extra line.
    """
    lines = txt.count("\n")
    if txt and not txt.endswith("\n"):
        lines += 1
    return lines


def char_count(txt):
    """Return the number of characters in ``txt``, newlines excluded."""
    return len(txt) - txt.count("\n")


def alphanumeric_count(txt):
    """Return the number of ASCII letters and digits in ``txt``."""
    return sum(1 for ch in txt if _is_alnum(ch))


def word_count(txt):
    """Return the number of words in ``txt`` (stop words included).

    Uses the same tokenisation as Bow(). The original deleted punctuation,
    which fused words such as "a,b", and split on single spaces, which
    counted empty strings.
    """
    return len(_tokens(txt))


def Bow(txt):
    """Return ``[word, count]`` pairs for the non-stop words of ``txt``.

    Pairs are listed in order of first appearance.
    """
    kept = [w for w in _tokens(txt) if w not in stop]
    return [[w, n] for w, n in Counter(kept).items()]


def Bow2(txt, M):
    """Return ``[hash, count]`` pairs for the non-stop words of ``txt``.

    Every word is hashed with ``fhash(word, M)``; pairs are listed in order
    of first appearance.
    """
    hashed = [fhash(w, M) for w in _tokens(txt) if w not in stop]
    return [[h, n] for h, n in Counter(hashed).items()]


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    txt = read_txt()
    stop[:] = read_stop()
    while True:
        feature = str(input("Use feature hashing ? (y,Y,n,N) ")).lower()
        if feature not in ('y', 'n'):
            print("Try again.")
            continue
        if feature == 'y':
            M = int(input("M = "))
            bag = Bow2(txt, M)
        else:
            bag = Bow(txt)
        print("-------------------")
        print(f"char count = {char_count(txt)}")
        print(f"alphanumeric count = {alphanumeric_count(txt)}")
        print(f"line count = {line_count(txt)}")
        print(f"word count = {word_count(txt)}")
        # Label spelt "BoW", as the sibling submissions print it.
        print(f"BoW = {bag}")
        break


if __name__ == '__main__':
    main()
# 6330433521 (22.99) 264 (2021-03-22 16:52)
from collections import Counter

_ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'
_PROMPT = 'Use feature hashing ? (y,Y,n,N) '


def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def alphabet(t):
    """Return how many characters of ``t`` are letters a-z or digits."""
    return sum(1 for e in t if e.lower() in _ALNUM)


def chword(t):
    """Split ``t`` into words, treating every non-alphanumeric as a separator.

    Words keep their original case. A word that runs to the very end of
    ``t`` is included; the original only emitted a word when a separator
    followed it, so the last word of an unterminated line was lost.
    """
    found = []
    current = ''
    for e in t:
        if e.lower() in _ALNUM:
            current += e
        elif current:
            found.append(current)
            current = ''
    if current:
        found.append(current)
    return found


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    file_name = input('File name = ')
    feature = input(_PROMPT)
    while feature not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        feature = input(_PROMPT)
    hashing = feature in ('y', 'Y')
    if hashing:
        M = int(input('M = '))
    print('-------------------')

    # ``with`` closes the file; the original wrote ``stopword.close``
    # without parentheses, which never closed it.
    with open('stopwords.txt', 'r') as stopword:
        stop = [e.lower() for line in stopword for e in line.split()]

    chcount = 0
    alcount = 0
    lincount = 0
    wcount = 0
    kept = []
    with open(file_name, 'r') as fle:
        for line in fle:
            words = chword(line)
            lincount += 1
            # Drop the newline per line rather than assuming that every
            # line but the last ends with one.
            chcount += len(line.rstrip('\n'))
            alcount += alphabet(line)
            wcount += len(words)
            kept.extend(w.lower() for w in words if w.lower() not in stop)

    if hashing:
        kept = [fhash(w, M) for w in kept]
    # Sorted in both modes; previously only the hashing branch sorted.
    bow = sorted([key, amount] for key, amount in Counter(kept).items())

    print('char count = ', chcount)
    print('alphanumeric count = ', alcount)
    print('line count = ', lincount)
    print('word count = ', wcount)
    print('BoW = ', bow)


if __name__ == '__main__':
    main()
# 6330434121 (21.55) 265 (2021-03-22 17:39)
from collections import Counter

_ALNUM = "abcdefghijklmnopqrstuvwxyz0123456789"
_PROMPT = "Use feature hashing ? (y,Y,n,N) "


def flash(w, m):
    """Return the base-37 polynomial hash of ``w`` modulo ``int(m)``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(m)


def _sorted_counts(items):
    """Return sorted ``[item, count]`` pairs for ``items`` (``[]`` if empty)."""
    return sorted([key, amount] for key, amount in Counter(items).items())


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    file_name = input("File name = ")
    feature = input(_PROMPT)
    # The original guard (``not y and Y and n and N``) was always False, so
    # an invalid answer was silently treated as "no hashing".
    while feature not in ("y", "Y", "n", "N"):
        print("Try again.")
        feature = input(_PROMPT)
    hashing = feature in ("y", "Y")
    m = input("M = ") if hashing else 0

    with open("stopwords.txt", "r") as stopwords:
        stopword = stopwords.read().split()

    file = []
    line_count = 0
    char_count = 0
    alphanumeric_count = 0
    with open(file_name, "r", encoding="utf-8") as files:
        for line in files:
            line_count += 1
            char_count += len(line.strip("\n"))
            cleaned = "".join(
                ch if ch in _ALNUM else " " for ch in line.strip().lower()
            )
            # Every non-alphanumeric is now a blank, so the remainder is
            # the alphanumeric count.
            alphanumeric_count += len(cleaned) - cleaned.count(" ")
            file.extend(cleaned.split())

    word_count = len(file)
    new_file = [w for w in file if w not in stopword]

    # The original run-length loops never emitted the last group in plain
    # mode and carried a stale counter into the hashed pass.
    if hashing:
        bow = _sorted_counts(flash(w, m) for w in new_file)
    else:
        bow = _sorted_counts(new_file)

    print("-------------------")
    print("char count = ", char_count)
    print("alphanumeric count = ", alphanumeric_count)
    print("line count = ", line_count)
    print("word count = ", word_count)
    print("BoW = ", bow)


if __name__ == "__main__":
    main()
# 6330435821 (30.00) 266 (2021-03-21 21:13)
def count(data, element):
    """Return how many items of ``data`` equal ``element``."""
    return sum(1 for item in data if item == element)


def BoW(list):
    """Return ``[word, occurrences]`` pairs in order of first appearance."""
    pairs = []
    seen = []
    for token in list:
        if token in seen:
            continue
        pairs.append([token, count(list, token)])
        seen.append(token)
    return pairs


def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    total = 0
    for position, symbol in enumerate(w):
        total += ord(symbol) * (37 ** position)
    return total % M


def BoW_w_fhash(list, M):
    """Return ``[hash, occurrences]`` pairs in order of first appearance."""
    hashes = [fhash(token, M) for token in list]
    pairs = []
    seen = []
    for value in hashes:
        if value in seen:
            continue
        seen.append(value)
        pairs.append([value, count(hashes, value)])
    return pairs


# 3: load the stop words
stopwords_in = open('stopwords.txt', 'r')
stopwords_list = []
for row in stopwords_in:
    stopwords_list.extend(row.split())
stopwords_in.close()

# 1: scan the input file character by character
file_name = input('File name = ')
fin = open(file_name, 'r')
words_list = []
char_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
allow_letters = 'abcdefghijklmnopqrstuvwxyz0123456789'
for line in fin:
    line_count += 1
    word = ''
    for letter in line.lower():
        if letter != '\n':
            char_count += 1
        if letter in allow_letters:
            word += letter
            alphanumeric_count += 1
            continue
        # A separator ends the word collected so far, if there is one.
        if word != '':
            word_count += 1
            if word not in stopwords_list:
                words_list.append(word)
        word = ''
    # Flush a word that reaches the end of the line.
    if word != '':
        word_count += 1
        if word not in stopwords_list:
            words_list.append(word)
fin.close()

# 2: ask for the mode until the answer is valid
mode = input('Use feature hashing ? (y,Y,n,N) ')
while mode not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    mode = input('Use feature hashing ? (y,Y,n,N) ')

# 4: build the bag of words (the name BoW is rebound to the result)
if mode in ('Y', 'y'):
    M = int(input('M = '))
    BoW = BoW_w_fhash(words_list, M)
else:
    BoW = BoW(words_list)
BoW.sort()

print('-------------------')
print('char count =', char_count)
print('alphanumeric count =', alphanumeric_count)
print('line count =', line_count)
print('word count =', word_count)
print('BoW =', BoW)
# 6330436421 267 (2021-03-22 19:48)
from collections import Counter

# Characters that may appear inside a word; the original constant stopped
# at '8', so the digit '9' was treated as a separator.
CHAR_LIST = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"


def clear_word(word):
    """Lower-case ``word`` and split it on every character outside CHAR_LIST.

    Parameter : any piece of raw text
    Return    : list of the clean words found in it (may be empty)

    Doctest :
    >>> clear_word("Abc:a18")
    ['abc', 'a18']
    """
    spaced = "".join(ch if ch in CHAR_LIST else " " for ch in word.lower())
    # split() without arguments discards the empty pieces that the original
    # removed by hand.
    return spaced.split()


def clear_stop_word(file_name, sentence_list):
    """Remove stop words from ``sentence_list`` in place.

    Parameter : name of the stop-word file, list of words
    Return    : the same list object, without the stop words
    """
    with open(file_name, "r") as stopwords_file:
        stopwords = set(stopwords_file.read().split())
    # Slice assignment keeps mutating the caller's list, as before.
    sentence_list[:] = [w for w in sentence_list if w not in stopwords]
    return sentence_list


def sentence_to_list(sentence):
    """Convert a sentence into a sorted list of clean, non-stop words.

    Parameter : sentence as a string
    Return    : sorted list of words ready for counting
    """
    words = []
    for chunk in sentence.split():
        # extend() keeps the parts of "a:b" as separate words; the original
        # joined them back into "ab" and produced '' for pure punctuation.
        words.extend(clear_word(chunk))
    words = clear_stop_word('stopwords.txt', words)
    words.sort()
    return words


def bow(sentence):
    """Return the bag of words of ``sentence``.

    Parameter : sentence as a string
    Return    : list of ``[word, count]`` pairs, sorted by word

    Example (assuming stopwords.txt lists "he", "is", "a" and "of"):
    "Shane likes football; he is a big fan of Arsenal football team." gives
    [['arsenal', 1], ['big', 1], ['fan', 1], ['football', 2],
     ['likes', 1], ['shane', 1], ['team', 1]]
    """
    # The input is already sorted, so Counter's insertion order is sorted.
    return [[w, n] for w, n in Counter(sentence_to_list(sentence)).items()]


def fhash(word, M):
    """Return the base-37 polynomial hash of ``word`` modulo ``M``.

    Doctest :
    >>> fhash('big', 4)
    2
    """
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M


def bow_fhash(sentence, M):
    """Return the bag of words of ``sentence`` after feature hashing.

    Parameter : sentence as a string, modulus M
    Return    : list of ``[hash, count]`` pairs, sorted by hash
    """
    hashes = sorted(fhash(word, M) for word in sentence_to_list(sentence))
    return [[h, n] for h, n in Counter(hashes).items()]


def count(file_name, fhash=False, M=0):
    """Print the statistics and the bag of words of a file.

    Parameter : file name, fhash (True to use feature hashing, default
                False), M (modulus, only used when fhash is True)
    Return    : nothing
    """
    with open(file_name, "r") as file:
        line_list = [line.strip() for line in file]

    line_count = len(line_list)
    char_count = sum(len(line) for line in line_list)
    alphanumeric_count = sum(
        1 for line in line_list for ch in line if ch in CHAR_LIST
    )
    # Count words straight from the text; the original tokenised
    # ``str(list_of_words)``, which mixes in repr() escape sequences.
    word_count = sum(len(clear_word(line)) for line in line_list)

    print("-------------------")
    print(f"char count = {char_count}")
    print(f"alphanumeric count = {alphanumeric_count}")
    print(f"line count = {line_count}")
    print(f"word count = {word_count}")

    sentence = "".join(line + " " for line in line_list)
    if not fhash:
        print(f"BoW = {bow(sentence)}")
    else:
        print(f"BoW = {bow_fhash(sentence, M)}")


def main():
    """Ask for the file name and the mode, then print the report."""
    file_name = input("File name = ")
    while True:
        hashing_or_not = input("Use feature hashing ? (y,Y,n,N) ")
        if hashing_or_not in ("Y", "y"):
            M = int(input("M = "))
            count(file_name, fhash=True, M=M)
            break
        if hashing_or_not in ("N", "n"):
            count(file_name)
            break
        print("Try again.")


if __name__ == "__main__":
    main()
# 6330437021 (22.99) 268 (2021-03-22 21:54)
from collections import Counter

_PROMPT = "Use feature hashing ? (y,Y,n,N) "


def feature_hashing(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def _is_alnum(c):
    """Tell whether ``c`` is an ASCII letter or digit."""
    return "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9"


def _split_words(lines):
    """Return the maximal runs of ASCII alphanumerics found in ``lines``.

    A run that reaches the end of the input is included; the original lost
    the last word of a file that did not end with a separator.
    """
    found = []
    current = ""
    for line in lines:
        for c in line:
            if _is_alnum(c):
                current += c
            elif current:
                found.append(current)
                current = ""
    if current:
        found.append(current)
    return found


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    file_name = input("File name = ")
    A = input(_PROMPT)
    while A not in ["y", "Y", "n", "N"]:
        print("Try again.")
        A = input(_PROMPT)
    hashing = A in ["y", "Y"]
    if hashing:
        M = int(input("M = "))
    print("-------------------")

    # One pass over the file feeds every statistic; the original repeated
    # the whole pipeline in both branches and re-opened the file four times.
    with open(file_name, "r") as B:
        lines = B.readlines()

    print("char count =", sum(len(i) - (1 if "\n" in i else 0) for i in lines))
    print("alphanumeric count =", sum(1 for i in lines for c in i if _is_alnum(c)))
    # ``newline count + 1`` over-counted files that end with a newline.
    print("line count =", len(lines))

    F = _split_words(lines)
    print("word count =", len(F))

    stop = []
    with open("stopwords.txt", "r") as Z:
        for i in Z:
            for w in i.strip().split():
                w = w.lower()
                if w not in stop:
                    stop.append(w)

    J = [i.lower() for i in F if i.lower() not in stop]
    if hashing:
        J = [feature_hashing(i, M) for i in J]
    # Counter keeps first-appearance order, matching the original output.
    print("BoW =", [[key, amount] for key, amount in Counter(J).items()])


if __name__ == "__main__":
    main()
# 6330438721 (23.15) 269 (2021-03-22 22:04)
from collections import Counter

_ALNUM = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
_PROMPT = "Use feature hashing ? (y,Y,n,N) "


def fhash(word, M):
    """Return the base-37 polynomial hash of ``word`` modulo ``M``."""
    weight = 1
    hashsum = 0
    for w in word:
        hashsum += ord(w) * weight
        weight *= 37
    return hashsum % M


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    file = input("File name = ")
    ans = input(_PROMPT)
    # Tuple membership: the original substring test ``ans not in "yYnN"``
    # accepted '' (and e.g. 'Yn'), after which nothing was printed.
    while ans not in ("y", "Y", "n", "N"):
        print("Try again.")
        ans = input(_PROMPT)
    hashing = ans in ("y", "Y")

    with open(file, "r") as fn, open("stopwords.txt", "r") as fs:
        stop = fs.read().split()
        if hashing:
            M = int(input("M = "))
        print("-------------------")

        backn = 0
        cha_c = 0
        alp_c = 0
        line_c = 0
        pieces = []
        for line in fn:
            cha_c += len(line)
            backn += line.count("\n")
            alp_c += sum(1 for e in line if e in _ALNUM)
            # Every non-alphanumeric (newline included) separates words.
            pieces.append("".join(e if e in _ALNUM else " " for e in line))
            line_c += 1

    list_words = "".join(pieces).split()

    print("char count =", cha_c - backn)
    print("alphanumeric count =", alp_c)
    print("line count =", line_c)
    print("word count =", len(list_words))

    # Words are lower-cased before counting; the original lower-cased only
    # for the stop-word test, so "The" and "the" were separate entries.
    kept = [w.lower() for w in list_words if w.lower() not in stop]
    if hashing:
        kept = [fhash(w, M) for w in kept]
    # One shared, sorted code path replaces the two duplicated branches.
    print("BoW =", sorted([key, amount] for key, amount in Counter(kept).items()))


if __name__ == "__main__":
    main()
# 6330439321 (12.00) 270 (2021-03-21 22:31)
from collections import Counter

# NOTE(review): the stop words are hard-coded here, whereas the sibling
# submissions read them from stopwords.txt; confirm which is intended.
STOPWORDS = ('it', 'they', 'the', 'a', 'an', 'of', 'on', 'in', 'at',
             'is', 'am', 'are', 'was', 'were')


def fhash(word: str, M: int) -> int:
    """Return the base-37 polynomial hash of ``word`` modulo ``M``."""
    weight = 1
    hashsum = 0
    for w in word:
        hashsum += ord(w) * weight
        weight *= 37
    return hashsum % M


def bow(words: list, command: str, M: int) -> list:
    """Return ``[key, count]`` pairs for ``words``.

    With ``command == 'y'`` every word is replaced by ``fhash(word, M)``
    and the pairs are sorted by hash; otherwise the pairs follow the order
    in which the words first appear and ``M`` is ignored.
    """
    if command == 'y':
        keys = sorted(fhash(w, M) for w in words)
    else:
        keys = words
    return [[key, amount] for key, amount in Counter(keys).items()]


def _split_words(text: str) -> list:
    """Return the lower-cased alphanumeric runs of ``text``.

    A run that reaches the end of the text is included; the original lost
    the last word of a file that did not end with a separator.
    """
    words = []
    current = ''
    for c in text:
        if c.isalpha() or c.isdigit():
            current += c
        elif current:
            words.append(current.lower())
            current = ''
    if current:
        words.append(current.lower())
    return words


def main() -> None:
    """Prompt for a file and a mode, then print its statistics and BoW."""
    file_name = input('File name = ')
    # Only the hashing question is repeated on a bad answer; the original
    # loop asked for the file name again as well.
    cmd = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while cmd not in ('y', 'n'):
        print('Try again.')
        cmd = input('Use feature hashing ? (y,Y,n,N) ').lower()

    with open(file_name, 'r') as f:
        lines = f.readlines()
    file_line = ''.join(lines)

    M = int(input('M = ')) if cmd == 'y' else 0

    words = _split_words(file_line)
    n_alphanumeric = sum(1 for c in file_line if c.isalpha() or c.isdigit())

    print('-------------------')
    # Subtract the real number of newlines rather than assuming that only
    # the last line lacks one.
    print('char count =', len(file_line) - file_line.count('\n'))
    print('alphanumeric count =', n_alphanumeric)
    print('line count =', len(lines))
    print('word count =', len(words))

    not_stop_words = [w for w in words if w not in STOPWORDS]
    print('BoW =', bow(not_stop_words, cmd, M))


if __name__ == '__main__':
    main()
# 6330440921 (26.00) 271 (2021-03-22 12:33)
from collections import Counter


def punc(sentence):
    """Split ``sentence`` into words and count its alphanumeric characters.

    Every non-alphanumeric character acts as a separator. Returns the list
    of words with the alphanumeric count appended as its last element.
    """
    alpha = sum(1 for ch in sentence if ch.isalnum())
    spaced = ''.join(ch if ch.isalnum() else ' ' for ch in sentence)
    return spaced.split() + [alpha]


def backn(sentence):
    """Tell whether ``sentence`` ends with a newline (False for '')."""
    # endswith() is safe on the empty string; ``sentence[-1]`` raised.
    return sentence.endswith('\n')


def fhash(word, M):
    """Return the base-37 polynomial hash of ``word`` modulo ``M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M


def find_all(word, sentence):
    """Return how many items of ``sentence`` equal ``word``."""
    return sum(1 for item in sentence if item == word)


def unique(list_all, list_stop):
    """Return the items of ``list_all`` that are not in ``list_stop``.

    Despite the name, duplicates are kept.
    """
    return [e for e in list_all if e not in list_stop]


def main():
    """Prompt for a file and a mode, then print its statistics and BoW."""
    file_name = input('File name = ').strip()  # sample.txt

    with open('stopwords.txt', "r") as stopwords:
        stwords = stopwords.read().split()

    wordcount = line = alphacount = charcount = 0
    all_word = []
    with open(file_name, "r") as fname:
        for line_f in fname:
            # punc() is evaluated once per line instead of three times.
            *words, alpha = punc(line_f)
            all_word.extend(w.lower() for w in words)
            wordcount += len(words)
            alphacount += alpha
            line += 1
            charcount += len(line_f) - 1 if backn(line_f) else len(line_f)

    unique_list = unique(all_word, stwords)

    # The original ``while check != 'n' or check != 'y'`` was always true
    # and only ended through ``break``.
    while True:
        check = input('Use feature hashing ? (y,Y,n,N) ').lower()
        if check in ('y', 'n'):
            break
        print('Try again.')
    if check == 'y':
        M = int(input('M = '))
        unique_list = [fhash(e, M) for e in unique_list]

    # 19 dashes, the separator width used by the sibling submissions.
    print('-' * 19)
    print('char count =', charcount)
    print('alphanumeric count =', alphacount)
    print('line count =', line)
    print('word count =', wordcount)
    # Counter yields [] for an empty input, where ``bow[-1]`` used to raise.
    tally = Counter(unique_list)
    print('BoW =', sorted([key, amount] for key, amount in tally.items()))


if __name__ == '__main__':
    main()
# 6330441521 (30.00) 272 (2021-03-21 23:02)
def fhash(w, M):
    """Return sum(ord(w[d]) * 37**d) reduced modulo M."""
    G = 37
    s = 0
    for d in range(0, len(w)):
        s += ord(w[d]) * (G ** d)
    return s % M


def scan(lines):
    """Return ``(line_count, char_count, an_count, list_word)`` for *lines*.

    Characters exclude newlines; words are lower-cased runs of ASCII letters
    and digits.
    """
    line_count = 0
    char_count = 0
    an_count = 0
    list_word = []
    word = ''
    for line in lines:
        line_count += 1
        char_count += len(line.strip('\n'))
        # The extra blank guarantees the last word of a line is flushed.
        for d in line + ' ':
            if 'A' <= d <= 'Z' or 'a' <= d <= 'z' or '0' <= d <= '9':
                word += d
                an_count += 1
            elif word != '':
                list_word.append(word.lower())
                word = ''
    return line_count, char_count, an_count, list_word


def count_in_order(keys):
    """Return ``[key, count]`` pairs in order of first appearance."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, n] for key, n in counts.items()]


def read_stopwords(path='stopwords.txt'):
    """Return the lower-cased, whitespace-separated words of *path*."""
    with open(path, 'r') as stop:
        return [c.lower() for line in stop for c in line.split()]


def main():
    """Prompt for the inputs and print the report."""
    fn = input("File name = ")
    with open(fn, 'r') as file_name:
        lines = file_name.readlines()

    answer = input("Use feature hashing ? (y,Y,n,N,) ")
    while answer not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        answer = input("Use feature hashing ? (y,Y,n,N,) ")
    use_hash = answer in ['y', 'Y']
    if use_hash:
        M = int(input("M = "))

    stopwords = set(read_stopwords())
    line_count, char_count, an_count, list_word = scan(lines)
    kept = [d for d in list_word if d not in stopwords]
    if use_hash:
        BoW = count_in_order([fhash(d, M) for d in kept])
    else:
        BoW = count_in_order(kept)

    print("-------------------")
    print("char count =", char_count)
    print("alphanumeric count =", an_count)
    print("line count =", line_count)
    print('word count =', len(list_word))
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330443821 (20.65) 273 (2021-03-21 18:53)
def fhash(strr, M):
    """Return sum(ord(strr[i]) * 37**i) reduced modulo M."""
    G = 37
    summ = 0
    for i in range(len(strr)):
        summ += ord(strr[i]) * (G ** i)
    return summ % M


def _count_sorted(keys):
    """Return sorted ``[key, count]`` pairs; an empty input gives ``[]``."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def bow_for_yes(lst, M):
    """Return sorted ``[fhash(word, M), count]`` pairs for the words in *lst*."""
    return _count_sorted([fhash(word, M) for word in lst])


def bow(lst):
    """Return sorted ``[word, count]`` pairs for the words in *lst*."""
    return _count_sorted(lst)


def _read_words(path):
    """Return the lower-cased alphanumeric runs found in the file *path*."""
    words = []
    with open(path, 'r') as file:
        for line in file:
            # Every non-alphanumeric character is a word separator.
            cleaned = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in line)
            words.extend(cleaned.split())
    return words


def all_lower(x_file):
    """Return the lower-cased words of *x_file* that are not stop words."""
    stop_word = set(_read_words('stopwords.txt'))
    return [word for word in _read_words(x_file) if word not in stop_word]


def alphanumeric_count(file_name):
    """Count the alphanumeric characters in the file *file_name*."""
    with open(file_name, 'r') as file:
        return sum(1 for line in file for ch in line if ch.isalnum())


def countt(file_name):
    """Return ``(characters, words, lines)``; characters exclude newlines."""
    num_of_lines = 0
    num_of_char = 0
    with open(file_name, 'r') as file:
        for line in file:
            num_of_lines += 1
            num_of_char += len(line.strip("\n"))
    return num_of_char, len(_read_words(file_name)), num_of_lines


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    x = input("Use feature hashing ? (y,Y,n,N) ")
    while x not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        x = input("Use feature hashing ? (y,Y,n,N) ")
    if x in ['y', 'Y']:
        M = int(input("M = "))
        result = bow_for_yes(all_lower(file_name), M)
    else:
        result = bow(all_lower(file_name))

    num_of_char, num_of_words, num_of_lines = countt(file_name)
    print("-------------------")
    print("char count = " + str(num_of_char))
    print("alphanumeric count = " + str(alphanumeric_count(file_name)))
    print("line count = " + str(num_of_lines))
    print("word count = " + str(num_of_words))
    print("BoW = " + str(result))


if __name__ == '__main__':
    main()
# 6330444421 (30.00) 274 (2021-03-22 15:48)
def isAl(w):
    """Return True for an ASCII digit or a lower-case ASCII letter."""
    o = ord(w)
    return 48 <= o <= 57 or 97 <= o <= 122


def bagOfWord(words):
    """Return sorted ``[item, count]`` pairs; stop words are not handled here.

    The caller's list is left untouched.
    """
    BoW = []
    for item in sorted(words):
        if BoW and BoW[-1][0] == item:
            BoW[-1][1] += 1
        else:
            BoW.append([item, 1])
    return BoW


def fhash(words, M):
    """Return the list of sum(ord(w[i]) * 37**i) % M for each w in *words*."""
    hashed = []
    for w in words:
        mod = 0
        for i, c in enumerate(w):
            mod += ord(c) * (37 ** i)
        hashed.append(mod % M)
    return hashed


def removeStopWords(words, stopWords):
    """Return the items of *words* that are not in *stopWords*."""
    return [w for w in words if w not in stopWords]


def detail(fileText):
    """Analyse *fileText*.

    Returns ``[postprocessText, words, lineCount, charCount, wordCount,
    alphaCount]`` where postprocessText is the lower-cased text with every
    non-alphanumeric character replaced by a blank.
    """
    preprocessText = fileText.lower()
    pieces = []
    alphaCount = 0
    for c in preprocessText:
        if isAl(c):
            pieces.append(c)
            alphaCount += 1
        else:
            pieces.append(" ")
    postprocessText = "".join(pieces)
    words = postprocessText.split()
    newlines = preprocessText.count("\n")
    # A trailing newline ends the last line; it does not start another one.
    unterminated = 1 if preprocessText and not preprocessText.endswith("\n") else 0
    lineCount = newlines + unterminated
    charCount = len(preprocessText) - newlines
    wordCount = len(words)
    return [postprocessText, words, lineCount, charCount, wordCount, alphaCount]


def main():
    """Prompt for the inputs and print the report."""
    filename = input("File name = ")
    with open(filename, "r") as file:
        fileText = file.read()
    with open("stopwords.txt", "r") as stopFile:
        stopWords = stopFile.read()

    useHash = input("Use feature hashing ? (y,Y,n,N) ")
    while useHash not in ["y", "Y", "n", "N"]:
        print("Try again.")
        useHash = input("Use feature hashing ? (y,Y,n,N) ")

    d = detail(fileText)
    words, lineCount, charCount, wordCount, alphaCount = d[1], d[2], d[3], d[4], d[5]
    rawText = removeStopWords(words, stopWords.replace("\n", " ").split())
    if useHash in ["y", "Y"]:
        M = int(input("M = "))
        rawText = fhash(rawText, M)

    print("-------------------")
    print("char count =", charCount)
    print("alphanumeric count =", alphaCount)
    print("line count =", lineCount)
    print("word count =", wordCount)
    print("BoW =", bagOfWord(rawText))


if __name__ == "__main__":
    main()
# 6330445021 (26.67) 275 (2021-03-21 22:50)
#Prog-08: Bag-of-words
# 6330445021 (26.67) Matt Yongpiyakul

# Name of the input file; set by main() and read by the helpers below.
file_name = ''


def file():
    """Open the input file for reading; the caller must close it."""
    return open(file_name, 'r')


def words_in(file):
    """Return the lower-cased alphanumeric runs in an iterable of lines."""
    words = []
    for line in file:
        cleaned = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in line)
        words.extend(cleaned.split())
    return words


def stopwords():
    """Return the words listed in stopwords.txt."""
    with open('stopwords.txt', 'r') as sw:
        return words_in(sw)


def filtered():
    """Return the words of the input file that are not stop words."""
    stop = set(stopwords())  # read once, not once per word
    with file() as f:
        return [w for w in words_in(f) if w not in stop]


def characters():
    """Count the characters of the input file, excluding newlines only."""
    with file() as f:
        return sum(len(line.rstrip('\n')) for line in f)


def alnum():
    """Count the alphanumeric characters of the input file."""
    with file() as f:
        return sum(len(w) for w in words_in(f))


def word_count():
    """Count the words of the input file, stop words included."""
    with file() as f:
        return len(words_in(f))


def line_count():
    """Count the lines of the input file."""
    with file() as f:
        return sum(1 for _ in f)


def fhash(w):
    """Return sum(ord(w[k]) * 37**k); the caller applies the modulus."""
    ords = 0
    for k in range(len(w)):
        ords += ord(w[k]) * 37 ** k
    return ords


def bow():
    """Return ``[word, count]`` pairs for the filtered words, sorted by word."""
    counts = {}
    for w in sorted(filtered()):
        counts[w] = counts.get(w, 0) + 1
    return [[w, n] for w, n in counts.items()]


def fbow(M):
    """Return ``[fhash(word) % M, count]`` pairs in ascending key order."""
    counts = {}
    for w in filtered():
        key = fhash(w) % M
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def hashing():
    """Ask until a y/n answer is given; return True for yes."""
    while True:
        cond = input('Use feature hashing ? (y,Y,n,N) ').lower()
        if cond == 'y':
            return True
        if cond == 'n':
            return False
        print('Try again.')


def main():
    """Prompt for the inputs and print the report."""
    global file_name
    file_name = input('File name = ')
    if hashing():
        M = int(input('M = '))
        result = fbow(M)
    else:
        result = bow()
    print('-' * 19)
    print('char count =', characters())
    print('alphanumeric count =', alnum())
    print('line count =', line_count())
    print('word count =', word_count())
    print('BoW =', result)


if __name__ == '__main__':
    main()
# 6330446721 (20.20) 276 (2021-03-21 14:50)
def count_words(words, wordslist):
    """Count the items of *wordslist* equal to *words*."""
    c = 0
    for e in wordslist:
        if e == words:
            c += 1
    return c


def BoW(wordslist):
    """Sort *wordslist* in place and return ``[item, count]`` pairs in that order."""
    wordslist.sort()
    counts = {}
    for e in wordslist:
        counts[e] = counts.get(e, 0) + 1
    return [[e, n] for e, n in counts.items()]


def Hash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    c = 0
    for i in range(len(w)):
        c += ord(w[i]) * (37 ** i)
    return c % M


def cut_st_words(words, stopwords):
    """Return the lower-cased *words* that are not in *stopwords*."""
    return [w for w in (e.lower() for e in words) if w not in stopwords]


def scan_lines(lines):
    """Return ``(char count, alphanumeric count, line count, words)``.

    Characters exclude the newline; words are lower-cased runs of ASCII
    letters and digits. Blank lines count as lines.
    """
    c_c = 0
    alp_c = 0
    l_c = 0
    words = []
    for line in lines:
        if line.endswith("\n"):
            line = line[:-1]
        l_c += 1
        c_c += len(line)
        w = ""
        for u in line.lower():
            if 'a' <= u <= "z" or '0' <= u <= '9':
                alp_c += 1
                w += u
            elif w != "":
                words.append(w)
                w = ""
        if w != "":
            # The line ended inside a word; keep it.
            words.append(w)
    return c_c, alp_c, l_c, words


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    cd = input("Use feature hashing ? (y,Y,n,N) ")
    while cd not in ("y", "Y", "n", "N"):
        print("Try again.")
        cd = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = cd in ("y", "Y")
    if use_hash:
        M = int(input("M = "))

    stopwords = []
    with open("stopwords.txt", "r") as stwfile:
        for line in stwfile:
            stopwords.extend(line.split())

    with open(file_name, "r") as file:
        c_c, alp_c, l_c, words = scan_lines(file)

    final_words = cut_st_words(words, stopwords)
    if use_hash:
        final_words = [Hash(w, M) for w in final_words]
    rb = BoW(final_words)

    print("-------------------")
    print("char count =", c_c)
    print("alphanumeric count =", alp_c)
    print("line count =", l_c)
    print("word count =", len(words))
    print("BoW =", rb)


if __name__ == "__main__":
    main()
# 6330447321 (22.99) 277 (2021-03-22 02:40) file_name = str(input('File name = ').strip()) while True: ans = input('Use feature hashing ? (y,Y,n,N) ') if ans in ['y','n','Y','N']: if ans in ['Y','y']: fh = True M = int(input('M = ')) elif ans in ['N','n']: fh = False break print('Try again.') print('-------------------') f = open(file_name,'r') w = '' c,i,l,d = 0,0,0,0 L7 = [] for e in f.readlines(): l += 1 if '\n' in e : w += e[:-1]+' ' d += 1 else : w += e L7.append(e) L7 = L7[::-1] for z in range(len(L7)): if L7[z] != '\n':break print('char count =',len(w)-d) while i<len(w): if w[i].isalpha() or w[i].isnumeric(): c += 1 i += 1 print('alphanumeric count =',c) print('line count =',l-z) L,L2 = [],[] #list fo word L1 = L - stopwords w3 = '' for i in range(len(w)): if w[i].isalpha() or w[i].isnumeric(): w3 += w[i] else : if w3 != '' : L.append(w3.lower()) w3 = '' print('word count =',len(L)) f.close() g = open('stopwords.txt','r') L1,ww = [],'' for line in g.readlines(): if '\n' in line : L1.extend(line[:-1].split()) else : L1.extend(line.split()) for h in L: if not h in L1: L2.append(h) #count word in L2 (not same) L3 = [] wL3 = [] for i in range(len(L2)): if not L2[i] in wL3: wL3.append(L2[i]) L3.append([L2[i],1]) else: for j in range(len(L3)): if L3[j][0] == L2[i]: L3[j][1] += 1 break def fhash(y,M): count = 0 for i in range(len(y)): count += ord(y[i])*(37**i) return count%M L3.sort() L5,L6 = [],[] if not fh: print('BoW =',L3) else : L4 = [] for i in range(len(L3)): L4.append([fhash(L3[i][0],M),L3[i][1]]) for i in range(len(L4)): if not L4[i][0] in L6: L5.append(L4[i]) L6.append(L4[i][0]) else: for j in range(len(L5)): if L5[j][0] == L4[i][0]: L5[j][1] += L4[i][1] print('BoW =',sorted(L5))
# 6330448021 (0.00) 278 (2021-03-22 23:58) file_name = input('Flie name') use_feature = input('Use feature hashing ? (y,Y,n,N) ') def blank(t): result = "" for c in t: if c in "\"\'/\\,.:;": result += " " else: result += c return result def flash(w,M) : mavis = 0 for i in range(len(w)) : mavis += ord(w[i])*37**i meow = mavis%M return meow fin = open(file_name,"r") line = fin.readline() number_of_lines = 0 number_of_words = 0 number_of_characters = 0 number_of_alphanumeric = 0 for line in fin: line = line.strip("\n") words = line.split() number_of_lines += 1 number_of_words += len(words) number_of_characters += len(line) for line in fin : clear_sentence = blank(line) lower_sentence = clear_sentence.lower() fin.close() while use_feature not in 'yYnN' : use_feature = input('Use feature hashing ? (y,Y,n,N) ') if use_feature == 'y' or use_feature == 'Y' : M = int(input('M = ')) print(19*'-') print('char count = ',number_of_characters) print('alphanumeric count = ') print('line count = ',number_of_lines) print('word count =',number_of_words) print('BoW = ') elif use_feature == 'n' or use_feature == 'N' : print(19*'-') print('char count = ',number_of_characters) print('alphanumeric count = ') print('line count = ',number_of_lines) print('word count =',number_of_words) print('BoW = ')
# 6330449621 (28.00) 279 (2021-03-22 23:52)
def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) reduced modulo M."""
    sum_ord_num = 0
    for i in range(len(word)):
        sum_ord_num += ord(word[i]) * (37 ** i)
    return sum_ord_num % M


def duplicates(numbers_list):
    """Return the values that occur more than once, in first-seen order."""
    store = []
    checked = []
    for i in range(len(numbers_list)):
        counter = 1
        for j in range(i + 1, len(numbers_list)):
            if numbers_list[i] not in checked and numbers_list[j] == numbers_list[i]:
                counter += 1
        if counter > 1:
            store.append(numbers_list[i])
            checked.append(numbers_list[i])
    return store


def count_in_order(keys):
    """Return ``[key, count]`` pairs in order of first appearance."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, n] for key, n in counts.items()]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    check = input("Use feature hashing ? (y,Y,n,N) ").lower()
    while check not in ("y", "n"):
        print("Try again.")
        check = input("Use feature hashing ? (y,Y,n,N) ").lower()

    char = 0
    alpha = 0
    noline = 0
    pieces = []
    with open(file_name, "r") as f1:
        for line in f1:
            noline += 1
            for e in line:
                if e.isalnum():
                    pieces.append(e)
                    alpha += 1
                else:
                    pieces.append(' ')
                if e != '\n':
                    char += 1

    if check == 'y':
        M = int(input('M = '))

    word_list = "".join(pieces).lower().split()
    stopword = []
    with open("stopwords.txt", "r") as stp:
        for line in stp:
            stopword += line.split()
    listofdata = [e for e in word_list if e not in stopword]

    # Branch on the validated answer, not on the first thing typed.
    if check == 'y':
        b = count_in_order([fhash(w, M) for w in listofdata])
    else:
        b = count_in_order(listofdata)

    print('-' * 19)
    print('char count =', char)
    print('alphanumeric count =', alpha)
    print('line count =', noline)
    print('word count =', len(word_list))
    print("BoW =", b)


if __name__ == "__main__":
    main()
# 6330450121 (14.99) 280 (2021-03-22 22:57)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    n = 0
    for i in range(len(w)):
        n = n + (ord(w[i]) * (37 ** i))
    return n % M


def get_b(words, stopWords, Bow_Con, M):
    """Return sorted ``[key, count]`` pairs for *words*.

    Words are lower-cased and those in *stopWords* are skipped. The key is
    ``fhash(word, M)`` when *Bow_Con* is true, otherwise the word itself.
    """
    counts = {}
    for c in words:
        c = c.lower()
        if c in stopWords:
            continue
        key = fhash(c, M) if Bow_Con else c
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def _is_alnum(c):
    """Return True for an ASCII letter or digit."""
    return ('0' <= c <= '9') or ('A' <= c <= 'Z') or ('a' <= c <= 'z')


def scan_lines(lines):
    """Return ``(char count, alphanumeric count, line count, words)``.

    Characters exclude newlines; words keep their original case.
    """
    charCount = 0
    alnumCount = 0
    lineCount = 0
    words = []
    for line in lines:
        lineCount += 1
        word = ''
        for c in line:
            if c != '\n':
                charCount += 1
            if _is_alnum(c):
                alnumCount += 1
                word += c
            elif word:
                words.append(word)
                word = ''
        if word:
            # The line ended inside a word (no final newline); keep it.
            words.append(word)
    return charCount, alnumCount, lineCount, words


def main():
    """Prompt for the inputs and print the report."""
    M = 0
    file_name = input('File name = ')
    Bow_Con = input('Use feature hashing ? (y,Y,n,N) ')
    while Bow_Con not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        Bow_Con = input('Use feature hashing ? (y,Y,n,N) ')
    if Bow_Con in ['y', 'Y']:
        M = int(input('M = '))
        Bow_Con = True
    else:
        Bow_Con = False
    print('-------------------')

    stopWords = []
    with open('stopwords.txt', 'r') as stopWordsFile:
        for line in stopWordsFile:
            for w in line.split():
                w = w.lower()
                if w not in stopWords:
                    stopWords.append(w)

    with open(file_name, 'r') as wordsFile:
        charCount, alnumCount, lineCount, words = scan_lines(wordsFile)

    print('char count =', charCount)
    print('alphanumeric count =', alnumCount)
    print('line count =', lineCount)
    print('word count =', len(words))
    print('BoW =', get_b(words, stopWords, Bow_Con, M))


if __name__ == '__main__':
    main()
# 6330452421 (25.00) 281 (2021-03-22 15:00)
def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo int(M)."""
    total = 0
    for i in range(len(word)):
        total += ord(word[i]) * ((37) ** i)
    return total % int(M)


def group_sorted(items):
    """Return ``[item, count]`` pairs in ascending order; ``[]`` when empty."""
    BoW = []
    for item in sorted(items):
        if BoW and BoW[-1][0] == item:
            BoW[-1][1] += 1
        else:
            BoW.append([item, 1])
    return BoW


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ",)
    choose = input("Use feature hashing ? (y,Y,n,N) ")
    while choose.upper() not in ("N", "Y"):
        print("Try again.")
        choose = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = choose.upper() == "Y"
    if use_hash:
        M = input("M = ")

    list_stop = []
    with open("stopwords.txt", "r") as stopfile:
        for line in stopfile:
            list_stop.extend(line.split())

    char_count = 0
    line_count = 0
    alph_count = 0
    pieces = []
    with open(file_name, "r") as fn:
        for line in fn:
            line_count += 1
            for e in line:
                if e != "\n":
                    char_count += 1
                if "a" <= (e.lower()) <= "z" or "0" <= e <= "9":
                    alph_count += 1
                    pieces.append(e.lower())
                else:
                    pieces.append(" ")
    list_word = "".join(pieces).split()

    print("-------------------")
    print("char count =", str(char_count))
    print("alphanumeric count =", str(alph_count))
    print("line count =", str(line_count))
    print("word count =", str(len(list_word)))

    no_stop = [e for e in list_word if e not in list_stop]
    if use_hash:
        BoW = group_sorted([fhash(e, M) for e in no_stop])
    else:
        BoW = group_sorted(no_stop)
    print("BoW =", BoW)


if __name__ == "__main__":
    main()
# 6330453021 (22.80) 282 (2021-03-22 20:04) n = input('File name = ') o = input('Use feature hashing ? (y,Y,n,N) ') while o not in 'yYnN': print('Try again.') o = input('Use feature hashing ? (y,Y,n,N) ') if o in 'yY': M = input('M = ') elif o in 'nN': pass #------------------------------------- list_inf = '' inf = open(n, 'r') for line in inf: list_inf += line inf.close() #------------------------------------- stop_word = '' stop = open('stopwords.txt', 'r') for line in stop: stop_word += line stop.close() #------------------------------------- stop_word = stop_word.split() #print(stop_word) list_inf = list_inf.lower() word = list_inf.split() word2 = '' for i in range(len(word)): for k in word[i]: if 'a' <= k <= 'z' or '0' <= k <= '9': word2 += k word2 += ' ' word2 = word2.split() #print(word2) word3 = [] for i in range(len(word2)): word3.append(word2[i]) #print(word) word4 = [] for i in range(len(word3)): if word3[i] in stop_word: word4.append(word3[i]) for i in range(len(word4)): word3.remove(word4[i]) #print(word3) #------------------------------------- def char_count(list_inf): c1 = 0 for i in range(len(list_inf)): if list_inf[i] not in '\n': c1 += 1 print('char count = '+str(c1)) #------------------------------------- def alphanumeric_count(list_inf): c2 = 0 for i in range(len(list_inf)): if 'A' <= list_inf[i] <= 'z' or '0' <= list_inf[i] <= '9': c2 += 1 print('alphanumeric count = '+str(c2)) #------------------------------------- def line_count(list_inf): d = open(n, 'r') c3 = 0 for i in d: c3 += 1 print('line count = '+str(c3)) d.close() #------------------------------------- first_bow = [] back_bow = [] bow = [] def not_fhash(word3): for i in range(len(word3)): if word3[i] not in first_bow: first_bow.append(word3[i]) back_bow.append(word3.count(word3[i])) elif word3[i] in first_bow: pass for i in range(len(first_bow)): bow.append([first_bow[i],back_bow[i]]) print('BoW = '+str(bow)) numword = [] ffirst_bow = [] fback_bow = [] fbow = [] G = 37 def 
fhash(word3, M): for i in range(len(word3)): s = 0 p = 0 for k in word3[i]: s += ord(k)*(G**p) p += 1 s = s % int(M) numword.append(s) numword.sort() for i in range(len(numword)): if numword[i] not in ffirst_bow: ffirst_bow.append(numword[i]) fback_bow.append(numword.count(numword[i])) elif word3[i] in ffirst_bow: pass for i in range(len(ffirst_bow)): fbow.append([ffirst_bow[i],fback_bow[i]]) print('BoW = '+str(fbow)) print(str('-------------------')) char_count(list_inf) alphanumeric_count(list_inf) line_count(list_inf) print('word count = '+str(len(word2))) if o in 'nN': not_fhash(word3) elif o in 'yY': fhash(word3,M)
# 6330454721 (25.15) 283 (2021-03-22 18:48)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    G = 37
    total = 0
    for i in range(len(w)):
        total += ord(w[i]) * (G ** i)
    return total % int(M)


def _is_alnum(e):
    """Return True for an ASCII letter or digit."""
    return ("0" <= e <= "9") or ("a" <= e <= "z") or ("A" <= e <= "Z")


def _words(path):
    """Return the lower-cased alphanumeric runs found in the file *path*."""
    words = []
    with open(path, "r") as fin:
        for line in fin:
            # Every non-alphanumeric character separates words.
            cleaned = "".join(e.lower() if _is_alnum(e) else " " for e in line)
            words.extend(cleaned.split())
    return words


def _kept_words(file_name):
    """Return the sorted words of *file_name* that are not stop words."""
    stopword = set(_words("stopwords.txt"))
    return sorted(w for w in _words(file_name) if w not in stopword)


def _count_in_order(keys):
    """Return ``[key, count]`` pairs in order of first appearance."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, n] for key, n in counts.items()]


def charcount(file_name):
    """Count the characters of *file_name*, newlines excluded."""
    with open(file_name, "r") as fin:
        return sum(1 for line in fin for e in line if e != "\n")


def alphanum(file_name):
    """Count the ASCII letters and digits of *file_name*."""
    with open(file_name, "r") as fin:
        return sum(1 for line in fin for e in line if _is_alnum(e))


def linecount(file_name):
    """Count the lines of *file_name*."""
    with open(file_name, "r") as fin:
        return sum(1 for _ in fin)


def wordcount(file_name):
    """Count the words of *file_name*, stop words included."""
    return len(_words(file_name))


def bow(file_name):
    """Return ``[word, count]`` pairs for the non-stop words, sorted by word."""
    return _count_in_order(_kept_words(file_name))


def bowy(file_name, M):
    """Return ``[fhash(word, M), count]`` pairs sorted by hash value."""
    return sorted(_count_in_order([fhash(w, M) for w in _kept_words(file_name)]))


def show():
    """Print the four counts for the module-level ``file_name``."""
    print("-" * 19)
    print("char count =", charcount(file_name))
    print("alphanumeric count =", alphanum(file_name))
    print("line count =", linecount(file_name))
    print("word count =", wordcount(file_name))


def main():
    """Prompt for the inputs and print the report."""
    global file_name  # show() reads the file name from module scope
    file_name = input("File name = ", )
    feature = input("Use feature hashing ? (y,Y,n,N) ", )
    while feature not in ["y", "Y", "n", "N"]:
        print("Try again.")
        feature = input("Use feature hashing ? (y,Y,n,N) ", )
    if feature in ["n", "N"]:
        show()
        print("BoW =", bow(file_name))
    else:
        M = input("M = ", )
        show()
        print("BoW =", bowy(file_name, M))


if __name__ == "__main__":
    main()
# 6330455321 (26.00) 284 (2021-03-19 00:28)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    su = 0
    for i in range(len(w)):
        su += ord(w[i]) * (37 ** i)
    return su % M


def _is_alnum(e):
    """Return True for an ASCII letter or digit."""
    return 'a' <= e <= 'z' or "A" <= e <= "Z" or '0' <= e <= '9'


def chcount(w):
    """Count the characters of *w* that are not newlines."""
    return sum(1 for i in w if i != '\n')


def alpcount(w):
    """Count the ASCII letters and digits in *w*."""
    return sum(1 for e in w if _is_alnum(e))


def wor(w):
    """Return the lower-cased alphanumeric runs of *w*."""
    cleaned = "".join(e.lower() if _is_alnum(e) else " " for e in w)
    return cleaned.split()


def _group(sorted_items):
    """Collapse a sorted list into ``[item, count]`` pairs; ``[]`` when empty."""
    fin = []
    for item in sorted_items:
        if fin and fin[-1][0] == item:
            fin[-1][1] += 1
        else:
            fin.append([item, 1])
    return fin


def bowN(W, stop):
    """Return sorted ``[word, count]`` pairs for the words of *W* not in *stop*."""
    return _group(sorted(e for e in W if e.lower() not in stop))


def bowY(W, stop, M):
    """Return sorted ``[fhash(word, M), count]`` pairs for words not in *stop*."""
    return _group(sorted(fhash(e, int(M)) for e in W if e not in stop))


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    with open(file_name, "r") as line:
        lines = line.readlines()
    stop = []
    with open('stopwords.txt', "r") as sto:
        for i in sto:
            stop += wor(i)

    YN = input("Use feature hashing ? (y,Y,n,N) ")
    while YN not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        YN = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = YN in ['y', 'Y']
    if use_hash:
        Mm = int(input("M = "))

    print('-------------------')
    su = 0
    alsu = 0
    wo = []
    for y in lines:
        su += chcount(y)
        alsu += alpcount(y)
        wo += wor(y)
    print('char count =', su)
    print('alphanumeric count =', alsu)
    print('line count =', len(lines))
    print('word count =', len(wo))
    if use_hash:
        print('BoW =', bowY(wo, stop, Mm))
    else:
        print('BoW =', bowN(wo, stop))


if __name__ == "__main__":
    main()
# 6330458221 (30.00) 285 (2021-03-20 15:34)
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) reduced modulo M."""
    fh = 0
    for i in range(len(w)):
        fh += ord(w[i]) * (37 ** (i))
    fh %= M
    return fh


def split_words(lines):
    """Return the lower-cased alphanumeric runs found in *lines*."""
    pieces = []
    for s in lines:
        for ch in s:
            pieces.append(ch if ch in ALNUM else ' ')
    return [s.lower() for s in ''.join(pieces).split()]


def count_chars(lines):
    """Count the characters in *lines*, leaving out each line's newline."""
    total = 0
    for s in lines:
        total += len(s)
        if s.endswith('\n'):
            total -= 1
    return total


def count_alnum(lines):
    """Count the ASCII letters and digits in *lines*."""
    return sum(1 for s in lines for ch in s if ch in ALNUM)


def count_sorted(keys):
    """Return sorted ``[key, count]`` pairs."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the inputs and print the report."""
    file_name = input('File name = ')
    use = input('Use feature hashing ? (y,Y,n,N) ')
    while use != 'y' and use != 'Y' and use != 'n' and use != 'N':
        print('Try again.')
        use = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = use == 'y' or use == 'Y'
    if use_hash:
        M = int(input('M = '))
    print('-------------------')

    with open('stopwords.txt', 'r') as stop_word:
        stop = split_words(stop_word)
    with open(file_name, 'r') as f:
        file = f.readlines()

    print('char count = ' + str(count_chars(file)))
    print('alphanumeric count = ' + str(count_alnum(file)))
    print('line count = ' + str(len(file)))
    word = split_words(file)
    print('word count = ' + str(len(word)))

    tword = [w for w in word if w not in stop]
    if use_hash:
        bow = count_sorted([fhash(w, M) for w in tword])
    else:
        bow = count_sorted(tword)
    print('BoW = ' + str(bow))


if __name__ == '__main__':
    main()
# 6330459921 (22.99) 286 (2021-03-21 17:25)
def _is_alnum(c):
    """Return True for an ASCII letter or digit."""
    return ("a" <= c <= "z") or ("A" <= c <= "Z") or ("0" <= c <= "9")


def fhash(w, M):
    """Hash the alphanumeric characters of *w*.

    Non-alphanumeric characters are skipped; the result is
    sum(ord(c[i]) * 37**i) modulo M over the remaining characters c.
    """
    kept = [ch for ch in w if _is_alnum(ch)]
    total = 0
    for i, ch in enumerate(kept):
        total += ord(ch) * (37 ** i)
    return total % M


def scan_lines(lines):
    """Return ``(char count, alphanumeric count, line count, words)``.

    Characters exclude newlines; words keep their original case.
    """
    charcount = 0
    alphanumericcount = 0
    linecount = 0
    words = []
    for line in lines:
        linecount += 1
        word = ""
        for c in line:
            if c != "\n":
                charcount += 1
            if _is_alnum(c):
                alphanumericcount += 1
                word += c
            elif len(word) != 0:
                words.append(word)
                word = ""
        if len(word) != 0:
            # The line ended inside a word (no final newline); keep it.
            words.append(word)
    return charcount, alphanumericcount, linecount, words


def count_in_order(keys):
    """Return ``[key, count]`` pairs in order of first appearance."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, n] for key, n in counts.items()]


def main():
    """Prompt for the inputs and print the report."""
    file_name = input("File name = ")
    useBoW = input("Use feature hashing ? (y,Y,n,N)")

    # Find "char count", "alphanumeric count", "line count", "words count"
    with open(file_name, "r") as file:
        charcount, alphanumericcount, linecount, words = scan_lines(file)

    # Find the stop words
    stopwords = []
    with open("stopwords.txt", "r") as stopw:
        for line in stopw:
            for w in line.split():
                w = w.lower()
                if w not in stopwords:
                    stopwords.append(w)

    while useBoW not in ["y", "Y", "n", "N"]:
        print("Try again.")
        useBoW = input("Use feature hashing ? (y,Y,n,N)")

    kept = [c.lower() for c in words if c.lower() not in stopwords]
    if useBoW in ["y", "Y"]:
        M = int(input("M = "))
        kept = [fhash(c, M) for c in kept]
    print("-------------------")
    BoW = count_in_order(kept)

    print("char count =", charcount)
    print("alphanumeric count =", alphanumericcount)
    print("line count =", linecount)
    print("word count =", len(words))
    print("BoW =", BoW)


if __name__ == "__main__":
    main()
# 6330460421 (21.40) 287 (2021-03-20 23:08)
def remove_punc(word):
    """Return *word* with every character that is not an ASCII letter, digit
    or blank replaced by a blank."""
    kept = 'abcdefghijklmnopqrstuvwxyz0123456789 '
    return ''.join(i if i.lower() in kept else ' ' for i in word)


def read_lines(path):
    """Return the lower-cased lines of *path* without their newlines."""
    with open(path, "r") as f:
        return [line.rstrip('\n').lower() for line in f]


def count_sorted(keys):
    """Return sorted ``[key, count]`` pairs."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def hash_word(word, M):
    """Return sum(ord(word[j]) * 37**j) modulo int(M)."""
    hasn = 0
    for j in range(len(word)):
        hasn += (ord(word[j]) * 37 ** j)
    return hasn % int(M)


def main():
    """Prompt for the inputs and print the report."""
    listofline = read_lines(input("File name = "))
    linec = len(listofline)
    charc = sum(len(line) for line in listofline)
    # Join with a blank so words on adjacent lines stay separate.
    strofline = remove_punc(' '.join(listofline))
    alnumc = len(strofline) - strofline.count(' ')
    listofwordReal = strofline.split()

    wordstoplist = []
    for line in read_lines("stopwords.txt"):
        wordstoplist += line.split()
    listofword = [e for e in listofwordReal if e not in wordstoplist]

    featureh = input("Use feature hashing ? (y,Y,n,N) ")
    while featureh not in ["Y", "y", "N", "n"]:
        print("Try again.")
        featureh = input("Use feature hashing ? (y,Y,n,N) ")

    if featureh not in ["n", "N"]:
        M = input("M = ")
        bowl = count_sorted([hash_word(w, M) for w in listofword])
    else:
        bowl = count_sorted(listofword)

    print("-------------------")
    print("char count = " + str(charc))
    print("alphanumeric count = " + str(alnumc))
    print("line count = " + str(linec))
    print("word count = " + str(len(listofwordReal)))
    print("BoW = " + str(bowl))


if __name__ == "__main__":
    main()
# 6330461021 (30.00) 288 (2021-03-22 04:13)
ALNUM = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"


def count(data, element):
    """Return how many items of *data* compare equal to *element*."""
    return sum(1 for item in data if item == element)


def fhash(w, m):
    """Return the base-37 polynomial hash of the non-empty word *w* mod *m*."""
    total = ord(w[0])
    for pos in range(1, len(w)):
        total += ord(w[pos]) * 37 ** pos
    return total % m


def _clean_words(path):
    """Return the lower-cased alphanumeric runs found in the file *path*."""
    with open(path, "r") as handle:
        text = handle.read()
    folded = (ch.lower() for ch in text)
    return "".join(ch if ch in ALNUM else " " for ch in folded).split()


file_read = input("File name = ")
ht = input("Use feature hashing ? (y,Y,n,N) ")
while ht not in ("y", "Y", "n", "N"):
    print("Try again.")
    ht = input("Use feature hashing ? (y,Y,n,N) ")
if ht in ("y", "Y"):
    M = int(input("M = "))

# First pass: raw statistics, with case left untouched.
with open(file_read, "r") as fn:
    rows = fn.readlines()
flat = "".join(rows)
num_lines = len(rows)
num_charc = len(flat) - flat.count("\n")
num_al = sum(1 for ch in flat if ch in ALNUM)
num_words = len("".join(ch if ch in ALNUM else " " for ch in flat).split())

print("-------------------")
print("char count = " + str(num_charc))
print("alphanumeric count = " + str(num_al))
print("line count = " + str(num_lines))
print("word count = " + str(num_words))

# Second pass: lower-cased words with the stopwords filtered out.
sample_b = _clean_words(file_read)
stop_sp = _clean_words("stopwords.txt")
sample_b1 = sorted(word for word in sample_b if word not in stop_sp)

sample_b2 = []
for word in sample_b1:
    if word not in sample_b2:
        sample_b2.append(word)
bow = [[word, count(sample_b1, word)] for word in sample_b2]

if ht in ("n", "N"):
    print("BoW = " + str(bow))
elif ht in ("y", "Y"):
    # Words that land in the same bucket have their counts added together.
    merged = {}
    for word, times in bow:
        bucket = fhash(word, M)
        merged[bucket] = merged.get(bucket, 0) + times
    hbow_f = sorted([bucket, times] for bucket, times in merged.items())
    print("BoW = " + str(hbow_f))
# 6330462721 (30.00) 289 (2021-03-21 19:34)
def fhash(w, M):
    """Return the base-37 polynomial hash of *w* modulo ``int(M)``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def _spaced(line):
    """Lower-case *line* and replace everything except a-z / 0-9 by a space."""
    return "".join(
        ch if ("a" <= ch <= "z" or "0" <= ch <= "9") else " "
        for ch in line.lower()
    )


def _tally(items):
    """Return ``[[item, occurrences], ...]`` in first-appearance order."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, value] for key, value in counts.items()]


def read(file):
    """Return the lower-cased alphanumeric words of *file*, in order."""
    words = []
    with open(file, "r") as f:
        for line in f:
            words.extend(_spaced(line).split())
    return words


def clear(words, stopwords):
    """Return a copy of *words* without any word found in *stopwords*."""
    blocked = set(stopwords)
    return [w for w in words if w not in blocked]


def BoW_y(L, M):
    """Return the hashed bag of words of *L*, sorted by hash value.

    The original wrote ``f.sort`` without calling it, so the result was
    left in first-appearance order unless the caller sorted it afterwards.
    """
    return sorted(_tally(fhash(w, M) for w in L))


def BoW_n(L):
    """Return ``[[word, count], ...]`` for *L* in first-appearance order."""
    return _tally(L)


def count(file_name):
    """Return ``[char, alphanumeric, line, word]`` counts for *file_name*.

    Newlines are excluded from the character count per line, so a file that
    ends with a newline is no longer over-counted by one.
    """
    ch_count = a_count = l_count = w_count = 0
    with open(file_name, "r") as f:
        for line in f:
            l_count += 1
            ch_count += len(line) - line.count("\n")
            cleaned = _spaced(line)
            # After _spaced() every non-alphanumeric character is a space.
            a_count += len(cleaned) - cleaned.count(" ")
            w_count += len(cleaned.split())
    return [ch_count, a_count, l_count, w_count]


def main():
    """Run the interactive bag-of-words report."""
    file_name = input("File name = ")
    words = read(file_name)
    stopwords = read("stopwords.txt")
    ask = input("Use feature hashing ? (y,Y,n,N) ")
    while ask not in ["y", "Y", "n", "N"]:
        print("Try again.")
        ask = input("Use feature hashing ? (y,Y,n,N) ")
    if ask in ("Y", "y"):
        M = int(input("M = "))
        BoW = BoW_y(clear(words, stopwords), M)
    else:
        BoW = BoW_n(clear(words, stopwords))
    BoW.sort()
    stats = count(file_name)  # read the file once instead of four times
    print("-------------------")
    print("char count = " + str(stats[0]))
    print("alphanumeric count = " + str(stats[1]))
    print("line count = " + str(stats[2]))
    print("word count = " + str(stats[3]))
    print("BoW =", BoW)
    print(" ")


if __name__ == "__main__":
    main()
# 6330463321 (30.00) 290 (2021-03-22 22:38)
def fhash(c, M):
    """Return the base-37 polynomial hash of the word *c* modulo *M*."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(c)) % M


T = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
t = T.lower()
n = '0123456789'
keep = T + t + n

file_name = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
while b not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')
hashing = b in ('y', 'Y')
if hashing:
    M = int(input('M = '))

with open('stopwords.txt', 'r') as fin:
    stopwords = [w for line in fin for w in line.lower().split()]
print('-------------------')

# Pass 1: the lower-cased word list (every non-alphanumeric is a separator).
with open(file_name, 'r') as fin:
    all_a = ''.join(fin).replace('\n', ' ')
a2 = ''.join(ch if ch in keep else ' ' for ch in all_a)
new_a = a2.lower().split()

# Pass 2: the raw statistics.
with open(file_name, 'r') as fin:
    rows = fin.readlines()
l_count = len(rows)
aaa = ''.join(rows).replace('\n', '')
c_count = len(aaa)
print('char count = ' + str(c_count))
a_count = sum(1 for ch in aaa if ch in keep)
print('alphanumeric count = ' + str(a_count))
print('line count = ' + str(l_count))
w_count = len(new_a)
print('word count = ' + str(w_count))

a_withoutstopwords = [w for w in new_a if w not in stopwords]
if hashing:
    a_withoutstopwords = [fhash(w, M) for w in a_withoutstopwords]

# Tally in first-appearance order (dicts keep insertion order).
seen = {}
for key in a_withoutstopwords:
    seen[key] = seen.get(key, 0) + 1
BoW = [[key, times] for key, times in seen.items()]
print('BoW = ' + str(BoW))
# 6330464021 (14.15) 291 (2021-03-21 22:15)
ALNUM = set(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
)


def fhash(w, M):
    """Return the base-37 polynomial hash of the word *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def split_words(text):
    """Return the maximal runs of ASCII letters/digits in *text*.

    The original tested ``'A' <= i <= 'z'``, which also accepts the six
    punctuation characters between 'Z' and 'a', and it never flushed a word
    that ran up to the very end of the file.
    """
    return ''.join(ch if ch in ALNUM else ' ' for ch in text).split()


def count_pairs(items):
    """Return ``[[item, occurrences], ...]`` sorted by item (``[]`` if empty)."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive bag-of-words report."""
    with open("stopwords.txt", "r") as stopwords:
        sw = set(stopwords.read().lower().split())
    fn = input("File name = ")
    with open(fn, "r") as file_name:
        lines = file_name.readlines()
    text = ''.join(lines)

    ans = input("Use feature hashing ? (y,Y,n,N) ")
    # Compare against whole answers: ``ans in "YyNn"`` is a substring test
    # and would accept "" or "Yy".
    while ans not in ("Y", "y", "N", "n"):
        print("Try again.")
        ans = input("Use feature hashing ? (y,Y,n,N) ")
    M = int(input("M = ")) if ans in ("Y", "y") else None

    print("-------------------")
    print('char count = ' + str(len(text) - text.count('\n')))
    print('alphanumeric count = ' + str(sum(1 for ch in text if ch in ALNUM)))
    print('line count = ' + str(len(lines)))
    words = split_words(text)
    print('word count = ' + str(len(words)))

    kept = [w.lower() for w in words if w.lower() not in sw]
    if M is not None:
        kept = [fhash(w, M) for w in kept]
    print('BoW =', count_pairs(kept))


if __name__ == "__main__":
    main()
# 6330465621 (30.00) 292 (2021-03-21 22:58)
def _is_alnum(a):
    """Return True when *a* is an ASCII letter or digit."""
    return "A" <= a <= "Z" or "a" <= a <= "z" or "0" <= a <= "9"


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name, "r") as text:
        return sum(len(e) - e.count("\n") for e in text)


def alpha_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    with open(file_name, "r") as text:
        return sum(1 for e in text for a in e if _is_alnum(a))


def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name, "r") as text:
        return sum(1 for _ in text)


def word(file_name):
    """Return the alphanumeric words of the file in order, case preserved."""
    with open(file_name, "r") as text:
        content = text.read()
    return "".join(a if _is_alnum(a) else " " for a in content).split()


def n(word, words):
    """Return how many entries of *words* equal *word*."""
    return sum(1 for c in words if c == word)


def Bow(file_name):
    """Return ``[[word, count], ...]`` for the lower-cased non-stopwords of
    the file, in first-appearance order."""
    with open("stopwords.txt", "r") as stop_text:
        stop_words = set(stop_text.read().split())
    counts = {}
    for w in word(file_name):
        w = w.lower()
        if w not in stop_words:
            counts[w] = counts.get(w, 0) + 1
    return [[w, times] for w, times in counts.items()]


def fhash(file_name, M):
    """Return the feature-hashed bag of words of the file, sorted by bucket.

    Counts of words sharing a bucket are added together. The original
    pairwise merge loop never ran when there was exactly one distinct word,
    so that single entry was silently dropped.
    """
    merged = {}
    for token, times in Bow(file_name):
        bucket = sum(ord(ch) * 37 ** i for i, ch in enumerate(token)) % M
        merged[bucket] = merged.get(bucket, 0) + times
    return sorted([bucket, times] for bucket, times in merged.items())


def main():
    """Run the interactive bag-of-words report."""
    file_name = input("File name = ")
    hash_use = input("Use feature hashing ? (y,Y,n,N) ")
    while hash_use not in ['y', 'Y', 'n', 'N']:
        print("Try again.")
        hash_use = input("Use feature hashing ? (y,Y,n,N) ")
    if hash_use in ['y', 'Y']:
        M = int(input("M = "))
    print('-' * 19)
    # print() already inserts one space between arguments, so the labels
    # must not end with a space of their own.
    print("char count =", char_count(file_name))
    print("alphanumeric count =", alpha_count(file_name))
    print("line count =", line_count(file_name))
    print("word count =", len(word(file_name)))
    if hash_use in ['y', 'Y']:
        print("BoW =", fhash(file_name, M))
    else:
        print("BoW =", Bow(file_name))


if __name__ == "__main__":
    main()
# 6330466221 (30.00) 293 (2021-03-22 21:36)
alphabets = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
# Only characters are ever tested against this list, so the integer
# entries the original also stored here could never match.
nums = list('0123456789')
G = 37


def fhash(w, M):
    """Return the base-``G`` polynomial hash of *w* modulo ``int(M)``."""
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(M)


def count_chars(lines):
    """Return the number of characters in *lines*, newlines excluded.

    The original used ``len(text) - linecount + 1``, which is one too many
    whenever the last line ends with a newline.
    """
    return sum(len(line) - line.count('\n') for line in lines)


def tally(keys):
    """Return ``[[key, occurrences], ...]`` in first-appearance order."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, times] for key, times in counts.items()]


def main():
    """Run the interactive bag-of-words report."""
    file_name = input('File name = ')
    while True:
        check = input('Use feature hashing ? (y,Y,n,N) ').upper()
        if check in ('Y', 'N'):
            break
        print('Try again.')
    M = int(input('M = ')) if check == 'Y' else None
    print('-------------------')

    with open('stopwords.txt', 'r') as read:
        stopwords = set(read.read().lower().split())
    with open(file_name, 'r') as file:
        lines = file.readlines()

    allowed = set(alphabets) | set(nums)
    text = ''.join(lines)
    words = ''.join(ch if ch in allowed else ' ' for ch in text).lower().split()
    kept = [w for w in words if w not in stopwords]

    if check == 'Y':
        # Hash each word once; the original recomputed it for every bucket.
        BOW = sorted(tally(fhash(w, M) for w in kept))
    else:
        BOW = tally(kept)

    print('char count =', count_chars(lines))
    print('alphanumeric count =', len(''.join(words)))
    print('line count =', len(lines))
    print('word count =', len(words))
    print('BoW =', BOW)


if __name__ == '__main__':
    main()
# 6330467921 (29.00) 294 (2021-03-21 23:06)
abnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, M):
    """Return the base-37 polynomial hash of the word *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def count(word, wordslist):
    """Return how many entries of *wordslist* equal *word*."""
    return sum(1 for w in wordslist if w == word)


def valid_choice(answer):
    """Return True only for exactly 'y', 'Y', 'n' or 'N'.

    ``answer in 'yYnN'`` is a substring test: it also accepts the empty
    string and multi-character answers such as 'yY'.
    """
    return answer in ('y', 'Y', 'n', 'N')


def main():
    """Run the interactive bag-of-words report."""
    file_name = input('File name = ')
    yn = input('Use feature hashing ? (y,Y,n,N) ')
    while not valid_choice(yn):
        print('Try again.')
        yn = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = yn in ('y', 'Y')
    if hashing:
        M = int(input('M = '))
    print('-------------------')

    with open("stopwords.txt", "r") as stopfile:
        stopwords = set(stopfile.read().lower().split())

    cc = abc123 = lc = 0
    words = []
    with open(file_name, "r") as file:
        for line in file:
            lc += 1
            cc += len(line) - line.count('\n')
            abc123 += sum(1 for a in line if a in abnum)
            spaced = ''.join(a if a in abnum else ' ' for a in line)
            words += spaced.lower().split()

    print('char count =', cc)
    print('alphanumeric count =', abc123)
    print('line count =', lc)
    print('word count =', len(words))

    cut_words = [a for a in words if a not in stopwords]
    if hashing:
        cut_words = [fhash(a, M) for a in cut_words]
    bow = sorted([key, count(key, cut_words)] for key in set(cut_words))
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330468521 (27.60) 295 (2021-03-22 16:42)
def _is_alnum(e):
    """Return True when *e* is an ASCII letter or digit."""
    return "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9"


def _words_in(line):
    """Return the alphanumeric runs of *line*, case preserved."""
    return "".join(e if _is_alnum(e) else " " for e in line).split()


def char_count(file_name):
    """Return the number of characters in the file, newlines excluded.

    The original measured ``line.strip()``, which also discards leading and
    trailing spaces and tabs that should have been counted.
    """
    with open(file_name, "r") as fin:
        return sum(len(line) - line.count("\n") for line in fin)


def alphanumeric_count(file_name):
    """Return the number of ASCII letters and digits in the file."""
    with open(file_name, "r") as fin:
        return sum(1 for line in fin for e in line if _is_alnum(e))


def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name, "r") as fin:
        return sum(1 for _ in fin)


def word_count(file_name):
    """Return the number of alphanumeric words in the file."""
    with open(file_name, "r") as fin:
        return sum(len(_words_in(line)) for line in fin)


def BOW_list(file_name):
    """Return the lower-cased words of the file that are not stopwords.

    Stopwords are lower-cased too, so a capitalised entry in
    ``stopwords.txt`` still filters the matching word.
    """
    with open("stopwords.txt", "r") as fin1:
        stopwords = set(fin1.read().lower().split())
    with open(file_name, "r") as fin:
        return [
            w
            for line in fin
            for w in _words_in(line.lower())
            if w not in stopwords
        ]


def fhash(w, M):
    """Return the base-37 polynomial hash of *w* modulo ``int(M)``."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(M)


def _sorted_counts(items):
    """Return ``[[item, occurrences], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def Bow_yY(p, m=None):
    """Return the hashed bag of words of *p*, sorted by bucket.

    *m* is the modulus; when omitted, the module-level ``M`` is used, as in
    the original one-argument form.
    """
    if m is None:
        m = M  # noqa: F821 - legacy global fallback
    return _sorted_counts(fhash(e, int(m)) for e in p)


def BOW_nN(v):
    """Return ``[[word, count], ...]`` for *v*, sorted by word so both
    modes print in the same order."""
    return _sorted_counts(v)


def main():
    """Run the interactive bag-of-words report."""
    file_name = input("File name = ")
    u = input("Use feature hashing ? (y,Y,n,N) ")
    while u not in ["y", "Y", "n", "N"]:
        print("Try again.")
        u = input("Use feature hashing ? (y,Y,n,N) ")
    m = input("M = ") if u in ("y", "Y") else None
    print("-------------------")
    print("char count = " + str(char_count(file_name)))
    print("alphanumeric count = " + str(alphanumeric_count(file_name)))
    print("line count = " + str(line_count(file_name)))
    print("word count = " + str(word_count(file_name)))
    words = BOW_list(file_name)
    if m is not None:
        print("BoW = " + str(Bow_yY(words, m)))
    else:
        print("BoW = " + str(BOW_nN(words)))


if __name__ == "__main__":
    main()
# 6330469121 (16.00) 296 (2021-03-22 23:58)
ALNUM = set(
    'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
)


def fash(w, M):
    """Return the base-37 polynomial hash of *w* modulo ``int(M)``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % int(M)


def _words(text):
    """Return the alphanumeric runs of *text*.

    Every other character, newlines included, separates words. The original
    deleted such characters, turning "a,b" and line breaks into fused words.
    """
    return ''.join(ch if ch in ALNUM else ' ' for ch in text).split()


def file_stats(path):
    """Return ``(chars, alphanumerics, lines, words)`` for the file *path*."""
    with open(path, "r") as q:
        lines = q.readlines()
    text = ''.join(lines)
    chars = len(text) - text.count('\n')
    alnum = sum(1 for ch in text if ch in ALNUM)
    # Count the lines actually read; "newlines + 1" is one too many when
    # the file ends with a newline.
    return chars, alnum, len(lines), len(_words(text))


def comprog(q):
    """Print the four statistics lines for the file named *q*."""
    a, b, c, d = file_stats(q)
    print('char count =', a)
    print('alphanumeric count =', b)
    print('line count =', c)
    print('word count =', d)


def count(data, element):
    """Return how many items of *data* equal *element*."""
    return sum(1 for e in data if e == element)


def bow(file, m, M=None):
    """Return the sorted bag of words of *file*.

    *m* is the user's y/n answer. When hashing is requested and *M* is not
    supplied, it is read from standard input as the original did.
    """
    with open('stopwords.txt', "r") as stopword:
        stop = set(stopword.read().lower().split())
    with open(file, "r") as handle:
        kept = [w for w in _words(handle.read().lower()) if w not in stop]
    if m.lower() == 'y':
        if M is None:
            M = input('M = ')
        kept = [fash(w, M) for w in kept]
    return [[key, count(kept, key)] for key in sorted(set(kept))]


def main():
    """Run the interactive bag-of-words report."""
    file_name = input('File name = ')
    m = input('Use feature hashing ? (y,Y,n,N) ')
    while m not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        m = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if m.lower() == 'y' else None
    print('-------------------')
    comprog(file_name)
    print('BoW =', bow(file_name, m, M))


if __name__ == '__main__':
    main()
# 6330470721 (30.00) 297 (2021-03-21 18:47)
#Prog-08: Bag-of-words
#6330470721 (30.00) Name Wongsatorn Suwisuthikasame
file_name = input("File name = ")
a = input("Use feature hashing ? (y,Y,n,N) ")


def _tokens(path):
    """Return the lower-cased a-z / 0-9 runs of the file *path*, in order."""
    found = []
    with open(path, "r") as handle:
        for raw in handle:
            spaced = "".join(
                ch if ("a" <= ch <= "z" or "0" <= ch <= "9") else " "
                for ch in raw.lower()
            )
            found.extend(spaced.split())
    return found


def fhash(w, M):
    """Hash every word of the list *w* modulo ``int(M)`` and print the
    resulting ``[bucket, count]`` pairs in ascending bucket order."""
    buckets = sorted(
        sum(ord(ch) * 37 ** pos for pos, ch in enumerate(token)) % int(M)
        for token in w
    )
    distinct = []
    for value in buckets:
        if value not in distinct:
            distinct.append(value)
    print("BoW =", [[value, buckets.count(value)] for value in distinct])


def count_char(file_name):
    """Print the number of characters in the file, newlines excluded."""
    with open(file_name, "r") as v:
        rows = list(v)
    with_newline = sum(1 for row in rows if "\n" in row)
    print("char count =", sum(len(row) for row in rows) - with_newline)


def count_alpha(file_name):
    """Print the number of a-z / 0-9 characters after lower-casing."""
    print('alphanumeric count =', sum(len(tok) for tok in _tokens(file_name)))


def count_line(file_name):
    """Print the number of lines in the file (blank lines included)."""
    with open(file_name, 'r') as f:
        print('line count =', sum(1 for _ in f))


def count_word(file_name):
    """Print the number of words in the file."""
    print('word count =', len(_tokens(file_name)))


# Keep asking until the answer is one of the four accepted letters.
while a not in ('Y', 'y', 'N', 'n'):
    print("Try again.")
    a = input("Use feature hashing ? (y,Y,n,N) ")

use_hash = a in ('Y', 'y')
if use_hash:
    m = input("M = ")

print('-------------------')
count_char(file_name)
count_alpha(file_name)
count_line(file_name)
count_word(file_name)

with open("stopwords.txt", "r") as stop:
    stop_all = [entry for row in stop for entry in row.lower().split()]
words_cut = [tok for tok in _tokens(file_name) if tok not in stop_all]

if use_hash:
    fhash(words_cut, m)
else:
    occurrences = {}
    for tok in words_cut:
        occurrences[tok] = occurrences.get(tok, 0) + 1
    res_count = sorted([tok, times] for tok, times in occurrences.items())
    print('BoW =', res_count)
# 6330471321 (14.95) 298 (2021-03-22 10:19)
ALNUM = set(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
)


def fhash(w, M):
    """Return the base-37 polynomial hash of *w* modulo ``int(M)``."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % int(M)


def acount(x):
    """Return the number of ASCII letters and digits in the string *x*."""
    return sum(1 for ch in x if ch in ALNUM)


def wcount(x):
    """Return the alphanumeric words of the file named *x*, case preserved.

    Newlines separate words; the original deleted them first, which fused
    the last word of a line with the first word of the next.
    """
    with open(x, 'r') as ofn:
        text = ofn.read()
    return ''.join(ch if ch in ALNUM else ' ' for ch in text).split()


def BoW(x, h, M=0):
    """Return the sorted bag of words for the word list *x*.

    Words are lower-cased and stopwords from ``stopwords.txt`` removed.
    When *h* is true each word is replaced by ``fhash(word, M)``. *x* is not
    modified. The original deleted from *x* in place with a loop that never
    examined the last word and could not handle an empty list.
    """
    with open("stopwords.txt", "r") as handle:
        stopwords = set(handle.read().lower().split())
    kept = [w.lower() for w in x if w.lower() not in stopwords]
    if h:
        kept = [fhash(w, M) for w in kept]
    counts = {}
    for key in kept:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive bag-of-words report."""
    file_name = input('File name = ')
    y = input('Use feature hashing ? (y,Y,n,N) ')
    # Whole-answer comparison: ``y in 'yY'`` also accepts "" and "yY".
    while y not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        y = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = y in ('y', 'Y')
    M = input('M = ') if hashing else 0

    with open(file_name, "r") as ofn:
        lines = ofn.readlines()
    content = ''.join(lines)
    words = wcount(file_name)

    print('-------------------')
    print('char count =', len(content) - content.count('\n'))
    print('alphanumeric count =', acount(content))
    print('line count =', len(lines))
    print('word count =', len(words))
    print('BoW =', BoW(words, hashing, M))


if __name__ == '__main__':
    main()
# 6330472021 (15.00) 299 (2021-03-22 23:25)
file_name = None  # set by main(); the counting helpers default to it


def _is_alnum(ch):
    """Return True when *ch* is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def _read_lines(path):
    """Return the lines of *path*, or of the global ``file_name`` if None."""
    with open(file_name if path is None else path, 'r') as fn:
        return fn.readlines()


def _alnum_runs(text):
    """Return the maximal runs of ASCII letters/digits found in *text*."""
    return ''.join(ch if _is_alnum(ch) else ' ' for ch in text).split()


def char_count(path=None):
    """Return the number of characters, newlines excluded.

    The original subtracted one from every line's length, which undercounts
    a final line that has no trailing newline.
    """
    return sum(len(line) - line.count('\n') for line in _read_lines(path))


def alphanumeric_count(path=None):
    """Return the number of ASCII letters and digits."""
    return sum(1 for line in _read_lines(path) for ch in line if _is_alnum(ch))


def line_count(path=None):
    """Return the number of lines."""
    return len(_read_lines(path))


def word_count(path=None):
    """Return the number of alphanumeric words."""
    return len(_alnum_runs(''.join(_read_lines(path))))


def fhash(word, m):
    """Return the base-37 polynomial hash of *word* modulo *m*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % m


def main():
    """Run the interactive bag-of-words report."""
    global file_name
    file_name = input('File name = ')
    use_fhash = input('Use feature hashing ? (y,Y,n,N) ')
    while use_fhash not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        use_fhash = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = use_fhash in ('y', 'Y')
    if hashing:
        m = int(input('M = '))

    print('-' * 19)
    print('char count =', char_count())
    print('alphanumeric count =', alphanumeric_count())
    print('line count =', line_count())
    print('word count =', word_count())

    w = _alnum_runs(''.join(_read_lines(None)).lower())
    stw = set(_alnum_runs(''.join(_read_lines('stopwords.txt')).lower()))
    s = [e for e in w if e not in stw]
    if hashing:
        s = [fhash(e, m) for e in s]

    # The original stopped after defining fhash and never printed the BoW.
    counts = {}
    for key in s:
        counts[key] = counts.get(key, 0) + 1
    print('BoW =', [[key, counts[key]] for key in sorted(counts)])


if __name__ == '__main__':
    main()
# 6330474221 (24.90) 300 (2021-03-22 20:52)
alpnum = 'abcdefghijklmnopqrstuvwxyz0123456789'


def Bag_of_words(words):
    """Return ``[[word, count], ...]`` for *words*, sorted by word."""
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return [[word, counts[word]] for word in sorted(counts)]


def fhash(word, M):
    """Return the base-37 polynomial hash of *word* modulo ``int(M)``."""
    return sum(ord(char) * 37 ** i for i, char in enumerate(word)) % int(M)


def fhash_BOW(BoW, M):
    """Return *BoW* with each word replaced by its hash bucket; counts of
    words sharing a bucket are added. The result is sorted by bucket."""
    merged = {}
    for word, count in BoW:
        bucket = fhash(word, M)
        merged[bucket] = merged.get(bucket, 0) + count
    return sorted([bucket, count] for bucket, count in merged.items())


def split_words(line):
    """Return the lower-cased alphanumeric runs of *line*.

    Punctuation separates words. The original split on whitespace only and
    then deleted punctuation, so "a-b" became "ab" and a token made purely
    of punctuation was counted as an empty word.
    """
    lowered = line.lower()
    return ''.join(ch if ch in alpnum else ' ' for ch in lowered).split()


def main():
    """Run the interactive bag-of-words report."""
    file = input('File name = ')
    x = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while x not in ['n', 'y']:
        print('Try again.')
        x = input('Use feature hashing ? (y,Y,n,N) ').lower()
    M = input('M = ') if x == 'y' else ''

    with open('stopwords.txt', 'r') as sFile:
        stop_words = set(sFile.read().lower().split())

    charCount = lineCount = 0
    clean_words = []
    with open(file, 'r') as wFile:
        for line in wFile:
            lineCount += 1
            # Only the newline is excluded; surrounding spaces still count.
            charCount += len(line) - line.count('\n')
            clean_words += split_words(line)
    alpCount = sum(len(word) for word in clean_words)
    wordCount = len(clean_words)

    BoW = Bag_of_words(w for w in clean_words if w not in stop_words)
    if x == 'y':
        BoW = fhash_BOW(BoW, M)

    print('-------------------')
    print('char count =', charCount)
    print('alphanumeric count =', alpCount)
    print('line count =', lineCount)
    print('word count =', wordCount)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330475921 (26.00) 301 (2021-03-22 15:57)
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'


def fhash(word, M):
    """Return the base-37 polynomial hash of *word* modulo *M*."""
    return sum(ord(ch) * 37 ** j for j, ch in enumerate(word)) % M


def count_lines(text):
    """Return the number of lines in *text*.

    A trailing newline ends the last line rather than starting another one.
    The original started at 1 and added every newline, so files ending with
    a newline were over-counted.
    """
    if not text:
        return 0
    return text.count('\n') + (0 if text.endswith('\n') else 1)


def tally(keys):
    """Return ``[[key, occurrences], ...]`` sorted by key."""
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive bag-of-words report."""
    file_name = input("File name = ")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
    while fh not in ("y", "Y", "n", "N"):
        print("Try again.")
        fh = input("Use feature hashing ? (y,Y,n,N) ")
    hashing = fh in ("y", "Y")
    if hashing:
        M = int(input("M = "))
    print('-------------------')

    with open(file_name, "r") as sample:
        text = sample.read().lower()
    with open("stopwords.txt", "r") as stop_f:
        stplist = set(stop_f.read().lower().split())

    charc = len(text) - text.count('\n')
    alphac = sum(1 for ch in text if ch in ALNUM)
    linec = count_lines(text)
    txtlist = ''.join(ch if ch in ALNUM else ' ' for ch in text).split()

    kept = [w for w in txtlist if w not in stplist]
    if hashing:
        kept = [fhash(w, M) for w in kept]
    bow = tally(kept)

    # Labels fixed: the original printed "char chount" and "Bow".
    print('char count =', charc)
    print('alphanumeric count =', alphac)
    print('line count =', linec)
    print('word count =', len(txtlist))
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330476521 (20.06) 302 (2021-03-22 19:54)
def fhash(w, M):
    """Return the base-37 polynomial hash of the word *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def get_unique(words):
    """Return the distinct items of *words* in first-appearance order."""
    unique_words = []
    for item in words:
        if item not in unique_words:
            unique_words.append(item)
    return unique_words


def count_words(word, g_word):
    """Return how many entries of *g_word* equal *word*."""
    return sum(1 for item in g_word if item == word)


def split_words(text):
    """Return the lower-cased a-z / 0-9 runs of *text*.

    Every other character separates words. The original dropped punctuation
    without inserting a separator, so "a,b" became the single word "ab".
    """
    lowered = text.lower()
    return ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in lowered
    ).split()


def main():
    """Run the interactive bag-of-words report."""
    with open('stopwords.txt', "r") as fn:
        stopwords = set(fn.read().lower().split())

    file_name = input('File name = ')
    with open(file_name, "r") as fn1:
        rows = fn1.readlines()
    lines = len(rows)
    cw = sum(len(row) - row.count('\n') for row in rows)
    s_words = split_words(''.join(rows))

    while True:
        f_hash = input('Use feature hashing ? (y,Y,n,N) ')
        if f_hash in ['y', 'Y', 'n', 'N']:
            break
        print('Try again.')
    hashing = f_hash in ['y', 'Y']
    if hashing:
        M = int(input('M = '))
    print('-------------------')

    alphanumeric = sum(len(e) for e in s_words)
    kept = [w for w in s_words if w not in stopwords]
    if hashing:
        kept = [fhash(w, M) for w in kept]
    BoW = sorted([key, count_words(key, kept)] for key in get_unique(kept))

    print('char count =', cw)
    print('alphanumeric count =', alphanumeric)
    print('line count =', lines)
    print('word count =', len(s_words))
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330477121 (17.15) 303 (2021-03-22 23:55)
def _is_alnum(c):
    """Return True when *c* is an ASCII letter or digit."""
    return ('a' <= c <= 'z') or ('A' <= c <= 'Z') or ('0' <= c <= '9')


def Input_data():
    """Prompt for the file name, the hashing choice and (if hashing) M.

    Returns ``(file_name, use_hashing, M)``; *M* is -1 when hashing is off.
    The separator line is printed before returning.
    """
    M = -1
    File_name_input = input('File name = ')
    BoW_num = input('Use feature hashing ? (y,Y,n,N) ')
    while BoW_num not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        BoW_num = input('Use feature hashing ? (y,Y,n,N) ')
    use_hashing = BoW_num in ['y', 'Y']
    if use_hashing:
        M = int(input('M = '))
    print('-------------------')
    return File_name_input, use_hashing, M


def TikTok(w, M):
    """Return the base-37 polynomial hash of the word *w* modulo *M*."""
    return sum((37 ** i) * ord(ch) for i, ch in enumerate(w)) % M


def Words_Func(File_name_input):
    """Return ``(chars, alphanumerics, lines, words)`` for the file.

    *words* keeps the original case. The original only stored a word once a
    non-alphanumeric character followed it, so the final word of a file
    without a trailing newline was lost.
    """
    Lenght1 = Lenght2 = Num_Line = 0
    words = []
    with open(File_name_input, 'r') as wordsFile:
        for line in wordsFile:
            Num_Line += 1
            Lenght1 += len(line) - line.count('\n')
            cleaned = ''.join(c if _is_alnum(c) else ' ' for c in line)
            # Every non-alphanumeric is now a space, so the rest are hits.
            Lenght2 += len(cleaned) - cleaned.count(' ')
            words.extend(cleaned.split())
    return Lenght1, Lenght2, Num_Line, words


def StopWords_Func():
    """Return the distinct lower-cased words of ``stopwords.txt`` in order.

    The original looped over ``str(file_object)``, i.e. the characters of
    the object's repr, so the real stopwords were never read.
    """
    r = []
    with open('stopwords.txt', 'r') as File_Of_stopWords:
        for line in File_Of_stopWords:
            for i in line.lower().split():
                if i not in r:
                    r.append(i)
    return r


def BoW_Ans(words, stopWords, BoW_num, M):
    """Return the bag of words as sorted ``[key, count]`` pairs.

    Words are lower-cased and dropped when found in *stopWords*; when
    *BoW_num* is true each word is replaced by ``TikTok(word, M)``.
    """
    blocked = set(stopWords)
    counts = {}
    for j in words:
        j = j.lower()
        if j in blocked:
            continue
        key = TikTok(j, M) if BoW_num else j
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


def main():
    """Run the interactive bag-of-words report."""
    File_name_input, BoW_num, M = Input_data()
    stopWords = StopWords_Func()
    Lenght1, Lenght2, Num_Line, words = Words_Func(File_name_input)
    # print() adds the separating space itself; labels end right at "=".
    print('char count =', Lenght1)
    print('alphanumeric count =', Lenght2)
    print('line count =', Num_Line)
    print('word count =', len(words))
    print('BoW =', BoW_Ans(words, stopWords, BoW_num, M))


if __name__ == '__main__':
    main()
# 6330478821 (14.70) 304 (2021-03-21 23:16) def start(): hashes = input('Use feature hashing ? (y,Y,n,N) ') if hashes=='n' or hashes=='N': print('-------------------') print('char count = '+str(char_count)) print('alphanumeric count = '+str(alpha_count)) print('line count = '+str(line_count)) print('word count = '+str(word_count)) print('BoW = '+str(new_ans)) elif hashes=='y' or hashes=='Y': M=input('M = ') for i in new_ans: result=0 c=1 for j in i[0]: result=result+(ord(j)*c) c=c*37 result=result%int(M) if result in hash_ans_idx: hash_ans[hash_ans_idx.index(result)][1]+=i[1] else: hash_ans.append([result,i[1]]) hash_ans_idx.append(result) print('-------------------') print('char count = '+str(char_count)) print('alphanumeric count = '+str(alpha_count)) print('line count = '+str(line_count)) print('word count = '+str(word_count)) print('BoW = '+str(hash_ans)) else: print("Try again") start() ######################################################## filename = input("File name = ") sample = open(filename, "r") read = sample.read().lower() #print(read) ans=[] #<word,number of word> new_ans=[] hash_ans=[] hash_ans_idx=[] word=[] stopwords=[] s='' p='' #print(len(read)) #alphabet='abcdefghijklmnopqrstuvwxyz' #print(sample) #line_count = len(read.readlines()) line_count=1 pos=0 find=read.find('\n',pos,len(read)) while(find!=-1): find=read.find('\n',find+1,len(read)) #print(find) line_count+=1 #print(read) char_count = len(read)-line_count+1 # not sure for i in read: if i in '\"\'.,:;@#!%&*()|<>%\n ': if s in word: ans[word.index(s)][1]+=1 else: ans.append([s,1]) word.append(s) s='' else: s=s+i #print(ans) word_count=0 alpha_count=0 for i in ans: if(i[0]!=''): word_count+=i[1]; alpha_count=alpha_count+(len(i[0])*i[1]) stop = open("stopwords.txt", "r") #print(stopword.read()) stopread=stop.read() for i in stopread: if i in '\"\'.,:;@#!%&*()|<>%\n ': if p not in stopwords: stopwords.append(p) p='' else: p=p+i #print(stopwords) for i in ans: if i[0] not in stopwords and 
i[0]!='': new_ans.append(i) new_ans.sort() #print(new_ans.sort()) start()
# 6330481621 (30.00) 305 (2021-03-22 19:51) file_name=input("File_name= ") use=input("Use feature hashing ? (y,Y,n,N) ") while use not in ['y','Y','n','N']: print('Try again.') use=input("Use feature hashing ? (y,Y,n,N) ") if use in ['y','Y']: M=input("M = ") print('-------------------') stop=open('stopwords.txt','r') file=open(file_name,'r') cha=0 alpha=0 stw=[] linecount=0 wordcount=0 sen=''#ประโยคในfile ที่ cleanแล้ว for line in stop: a=line.split() for e in a: stw.append(e) for line in file: linecount+=1 a=line.split() cha+=len(line) for e in line: if e.isalnum(): sen+=e else: sen+=' ' word=sen.split() #[]คำสะอาด wordlow=[] #[]คำสะอาดพิมเล็ก wordcount=len(word) cha=cha-linecount+1 for e in word: wordlow.append(e.lower()) for i in range(len(word)): for e in word[i]: if '0'<=e<='9' or 'a'<=e.lower()<='z': alpha+=1 perfsen=' '.join(wordlow) #ประโยคสวย print('char count =',cha) print('alphanumeric count =',alpha) print('line count =',linecount) print('word count =',wordcount) #----------------------------------- def fhash(w,M): summ=0 G=37 for i in range(len(w)): summ+=ord(w[i])*G**i sol=summ%int(M) return sol #----------------------------------- def clean(s): a=[ '(', ')', '-', '_', '[', ']' ,'"' ,"'" ,';', ':', '>', '<','.' 
] c=[] for i in range(len(s)): if s[i] not in a: c.append(s[i]) return c #----------------------------------- nsen=[] for e in wordlow: if e not in stw: nsen.append(e) newsen=clean(nsen) #----------------------------------- x=[] y=[] for i in range(len(newsen)): if newsen[i] not in x: x.append(newsen[i]) y.append(1) else: y[x.index(newsen[i])]+=1 block=[] for i in range(len(x)): block.append([x[i],y[i]]) block.sort() #----------------------------------- if use in ['y','Y']: o=[] p=[] for i in range(len(newsen)): if fhash(newsen[i],M) not in o: o.append(fhash(newsen[i],M)) p.append(1) else: p[o.index(fhash(newsen[i],M))]+=1 q=[] for i in range(len(o)): q.append([o[i],p[i]]) q.sort() print('BoW =',q) else: print('BoW =',block) stop.close() file.close()
# 6330482221 (30.00) 306 (2021-03-21 02:40) def hashing(): hashing = input("Use feature hashing ? (y,Y,n,N) ").lower() while hashing != "y" and hashing != "n": print("Try again.") hashing = input("Use feature hashing ? (y,Y,n,N) ").lower() if hashing == "y": return True if hashing == "n": return False def do_hashing(w): g = 37 ; m = int(input("M = ")) ; l = [] ; l1 = [] ; l2 = [] ; l3 = [] for i in w: for e in range(len(i)): a = ord(i[e])*(g**e) l.append(a) l1.append(sum(l)%m) l = [] for i in l1: if i not in l2: l2.append(i) for i in l2: l3.append([i,l1.count(i)]) return sorted(l3) #----------------------------------------------------------------------------------# file_name = input("File name = ") fin = open(file_name,"r") check_hashing = hashing() stopwords = open("stopwords.txt","r") l = [] ; l1 = [] ; l2 = [] ; abc = "" # list เก็บ stopwords,sample for i in stopwords: i = i.split() for e in range(len(i)): l.append(i[e]) countall = 0 ; lines = 0 for m in fin: lines += 1 countall += len(m) for e in m: if e.isalnum(): abc += e else : abc += " " w = abc.split() for x in range(len(w)): l1.append(w[x]) countall = countall - (lines-1) countcn = 0 for j in l1: for e in j: if e.isdigit() or e.isalpha(): countcn += 1 for i in range(len(l1)): l1[i] = l1[i].lower() ans = "" for i in l1: ans += " " for u in range(len(i)): if "a" <= i[u] <= "z" or "0" <= i[u] <= "9" : ans += i[u] else: ans += " " ans = ans.strip().split() for i in range(len(ans)): if ans[i] in l: ans[i] = "" finalword = [] for i in ans: if i not in finalword: finalword.append(i) for i in finalword: if len(i) == 0: finalword.remove(i) ans_new = [] for i in ans: if i != "": ans_new.append(i) counts = [] for i in finalword: counts.append(ans.count(i)) bow = [] for i in range(len(finalword)): bow.append([finalword[i],counts[i]]) bow = sorted(bow) if not check_hashing: bow = bow else: hh = do_hashing(ans_new) print("-------------------") print(f'char count = {countall}') print(f'alphanumeric count = {countcn}') 
print(f'line count = {lines}') print(f'word count = {len(l1)}') if check_hashing: print(f'BoW = {hh}') else: print(f'BoW = {bow}') fin.close() stopwords.close()
# 6330483921 (26.00) 307 (2021-03-22 22:21)
alpha = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
# Stop words; filled from stopwords.txt when the file is run as a script.
stw = []


def ccount(s):
    """Return the number of characters in *s*."""
    return len(s)


def acount(s):
    """Return how many characters of *s* are lower-case letters or digits."""
    return sum(1 for i in s if i in alpha or i in number)


def wcount(s):
    """Return the number of whitespace-separated words in *s*."""
    return len(s.split())


def _tally(items):
    """Return a sorted list of [item, occurrences] pairs."""
    seen = []
    count = []
    for i in items:
        if i not in seen:
            seen.append(i)
            count.append(1)
        else:
            count[seen.index(i)] += 1
    return sorted([seen[i], count[i]] for i in range(len(seen)))


def nfh(s):
    """Return the sorted bag of words of *s*, ignoring the words in stw."""
    return _tally([i for i in s.split() if i not in stw])


def fh(s, M):
    """Return the feature hash of *s*: sum(ord(s[i]) * 37**i) % M."""
    b = 0
    for i, ch in enumerate(s):
        b = b + (ord(ch) * (37 ** i))
    return b % M


def yfh(s, M):
    """Return the sorted bag of hashed words of *s*, ignoring the words in stw."""
    return _tally([fh(i, M) for i in s.split() if i not in stw])


def de(s):
    """Replace every character of *s* that is not in alpha/number by a space."""
    return ''.join(c if c in alpha or c in number else ' ' for c in s)


if __name__ == '__main__':
    a = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    while b not in ['n', 'y', 'N', 'Y']:
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    if b in ['y', 'Y']:
        M = int(input('M = '))
    print('-------------------')

    with open('stopwords.txt', 'r') as stopw:
        stw = stopw.read().split()

    cc = 0
    ac = 0
    lc = 0
    wc = 0
    aline = ''
    with open(a, 'r') as file:
        for line in file:
            l = de(line.lower())
            # Count real characters only; a last line without '\n' is
            # handled the same as any other line.
            cc = cc + ccount(line) - line.count('\n')
            ac = ac + acount(l)
            lc = lc + 1
            wc = wc + wcount(l)
            aline = aline + l

    if b in ['y', 'Y']:
        BoW = yfh(aline, M)
    else:
        BoW = nfh(aline)
    print('char count =', cc)
    print('alphanumeric count =', ac)
    print('line count =', lc)
    print('word count =', wc)
    print('BoW =', BoW)
# 6330485121 (30.00) 308 (2021-03-21 01:01) def char_count(file_name): words = '' c = 0 fn = open(file_name) for line in fn: words += line for e in words: if e != '\n': c += 1 fn.close() return c def alphanumeric_count(file_name): words = '' c = 0 fn = open(file_name) for line in fn: words += line for e in words: if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789': c += 1 fn.close() return c def line_count(file_name): c = 0 fn = open(file_name) for line in fn: c += 1 fn.close() return c def list_of_words(file_name): words = '' listwords = '' fn = open(file_name) for line in fn: words += line for e in words: if e.lower() not in 'abcdefghijklmnopqrstuvwxyz0123456789' or e.lower() == '\n': listwords += ' ' else: listwords += e.lower() listwords = listwords.split() fn.close() return listwords # ['it', 'was', 'the', 'best', 'of', ...] def bag_of_words(file_name): listwords = list_of_words(file_name) sw = list_of_words('stopwords.txt') new = [] for e in listwords: if e not in sw: new.append(e) word = []; fr = []; bow = [] for e in new: if e.lower() not in word: word.append(e.lower()) fr.append(int(1)) elif e.lower() in word: fr[word.index(e.lower())] += 1 for i in range(len(word)): bow.append([word[i], fr[i]]) bow.sort() return bow def fhashing(w,m): fhash = 0 g = 37 for i in range(len(w)): fhash += ord(w[i])*(g**i) return fhash%m def feature_hashing(file_name): listwords = list_of_words(file_name) sw = list_of_words('stopwords.txt') new = [] for e in listwords: if e.lower() not in sw: new.append(e.lower()) fhash = []; ordd = []; fr = []; bow = [] for e in new: fhash.append(fhashing(e,m)) for i in range(len(fhash)): if fhash[i] not in ordd: ordd.append(fhash[i]) fr.append(int(1)) elif fhash[i] in ordd: fr[ordd.index(fhash[i])] += 1 for i in range(len(ordd)): bow.append([ordd[i], fr[i]]) bow.sort() return bow x = ['y', 'Y', 'n', 'N' ] file_name = input('File name = ') hashing = input('Use feature hashing ? 
(y,Y,n,N) ') while hashing not in x: print('Try again.') hashing = input('Use feature hashing ? (y,Y,n,N) ') if hashing in 'yY': m = int(input('M = ')) print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(len(list_of_words(file_name)))) print('BoW =',feature_hashing(file_name)) elif hashing in 'nN': print('-------------------') print('char count = '+str(char_count(file_name))) print('alphanumeric count = '+str(alphanumeric_count(file_name))) print('line count = '+str(line_count(file_name))) print('word count = '+str(len(list_of_words(file_name)))) print('Bow =',bag_of_words(file_name))
# 6330486821 (30.00) 309 (2021-03-22 09:27)
from collections import Counter


def remove_stopwords(text, sword):
    """Return the items of *text* that are not in *sword*, sorted."""
    return sorted(e for e in text if e not in sword)


def fhash(text, m):
    """Return the sorted feature hashes of the words in *text*.

    Each hash is sum(ord(w[i]) * 37**i) % m.
    """
    return sorted(
        sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m for w in text)


def repeat_word(text):
    """Collapse a sorted list into [item, count] pairs.

    One pair is emitted at the start of every run of equal neighbours;
    the count is the number of occurrences in the whole list.
    """
    totals = Counter(text)  # one pass instead of list.count per item
    ntext = []
    for i, item in enumerate(text):
        if i == 0 or item != text[i - 1]:
            ntext.append([item, totals[item]])
    return ntext


def show_features():
    """Print the separator and the four counters held in module globals."""
    print('-------------------')
    print('char count =', countc - n)
    print('alphanumeric count =', counta)
    print('line count =', countl)
    print('word count =', countw)


if __name__ == '__main__':
    #------------------info from file------------------
    countc, counta, countl, n = 0, 0, 0, 0
    text = ''
    with open(input('File name = '), 'r') as f:
        for line in f:
            countl += 1
            line = line.lower()
            for ch in line:
                countc += 1
                if ch == '\n':
                    n += 1
                if ch.isalnum():
                    counta += 1
                    text += ch
                else:
                    text += ' '
    text = text.split()
    countw = len(text)
    #-----------------stopwords import-----------------
    stopwords = []
    with open('stopwords.txt', 'r') as s:
        for line in s:
            stopwords += line.strip().split()
    #----------------------Input-----------------------
    h = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while h != 'y' and h != 'n':
        print('Try again.')
        h = input('Use feature hashing ? (y,Y,n,N) ').lower()
    #----------------------Output----------------------
    stext = remove_stopwords(text, stopwords)
    if h == 'n':
        show_features()
        print('BoW =', repeat_word(stext))
    elif h == 'y':
        m = int(input('M = '))
        show_features()
        print('BoW =', repeat_word(fhash(stext, m)))
# 6330487421 (22.99) 310 (2021-03-22 23:39)
def iinput():
    """Prompt for the file name and the feature-hashing choice.

    Returns:
        A tuple (file name, use-hashing flag, M); M stays -1 when
        feature hashing is not requested.
    """
    M = -1
    file_name = input('File name = ')
    wantfhash = input('Use feature hashing ? (y,Y,n,N) ')
    while wantfhash not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        wantfhash = input('Use feature hashing ? (y,Y,n,N) ')
    if wantfhash in ['y', 'Y']:
        M = int(input('M = '))
        wantfhash = True
    else:
        wantfhash = False
    print('-------------------')
    return file_name, wantfhash, M


def sstopwords():
    """Return the distinct lower-cased words listed in stopwords.txt."""
    x = []
    with open('stopwords.txt', 'r') as stopWordsFile:
        for line in stopWordsFile:
            for word in line.strip().split():
                word = word.lower()
                if word not in x:
                    x.append(word)
    return x


def _is_alnum(ch):
    """Return True for an ASCII letter or digit."""
    return ('0' <= ch <= '9') or ('A' <= ch <= 'Z') or ('a' <= ch <= 'z')


def wwords(file_name):
    """Scan a text file and collect its basic statistics.

    Returns:
        A tuple (characters excluding newlines, ASCII alphanumeric
        characters, number of lines, list of words in original case).
    """
    q = 0
    p = 0
    lines = 0
    words = []
    with open(file_name, 'r') as wordsFile:
        for line in wordsFile:
            lines += 1
            w = ''
            for y in line:
                if y != '\n':
                    q += 1
                if _is_alnum(y):
                    p += 1
                    w += y
                else:
                    if w:
                        words.append(w)
                    w = ''
            # Keep the word that ends a line with no trailing newline.
            if w:
                words.append(w)
    return q, p, lines, words


def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % M."""
    G = 37
    x = 0
    for i, ch in enumerate(w):
        x += ord(ch) * (G ** i)
    return x % M


def bbow(words, stopWords, wantfhash, M):
    """Return the sorted [key, count] pairs for *words* without stop words.

    The key is the lower-cased word, or fhash(word, M) when *wantfhash*
    is true.
    """
    counts = {}
    for y in words:
        y = y.lower()
        if y in stopWords:
            continue
        key = fhash(y, M) if wantfhash else y
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())


#----------------------------------------------------------------------------
if __name__ == '__main__':
    file_name, wantfhash, M = iinput()
    stopWords = sstopwords()
    q, p, lines, words = wwords(file_name)
    print('char count =', q)
    print('alphanumeric count =', p)
    print('line count =', lines)
    print('word count =', len(words))
    print('BoW =', bbow(words, stopWords, wantfhash, M))
# 6330488021 (19.92) 311 (2021-03-21 12:59) def fhash(w,M): f=0 for i in range(len(w)): f+=ord(w[i])*(37**i) total = f%int(M) return total def bow(word,q): alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' s_alphabet = alphabet.lower() n1='' w = word.lower() for e in w: if e in alphabet or e in s_alphabet or e in '1234567890' or e in ' ': n1+=e n1list = n1.split() n2='' for k in n1list: if k not in stop_list: n2+=k+' ' n2list=n2.split() #เก็บคำที่ตัด stopword ออก bow=[] if q=='N' or q=='n': n2new=[] for j in n2list: if j not in n2new: n2new.append(j) for c in n2new: bow.append([c,n2list.count(c)]) bow.sort() elif q=='Y' or q=='y': fhas=[] checkfhas=[] for k in n2list: fhas.append(fhash(k,M)) #เก็บ fhas ของทุกคำ for p in fhas: if p not in checkfhas: checkfhas.append(p) # เก็บเลขตัดตัวซ้ำออก for r in checkfhas: bow.append([r,fhas.count(r)]) bow.sort() return bow file_name = input('File name = ') q= input('Use feature hashing ? (y,Y,n,N) ') while q not in 'yYnN': print('Try aqain.') q=input('Use feature hashing ? (y,Y,n,N) ') if q== 'y' or q=='Y': M=input('M = ') f_stop = open('stopwords.txt','r') line_s = f_stop.readline() s=line_s for line_s in f_stop: s+=line_s stop_list =s.split() #------------------------------- f_file = open(file_name,'r') line_f = f_file.readline() fn=line_f char = '' l=1 kount=0 for line_f in f_file: fn+=line_f l+=1 #3 line count for c in fn: if c !='\n': char+=c else: char+=' ' kount+=1 ch= len(char)-kount #1 char count count = 0 for e in fn: if 'A' <= e <='Z' or 'a'<= e <='z': count+=1 if e in '123456789': count+=1 #2 letter and num count f_list = fn.split() word_c = len(f_list) #4 wordcount print('-------------------') print('char count = '+str(ch)) print('alphanumeric count = '+str(count)) print('line count = '+str(l)) print('word count = '+str(word_c)) print('BoW =',bow(char,q)) f_file.close() f_stop.close()
# 6330489721 (23.35) 312 (2021-03-22 21:01)
def find_replace(t):
    """Return *t* with every non-alphanumeric character replaced by a space."""
    return ''.join(c if c.isalnum() else ' ' for c in t)


def add(BoW, d):
    """Count one more occurrence of *d* in the [key, count] list *BoW*.

    The list is updated in place and also returned.
    """
    for entry in BoW:
        if entry[0] == d:
            entry[1] += 1
            return BoW
    BoW.append([d, 1])
    return BoW


def fhash(list_of_word, M):
    """Return the feature hash of every word: sum(ord(c) * 37**i) % int(M)."""
    wordhash_list = []
    for word in list_of_word:
        total = 0
        for i, ch in enumerate(word):
            total += ord(ch) * (37 ** i)
        wordhash_list.append(total % int(M))
    return wordhash_list


if __name__ == '__main__':
    file_name = input('File name = ')
    ft = input('Use feature hashing ? (y,Y,n,N) ')
    while ft not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        ft = input('Use feature hashing ? (y,Y,n,N) ')
    uh = ft in ['y', 'Y']
    if uh:
        M = input('M = ')
    print('-------------------')

    with open('stopwords.txt', 'r') as stopwords_file:
        stopwords_list = stopwords_file.read().split()

    line_count = 0
    char_count = 0
    alpha_count = 0
    word_count = 0
    all_words_list = []
    # One pass over the file gathers every statistic.
    with open(file_name, 'r') as file:
        for line in file:
            line_count += 1
            # Only the newline is excluded; leading/trailing spaces count.
            char_count += len(line) - line.count('\n')
            alpha_count += sum(1 for i in line if i.isalnum())
            split_words = find_replace(line.lower()).split()
            word_count += len(split_words)
            all_words_list += split_words

    print('char count =', char_count)
    print('alphanumeric count =', alpha_count)
    print('line count =', line_count)
    print('word count =', word_count)

    all_words_withoutstopwords_list = [
        i for i in all_words_list if i not in stopwords_list]

    if uh:
        BoWhash = []
        for i in sorted(fhash(all_words_withoutstopwords_list, M)):
            BoWhash = add(BoWhash, i)
        print('BoW =', BoWhash)
    else:
        BoW = []
        for i in all_words_withoutstopwords_list:
            BoW = add(BoW, i)
        print('BoW =', sorted(BoW))
# 6330491921 (25.20) 313 (2021-03-22 22:40) #------------------------------------ file_name = input("File name = ") while True: feature_hashing = input("Use feature hashing ? (y,Y,n,N) ") if feature_hashing.lower() not in ['y', 'n']: print("Try again.") else: break if feature_hashing.lower() == 'y': while True: try: m = int(input("M = ")) dash = '-'*19 print(dash) break except: print("Try again.") continue elif feature_hashing.lower() == 'n': while True: try: dash = '-'*19 print(dash) break except: print("Try again.") continue stopwords_file = open("stopwords.txt","r") input_file = open(file_name,"r") data = input_file.read().lower() result_char = len(data)-data.count('\n') print("char count = "+str(result_char)) alphabet = 0 for char in data: if char.isalpha() or char.isdigit(): alphabet += 1 result_alpha = alphabet print("alphanumeric count = "+str(result_alpha)) d = data.split("\n") count = len(d) for i in range(len(d)): if i == len(d)-1: if not d[i]: count -=1 result_count = count print("line count = "+str(result_count)) word = data.replace("\n", " ").replace('"','').replace(".","").replace(",","").split(" ") _word = [] for w in word: if w.isalpha() or w.isdigit(): _word.append(w) result_word = len(_word) print("word count = "+str(result_word)) stopwords = [] with open('stopwords.txt','r') as file: for line in file: for word in line.split(): stopwords.append(word) new_input = [] for i in _word: if i not in stopwords: new_input.append(i) new_input.sort() def fhash(w,M): G = 37 index = 0 num = 0 for c in w: num = num + (ord(c)*(G**index)) index += 1 return num%M if feature_hashing.lower() == 'y': index = 0 for w in new_input: num = fhash(w,m) new_input[index] = num index += 1 result = [] new_input.sort() for i in new_input: result.append([i,new_input.count(i)]) res = [] [res.append(x) for x in result if x not in res] result_res_1 = res print("BoW = "+str(result_res_1)) else: result = [] for i in new_input: result.append([i,new_input.count(i)]) res = [] 
[res.append(x) for x in result if x not in res] result_res_1 = res print("BoW = "+str(result_res_1)) #------------------------------
# 6330492521 (21.40) 314 (2021-03-22 19:56)
def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % M."""
    s = 0
    G = 37
    for i, ch in enumerate(w):
        s = s + ord(ch) * (G ** i)
    return s % M


def clean_word(ltext):
    """Print the four counters for the lines in *ltext* and return the words.

    Args:
        ltext: the lines of the file, without their line terminators.

    Returns:
        The lower-cased alphanumeric words in reading order.
    """
    alc = 0
    char_total = 0
    cleaned_lines = []
    for line in ltext:
        char_total += len(line)
        cleanw = ''
        for ch in line:
            if ch.isalnum():
                cleanw = cleanw + ch.lower()
                alc = alc + 1
            else:
                cleanw = cleanw + ' '
        cleaned_lines.append(cleanw)
    # Join with a space so the last word of one line does not merge
    # with the first word of the next.
    list_text = ' '.join(cleaned_lines).split()
    print('char count =', char_total)
    print('alphanumeric count =', alc)
    print('line count =', len(ltext))
    print('word count =', len(list_text))
    return list_text


def stop_words(text, stopline):
    """Return the words of *text* that do not appear in the stop-word lines."""
    stoplist = ' '.join(stopline).split()
    return [s for s in text if s not in stoplist]


def text_same(words):
    """Return the sorted [item, count] pairs of *words* without modifying it."""
    ordered = sorted(words)
    bow = []
    ch_list = []
    for item in ordered:
        if item not in ch_list:
            bow.append([item, ordered.count(item)])
            ch_list.append(item)
    return bow


if __name__ == '__main__':
    file_name = input('File name = ')
    fx = input('Use feature hashing ? (y,Y,n,N) ')
    while fx not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        fx = input('Use feature hashing ? (y,Y,n,N) ')
    if fx == 'Y' or fx == 'y':
        M = int(input('M = '))

    with open(file_name, 'r') as file:
        # Remove only the newline so spaces still count as characters.
        lines = [line.rstrip('\n') for line in file]
    with open('stopwords.txt', 'r') as file2:
        lines2 = [line.strip() for line in file2]

    print('-------------------')
    all_text = clean_word(lines)
    words = stop_words(all_text, lines2)
    if fx == 'Y' or fx == 'y':
        bow = text_same([fhash(w, M) for w in words])
    else:
        bow = text_same(words)
    print('BoW =', bow)
# 6330494821 (26.00) 315 (2021-03-21 15:28) def bow(word1,word123): x=0 for i in range(len(word123)): if word1==word123[i]: x+=1 return x def fhash(word,m): x=0 for i in range(len(word)): x+=ord(word[i])*37**i y=x%m return y x=input('File name = ') file_name=open(x,'r') y=input('Use feature hashing ? (y,Y,n,N) ') while not(y=='y'or y=='Y'or y=='n'or y=='N'): print('Try again.') y=input('Use feature hashing ? (y,Y,n,N) ') if y=='y'or y=='Y': M=int(input('M = ')) print('-------------------') stopwords=open('stopwords.txt','r') stopword='' for line in stopwords: for c in line: stopword+=c stopword=stopword.split() charcount = 0 alphanumericcount = 0 linecount = 0 word='' BoW=[] for line in file_name: charcount+=len(line)-1 for c in line: if c.isalnum()==True: alphanumericcount+=1 word+=c else: word+=' ' linecount+=1 print('char count =',charcount) print('alphanumeric count =',alphanumericcount) print('line count =',linecount) word=word.lower().split() wordcount = len(word) print('word count =',wordcount) if y=='y'or y=='Y': a=[] for i in range(wordcount): if word[i] not in stopword: b=fhash(word[i],M) a.append(b) a.sort() c=1 for i in range(1,len(a)): if a[i]==a[i-1]: c+=1 else: BoW.append([a[i-1],c]) c=1 if len(a)>=1: BoW.append([a[-1],c]) else: for i in range(wordcount): if [word[i],bow(word[i],word)] not in BoW and word[i] not in stopword: BoW.append([word[i],bow(word[i],word)]) print('BoW = ',BoW) stopwords.close() file_name.close()
# 6330495421 (0.00) 316 (2021-03-21 18:54)
# Stop words; filled from stopwords.txt when the file is run as a script.
stopwords = []


def fhash_(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % int(M)."""
    c = 0
    for i, ch in enumerate(w):
        c += ord(ch) * 37 ** i
    return c % int(M)


def words_(sentence):
    """Return *sentence* with the listed punctuation characters removed.

    NOTE(review): the original built this string but never returned it;
    returning it is assumed to be the intent.
    """
    s = ""
    for c in sentence:
        if c not in "\"\'/\\().,;:":
            s += c
    return s


def _clean(sentence):
    """Lower-case *sentence* and blank out every non-alphanumeric character."""
    return "".join(c if c.isalnum() else " " for c in sentence.lower())


def clear_stopwords(sentence):
    """Return the lower-cased words of *sentence* that are not stop words,
    joined by single spaces."""
    return " ".join(e for e in _clean(sentence).split() if e not in stopwords)


def word_count(sentence):
    """Return the number of alphanumeric words in *sentence*."""
    return len(_clean(sentence).split())


def char_count(sentence):
    """Return the number of characters in *sentence*, newlines excluded."""
    return len(sentence) - sentence.count("\n")


def alphanumeric_count(sentence):
    """Return the number of letters and digits in *sentence*."""
    return sum(1 for c in sentence if c.isalnum())


if __name__ == "__main__":
    with open("stopwords.txt", "r") as fn:
        stopwords = fn.read().split()

    a = input("File name = ")
    b = input("Use feature hashing ? (y,Y,n,N) ")
    # Exact membership: a substring test would accept '' or ','.
    while b not in ["y", "Y", "n", "N"]:
        print("Try again.")
        b = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = b in ["y", "Y"]
    if use_hash:
        M = input("M = ")

    charcount = 0
    alphanumericcount = 0
    linecount = 0
    wordcount = 0
    bow = []
    # Read the file the user named, not a fixed 'sample.txt'.
    with open(a, "r") as file:
        for line in file:
            charcount += char_count(line)
            wordcount += word_count(line)
            alphanumericcount += alphanumeric_count(line)
            linecount += 1
            for e in clear_stopwords(line).split():
                bow.append(fhash_(e, M) if use_hash else e)

    print("-" * 19)
    print("char count = " + str(charcount))
    print("alphanumeric count = " + str(alphanumericcount))
    print("line count = " + str(linecount))
    print("word count = " + str(wordcount))

    totals = {}
    for item in bow:
        totals[item] = totals.get(item, 0) + 1
    bowfinal = sorted([item, n] for item, n in totals.items())
    print("BoW =", bowfinal)
# 6330496021 (20.06) 317 (2021-03-22 23:07) x = input('File name = ') file=open(x,'r') w = '' lines = 0 lens = 0 for line in file: lines = lines+1 lens = lens+(len(line)-1) w = w+line.lower() file.close() file=open('stopwords.txt','r') j = '' for line in file: j = j+line.lower() file.close() def fhash(w,m): a = 0 for i in range(len(w)): a = a+(ord(w[i])*(37**i)) a = a%int(m) return a def word_(x): a = '' for w in x: if w.isalnum(): a = a+w else: continue return a def stop_(x,stop): a = [] for e in x: if e in stop: continue else: a = a+[e] return a def howmany_(x): a = [] for i in x: word = i n = 0 for j in x: if i == j: n += 1 if [word,n] not in a: a.append([word,n]) return a list_word = w.split() list_stopword = j.split() word = [] for t in list_word: word = word+[word_(t)] char_count = lens alphanumeric_count = 0 for h in word: alphanumeric_count = alphanumeric_count+len(h) BoWn = howmany_(stop_(word,list_stopword)) while True: u = input('Use feature hashing ? (y,Y,n,N) ') u = u.lower() if u in ['y']: m = input('M = ') BoWy = howmany_([fhash(l,m) for l in stop_(word,list_stopword)]) print("-------------------") print('char count = ',char_count) print('alphanumeric count = ',alphanumeric_count) print('line count = ',lines) print('word count = ',len(word)) print('BoW = ',BoWy) break if u in ['n']: print("------------------") print("char count = ",char_count) print("alphanumeric count = ",alphanumeric_count) print("line count = ",lines) print("word count = ",len(word)) print("BoW = ",BoWn) break else: print('Try again.')
# 6330497721 (28.40) 318 (2021-03-18 22:44)
def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name, 'r') as open_file:
        return sum(len(line) - line.count('\n') for line in open_file)


def alphanu_count(file_name):
    """Return the number of alphanumeric characters in the file."""
    return sum(1 for ch in adjust_text(read_text(file_name)) if ch.isalnum())


def line_count_and_word_count(file_name):
    """Return (number of lines, number of alphanumeric words) of the file.

    Words are counted after the same cleaning that feeds the bag of
    words, so "best-of" counts as two words.
    """
    with open(file_name, 'r') as open_file:
        lines = [line.strip() for line in open_file]
    line_count = len(lines)
    word_cound = len(adjust_text(' '.join(lines)).split())
    return line_count, word_cound


def read_text(file_name):  # return string
    """Return the stripped lines of the file joined by single spaces."""
    with open(file_name, 'r') as open_file:
        return ' '.join(line.strip() for line in open_file)


def adjust_text(text):  # return string
    """Lower-case *text* and replace non-alphanumeric characters by spaces.

    Whitespace-separated tokens are cleaned one by one and joined with
    single spaces.
    """
    new_text = []
    for token in text.lower().split():
        new_text.append(''.join(j if j.isalnum() else ' ' for j in token))
    return ' '.join(new_text)


def list_without_stopwords(adjust_text):  # return list
    """Return the words of the cleaned text that are not in stopwords.txt."""
    stopwords = read_text('stopwords.txt').split()
    return [i for i in adjust_text.split() if i not in stopwords]


def BoW(list_without_stopwords):  # return list
    """Return the sorted [item, count] pairs of the given list."""
    counts = {}
    for item in list_without_stopwords:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def fhash(word, M):  # return int
    """Return the feature hash of *word*: sum(ord(word[i]) * 37**i) % M."""
    sum_f = 0
    for i, ch in enumerate(word):
        sum_f += ord(ch) * (37 ** i)
    return sum_f % M


def un_BoW_fhash(list_without_stopwords, M):
    """Return the feature hash of every word, in order (not yet counted)."""
    hashes = []
    for word in list_without_stopwords:
        new_text = ''.join(j for j in word if j.isalnum())
        hashes.append(fhash(new_text, M))
    return hashes


def print_info(file_name):
    """Print the separator and the four counters of the file."""
    line_count, word_count = line_count_and_word_count(file_name)
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alphanu_count(file_name))
    print('line count =', line_count)
    print('word count =', word_count)


if __name__ == '__main__':
    file_name = input('File name = ')
    use_feature_hash = input('Use feature hashing ? (y,Y,n,N) ')
    while use_feature_hash not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        use_feature_hash = input('Use feature hashing ? (y,Y,n,N) ')
    if use_feature_hash in ['y', 'Y']:
        M = int(input('M = '))
        print_info(file_name)
        words = list_without_stopwords(adjust_text(read_text(file_name)))
        print('BoW =', BoW(un_BoW_fhash(words, M)))
    else:
        print_info(file_name)
        words = list_without_stopwords(adjust_text(read_text(file_name)))
        # Use a separate name so the BoW function is not overwritten.
        print('BoW =', BoW(words))
# 6330498321 (30.00) 319 (2021-03-22 23:21)
def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % int(M)."""
    top = 0
    for e, ch in enumerate(w):
        top += ord(ch) * (37 ** e)
    return top % int(M)


def counts(key, data):
    """Return how many items of *data* are equal to *key*."""
    c = 0
    for i in data:
        if i == key:
            c += 1
    return c


if __name__ == '__main__':
    with open('stopwords.txt', 'r') as inf:
        stop = inf.read().split()

    file_name = input('File name = ')
    while True:
        choice = input('Use feature hashing ? (y,Y,n,N) ')
        if choice in ('n', 'N', 'y', 'Y'):
            break
        print('Try again.')
    use_hash = choice in ('y', 'Y')
    if use_hash:
        M = int(input('M = '))

    counter = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890'
    char_count = 0
    alnu_count = 0
    lines = 0
    word_counter = ''
    with open(file_name, 'r') as infile:
        for line in infile:
            lines += 1
            for j in line:
                if j in counter:
                    alnu_count += 1
                    word_counter += j.lower()
                else:
                    word_counter += ' '
                if j != '\n':
                    char_count += 1

    list_of_word2 = word_counter.split()
    list_of_word = [i for i in list_of_word2 if i not in stop]
    words = len(list_of_word2)

    if use_hash:
        keys = [fhash(i, M) for i in list_of_word]
    else:
        keys = list_of_word
    uniq = []
    for k in keys:
        if k not in uniq:
            uniq.append(k)
    BoW = [[i, counts(i, keys)] for i in uniq]
    BoW.sort()  # the bag of words is reported in sorted order

    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alnu_count)
    print('line count =', lines)
    print('word count =', words)
    print('BoW =', BoW)
# 6330499021 (30.00) 320 (2021-03-22 19:18)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % int(M)


def file_len(fname):
    """Return the number of lines in fname (0 for an empty file)."""
    total = 0  # stays 0 when the loop body never runs
    with open(fname) as f:
        for total, _ in enumerate(f, 1):
            pass
    return total


def split_words(text):
    """Lowercase text, turn non-alphanumeric characters into spaces, split."""
    lowered = text.lower()
    return ''.join(ch if ch.isalnum() else ' ' for ch in lowered).split()


def bag_of_words(items):
    """Return sorted [[item, count], ...] with one entry per distinct item."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, seen] for item, seen in tally.items())


def main():
    """Prompt for the file and options, then print the report."""
    note = input('File name = ')
    with open(note, 'r') as file:
        data = file.read()
    with open('stopwords.txt', 'r') as stop_words:
        stop_words_data = set(stop_words.read().split())

    while True:
        f_hashing = input('Use feature hashing ? (y,Y,n,N) ')
        if f_hashing == 'y' or f_hashing == 'Y':
            M = int(input('M = '))
            m_check = True
            break
        elif f_hashing == 'n' or f_hashing == 'N':
            m_check = False
            break
        else:
            print('Try again.')
    print('-------------------')

    words_lower = split_words(data)
    word_real = [word for word in words_lower if word not in stop_words_data]
    if m_check:
        BoW = bag_of_words(fhash(word, M) for word in word_real)
    else:
        # One entry per distinct word (previously one per occurrence).
        BoW = bag_of_words(word_real)

    number_of_characters = len(data) - data.count('\n')
    alphanumeric_count = sum(1 for char in data if char.isalnum())

    print('char count = {}'.format(number_of_characters))
    print('alphanumeric count = {}'.format(alphanumeric_count))
    print('line count = {}'.format(file_len(note)))
    print('word count = {}'.format(len(words_lower)))
    print('BoW = {}'.format(BoW))


if __name__ == '__main__':
    main()
# 6330500921 (24.40) 321 (2021-03-22 00:14)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def load_stopwords(path='stopwords.txt'):
    """Return the alphanumeric tokens found in the stop-word file."""
    with open(path, 'r') as read:
        text = read.read()
    return ''.join(ch if ch.isalnum() else ' ' for ch in text).split()


def analyse(lines):
    """Return (char_count, alpha_count, line_count, words) for a list of lines.

    Only the line terminator is excluded from the character count, and each
    line is tokenised on its own so words never merge across a line break.
    """
    char_count = 0
    alpha_count = 0
    words = []
    for line in lines:
        body = line.rstrip('\n')
        char_count += len(body)
        alpha_count += sum(1 for ch in body if ch.isalnum())
        cleaned = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in body)
        words += cleaned.split()
    return char_count, alpha_count, len(lines), words


def count_sorted(items):
    """Return sorted [[item, occurrences], ...], one entry per distinct item."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, seen] for item, seen in tally.items())


def main():
    """Prompt for the file and options, then print the report."""
    file_name = input('File name = ')
    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a.lower() not in ('y', 'n'):
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = a.lower() == 'y'
    if hashing:
        M = int(input('M = '))

    with open(file_name, 'r') as file:
        f = file.readlines()
    stop_words = set(load_stopwords())

    char_count, alpha_count, line_count, words = analyse(f)
    kept = [word for word in words if word not in stop_words]
    if hashing:
        kept = [fhash(word, M) for word in kept]
    Bow_y = count_sorted(kept)

    print('-------------------')
    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alpha_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(len(words)))
    print('BoW = ' + str(Bow_y))


if __name__ == '__main__':
    main()
# 6330501521 (30.00) 322 (2021-03-21 18:10)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def read(file_name):
    """Return the lowercase words of file_name.

    Every character outside a-z, 0-9 and space is treated as a separator.
    """
    keep = 'abcdefghijklmnopqrstuvwxyz0123456789 '
    with open(file_name, 'r') as file:
        text = file.read().lower()
    return ''.join(ch if ch in keep else ' ' for ch in text).split()


def bow(ming, unique):
    """Return sorted [[item, occurrences of item in ming]] for item in unique."""
    tally = {}
    for item in ming:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, tally.get(item, 0)] for item in unique)


def main():
    """Prompt for the file and options, then print the report."""
    file_name = input('File name = ')
    use = input('Use feature hashing ? (y,Y,n,N) ')
    while use not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        use = input('Use feature hashing ? (y,Y,n,N) ')

    ming = read(file_name)
    with open(file_name, 'r') as file:
        lines = file.readlines()
    linecount = len(lines)
    # Count every character except the line terminators; this no longer
    # assumes the last line is unterminated.
    charcount = sum(len(line) - line.count('\n') for line in lines)
    al = sum(len(word) for word in ming)

    # Load the stop words once (and, as before, only when there are words).
    stop = set(read('stopwords.txt')) if ming else set()
    yum = [word for word in ming if word not in stop]
    unique = list(dict.fromkeys(yum))

    if use in ['Y', 'y']:
        M = int(input('M = '))
        f = [fhash(word, M) for word in yum]
        vee = bow(f, list(dict.fromkeys(f)))
    else:
        vee = bow(ming, unique)

    print('-------------------')
    print('char count = ' + str(charcount))
    print('alphanumeric count = ' + str(al))
    print('line count = ' + str(linecount))
    print('word count = ' + str(len(ming)))
    print('BoW =', vee)
    print(' ')


if __name__ == '__main__':
    main()
# 6330502121 (30.00) 323 (2021-03-22 00:16)
# Bag-of-words report for a text file, with optional feature hashing.
alp = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']


def clean_words(text):
    """Lowercase text, turn characters outside alp/num into spaces, split."""
    lowered = text.lower()
    return ''.join(ch if ch in alp or ch in num else ' ' for ch in lowered).split()


def spt(path=None):
    """Return the words of the input file with stop words removed.

    path defaults to the module-level file name c set by main().
    """
    if path is None:
        path = c
    with open(path, 'r') as source:
        words = clean_words(source.read())
    with open('stopwords.txt', 'r') as stop:
        banned = set(stop.read().split())
    # Filtering cannot fail when a stop word is listed on more than one line.
    return [word for word in words if word not in banned]


def bow(word):
    """Return [[w, occurrences], ...] for the list word, first-seen order."""
    tally = {}
    for item in word:
        tally[item] = tally.get(item, 0) + 1
    return [[item, seen] for item, seen in tally.items()]


def hash_counts(words, m):
    """Return [[hash, occurrences], ...] ordered by hash value.

    The hash of a word is sum(ord(ch) * 37**i) modulo m.
    """
    tally = {}
    for item in words:
        code = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(item)) % m
        tally[code] = tally.get(code, 0) + 1
    # Sorting the hashes that occur avoids a loop over range(m).
    return [[code, tally[code]] for code in sorted(tally)]


def fh(word):
    """Prompt for M and return the hashed bag-of-words of the list word."""
    m = int(input('M = '))
    return hash_counts(word, m)


def end(bow, path=None):
    """Print the statistics of the input file followed by the given BoW.

    path defaults to the module-level file name c set by main().
    """
    if path is None:
        path = c
    with open(path, 'r') as source:
        lines = source.readlines()
    lowered = ''.join(lines).lower()
    # Newlines are excluded directly, so a trailing newline is not miscounted.
    chars = sum(len(line) - line.count('\n') for line in lines)
    alnums = sum(1 for ch in lowered if ch in alp or ch in num)
    print('-------------------')
    print('char count = ', chars)
    print('alphanumeric count = ', alnums)
    print('line count = ', len(lines))
    print('word count = ', len(clean_words(lowered)))
    print('BoW = ', bow)


def main():
    """Prompt for the file and options, then print the report."""
    global c, a
    c = input('File name = ')
    while True:
        b = input('Use feature hashing ? (y,Y,n,N) ')
        if b == 'y' or b == 'Y':
            a = spt()
            result = fh(a)
            break
        elif b == 'n' or b == 'N':
            a = spt()
            result = bow(a)
            break
        else:
            print('Try again.')
    end(result)


if __name__ == '__main__':
    main()
# 6330503821 (22.99) 324 (2021-03-22 14:59)
# Bag-of-words report for a text file, with optional feature hashing.


def feature_hashing(word, m):
    """Return sum(ord(word[i]) * 37**i) modulo m."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(word)) % m


def is_alphanumeric(character):
    """Return True for an ASCII letter or digit."""
    return 'a' <= character <= 'z' or 'A' <= character <= 'Z' or '0' <= character <= '9'


def split_line(line):
    """Return the maximal alphanumeric runs of line, case preserved.

    A run that reaches the end of the line is kept, so the last word of an
    unterminated final line is not lost.
    """
    return ''.join(ch if is_alphanumeric(ch) else ' ' for ch in line).split()


def tally(items):
    """Return sorted [[item, occurrences], ...], one entry per distinct item."""
    seen = {}
    for item in items:
        seen[item] = seen.get(item, 0) + 1
    return sorted([item, total] for item, total in seen.items())


def main():
    """Prompt for the file and options, then print the report."""
    word_file_name = input("File name = ")
    with open(word_file_name, "r") as word_file:
        lines = word_file.readlines()
    with open("stopwords.txt", "r") as stopwords_file:
        stopword_lst = set(stopwords_file.read().split())

    is_feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()
    while is_feature_hashing not in ['Y', 'y', 'N', 'n']:
        print("Try again.")
        is_feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()
    if is_feature_hashing == 'y':
        m = int(input("M = "))

    char_count = 0
    alphanum_count = 0
    word_count = 0
    word_lst = []
    for line in lines:
        # Exclude only actual newline characters from the character count.
        char_count += len(line) - line.count('\n')
        alphanum_count += sum(1 for ch in line if is_alphanumeric(ch))
        for word in split_line(line):
            word_count += 1
            if word.lower() not in stopword_lst:
                word_lst.append(word.lower())

    if is_feature_hashing == 'n':
        bow = tally(word_lst)
    else:
        bow = tally(feature_hashing(word, m) for word in word_lst)

    print("-------------------")
    print(f"char count = {char_count}")
    print(f"alphanumeric count = {alphanum_count}")
    print(f"line count = {len(lines)}")
    print(f"word count = {word_count}")
    print(f"BoW = {bow}")


if __name__ == "__main__":
    main()
# 6330504421 (30.00) 325 (2021-03-21 22:18)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def is_kept(ch):
    """Return True for a lowercase ASCII letter, a digit or a space."""
    return 'a' <= ch <= 'z' or '0' <= ch <= '9' or ch == ' '


def load_stopwords(path='stopwords.txt'):
    """Return the stop words; characters rejected by is_kept are dropped."""
    with open(path, 'r') as stopwords_file:
        kept = [''.join(ch for ch in line if is_kept(ch)) for line in stopwords_file]
    return ' '.join(kept).split()


def scan_line(line):
    """Return (char_count, alnum_count, cleaned_text) for one line.

    char_count is measured on the line as read (newlines excluded), so
    lowercasing cannot change it; cleaned_text is the lowercased line with
    every character rejected by is_kept replaced by a space.
    """
    lowered = line.lower()
    chars = len(line) - line.count('\n')
    alnums = sum(1 for ch in lowered if is_kept(ch) and ch != ' ')
    cleaned = ''.join(ch if is_kept(ch) else ' ' for ch in lowered)
    return chars, alnums, cleaned


def count_sorted(items):
    """Return [[item, occurrences], ...] ordered by item."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, tally[item]] for item in sorted(tally)]


def main():
    """Prompt for the file and options, then print the report."""
    file_name = input('File name = ')
    feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')
    while feature_hashing not in ['y', 'Y', 'N', 'n']:
        print('Try again.')
        feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = feature_hashing == 'y' or feature_hashing == 'Y'
    if hashing:
        M = int(input('M = '))

    list_stopwords = set(load_stopwords())

    count1 = 0  # char count
    count2 = 0  # alphanumeric count
    count3 = 0  # line count
    cleaned_lines = []
    with open(file_name, 'r') as my_file:
        print('-------------------')
        for line in my_file:
            chars, alnums, cleaned = scan_line(line)
            count1 += chars
            count2 += alnums
            count3 += 1
            cleaned_lines.append(cleaned)
    print('char count =', count1)
    print('alphanumeric count =', count2)
    print('line count =', count3)

    list_alphanumeric = ' '.join(cleaned_lines).split()
    print('word count =', len(list_alphanumeric))

    words_list = [word for word in list_alphanumeric if word not in list_stopwords]
    if hashing:
        bow = count_sorted(fhash(word, M) for word in words_list)
    else:
        bow = count_sorted(words_list)
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330505021 (17.75) 326 (2021-03-21 15:16)
# prog-08: Bag-of-words
#
# 6330505021 (17.75) Sarun Punsuvon

ALNUM = "1234567890abcdefghijklmnopqrstuvwxyz"
DEFAULT_STOP = ['it', 'they', 'the', 'a', 'an', 'of', 'on', 'in', 'at',
                'is', 'am', 'are', 'was', 'were']


def remove_new_tab(s):
    """Return s without newline characters."""
    return s.replace("\n", "")


def keep_char_int(s):
    """Return only the ASCII letters and digits of s."""
    return "".join(ch for ch in s if ch.lower() in ALNUM)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M, as an int."""
    return int(sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M)


def char_count(s):
    """Return the number of characters of s, newlines excluded."""
    return len(remove_new_tab(s))


def alphan_count(s):
    """Return the number of ASCII letters and digits in s."""
    return len(keep_char_int(s))


def word_list(s):
    """Return the lowercase words of s.

    Every character that is not an ASCII letter or digit separates words, so
    punctuation no longer glues neighbouring words together.
    """
    return "".join(ch.lower() if ch.lower() in ALNUM else " " for ch in s).split()


def word_count(s):
    """Return the number of words in s."""
    return len(word_list(s))


def bow(s, n, m=0, stop=None):
    """Return the sorted bag-of-words of s with stop words removed.

    n == 0 counts words; n == 1 counts fhash(word, m) for every occurrence;
    any other n returns None. stop defaults to the built-in stop list.
    """
    if n != 0 and n != 1:
        return None
    banned = DEFAULT_STOP if stop is None else stop
    items = [word for word in word_list(s) if word not in banned]
    if n == 1:
        items = [fhash(word, m) for word in items]
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, seen] for item, seen in tally.items())


def all_text(file):
    """Return (text, line_count) for an iterable of lines."""
    parts = list(file)
    return "".join(parts), len(parts)


def load_stopwords(path="stopwords.txt"):
    """Return the stop words from path, or the built-in list if it is missing."""
    try:
        with open(path, "r") as stop_file:
            return stop_file.read().lower().split()
    except FileNotFoundError:
        return list(DEFAULT_STOP)


def displayed(file, m, n, stop=None):
    """Print the statistics and the bag-of-words of the open file."""
    s, line = all_text(file)
    print("-" * 19)
    print("char count =", char_count(s))
    print("alphanumeric count =", alphan_count(s))
    print("line count =", line)
    print("word count =", word_count(s))
    print("BoW = ", bow(s, n, m, stop))


def main():
    """Prompt for the file and options, then print the report."""
    file_name = input("File name = ")
    while True:
        user_input = input("Use feature hashing ? (y,Y,n,N) ")
        if user_input.lower() in ("y", "n"):
            break
        print("Try again.")
    hashing = user_input.lower() == "y"
    user_m = int(input("M = ")) if hashing else 0
    stop = load_stopwords()
    # Open the file once, and close it when done.
    with open(file_name, "r") as file:
        displayed(file, user_m, 1 if hashing else 0, stop)


if __name__ == "__main__":
    main()
# 6330507321 (21.40) 327 (2021-03-18 21:59)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % int(M)


def is_alnum(ch):
    """Return True when ch is a letter (case-insensitive a-z) or a digit."""
    return ('a' <= ch.lower() and ch.lower() <= 'z') or ('0' <= ch <= '9')


def scan(lines):
    """Return (char_count, words) for an iterable of lines.

    Only the newline is excluded from the character count, and each line is
    split on its own so words on consecutive lines never merge.
    """
    chc = 0
    words = []
    for line in lines:
        body = line.rstrip('\n')
        chc += len(body)
        cleaned = ''.join(ch if is_alnum(ch) else ' ' for ch in body)
        words += cleaned.lower().split()
    return chc, words


def run_lengths(items):
    """Return sorted [[item, occurrences], ...] without a sentinel value."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, seen] for item, seen in tally.items())


def main():
    """Prompt for the file and options, then print the report."""
    x = input('File name = ')
    y = input('Use feature hashing ? (y,Y,n,N) ')
    while y not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        y = input('Use feature hashing ? (y,Y,n,N) ')
    if y.lower() == 'y':
        M = int(input('M = '))

    with open(x, 'r') as File:
        lines = File.readlines()
    chc, a = scan(lines)
    with open('stopwords.txt', 'r') as stop:
        b = set(stop.read().split())

    alm = sum(1 for word in a for ch in word if is_alnum(ch))
    B = [word for word in a if word not in b]
    if y in ['N', 'n']:
        j = run_lengths(B)
    else:
        j = run_lengths(fhash(word, M) for word in B)

    print('-------------------')
    print('char count =', chc)
    print('alphanumeric count =', alm)
    print('line count =', len(lines))
    print('word count =', len(a))
    print('BoW =', j)


if __name__ == '__main__':
    main()
# 6330508021 (19.40) 328 (2021-03-21 23:12)
# Bag-of-words report for a text file, with optional feature hashing.


def word_count(word, list):
    """Return how many elements of list are equal to word."""
    return sum(1 for item in list if item == word)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def BoW(w_list, stopwords):
    """Return sorted [[word, occurrences], ...] for the non-stop words."""
    tally = {}
    for word in w_list:
        if word not in stopwords:
            tally[word] = tally.get(word, 0) + 1
    return sorted([word, seen] for word, seen in tally.items())


def BoW2(w_list, stopwords, M):
    """Return sorted [[hash, occurrences], ...] for the non-stop words."""
    tally = {}
    for word in w_list:
        if word not in stopwords:
            code = fhash(word, M)
            tally[code] = tally.get(code, 0) + 1
    return sorted([code, seen] for code, seen in tally.items())


def split_words(text):
    """Lowercase text, turn non-alphanumeric characters into spaces, split.

    Splitting on arbitrary whitespace never produces empty words.
    """
    return ''.join(ch.lower() if ch.lower().isalnum() else ' ' for ch in text).split()


def main():
    """Prompt for the file and options, then print the report."""
    file_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ['y', 'n', 'N', 'Y']:
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    if fh in ['y', 'Y']:
        M = int(input('M = '))
    print('-------------------')

    with open('stopwords.txt', 'r') as file:
        stopwords = file.read().lower().split()

    with open(file_name, 'r') as file:
        # Drop only the line terminator; surrounding spaces are characters.
        lines = [line.rstrip('\n') for line in file]

    print('char count =', sum(len(line) for line in lines))
    print('alphanumeric count =', sum(1 for line in lines for ch in line if ch.isalnum()))
    print('line count =', len(lines))

    # Tokenise line by line so words on adjacent lines stay separate.
    w_list = [word for line in lines for word in split_words(line)]
    print('word count =', len(w_list))

    if fh in ['y', 'Y']:
        print('BoW =', str(BoW2(w_list, stopwords, M)))
    elif fh in ['n', 'N']:
        print('BoW =', str(BoW(w_list, stopwords)))


if __name__ == '__main__':
    main()
# 6330509621 (11.08) 329 (2021-03-22 17:42)
# Bag-of-words report for a text file, with optional feature hashing.

d = []    # non-stop words of the input file, filled in by main()
M = None  # hash modulus, set by main() when hashing is requested


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def split_words(text):
    """Lowercase text, turn non-alphanumeric characters into spaces, split."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()


def word_frequency(words=None):
    """Return sorted [[word, occurrences], ...].

    words defaults to the module-level list d.
    """
    if words is None:
        words = d
    tally = {}
    for word in words:
        tally[word] = tally.get(word, 0) + 1
    return sorted([word, seen] for word, seen in tally.items())


def BoW(words=None, m=None):
    """Return [[hash, occurrences], ...] ordered by hash value.

    words and m default to the module-level d and M.
    """
    if words is None:
        words = d
    if m is None:
        m = M
    tally = {}
    for word in words:
        code = fhash(word, m)
        tally[code] = tally.get(code, 0) + 1
    # Sorting the hashes that occur avoids scanning range(M).
    return [[code, tally[code]] for code in sorted(tally)]


def main():
    """Prompt for the file and options, then print the report."""
    global d, M
    with open('stopwords.txt', 'r') as file:
        la = set(file.read().split())

    file_name = input("File name = ")
    with open(file_name, 'r') as file_:
        lines = file_.readlines()
    text = ''.join(lines)

    lc = split_words(text)
    # Count directly instead of slicing off the last character of each line.
    cc = len(text) - text.count('\n')
    ac = sum(len(word) for word in lc)
    d = [word for word in lc if word not in la]

    hh = input('Use feature hashing ? (y,Y,n,N) ')
    while hh not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        hh = input('Use feature hashing ? (y,Y,n,N) ')
    if hh == 'y' or hh == 'Y':
        M = int(input('M = '))

    print('-------------------')
    print('char count = ' + str(cc))
    print('alphanumeric count = ' + str(ac))
    print('line count = ' + str(len(lines)))
    print('word count = ' + str(len(lc)))
    if hh == 'y' or hh == 'Y':
        print('BoW = ' + str(BoW()))
    else:
        print('BoW = ' + str(word_frequency()))


if __name__ == '__main__':
    main()
# 6330510121 (13.00) 330 (2021-03-22 23:58)
# Bag-of-words report for a text file, with optional feature hashing.

ALPHAMERIC = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
              'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
              '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def stopword(sentence):
    """Return sentence with stop words removed, each kept word space-prefixed.

    Stop words are read from stopwords.txt and compared case-insensitively.
    """
    with open('stopwords.txt', 'r') as stop_file:
        banned = set(stop_file.read().split())
    return ''.join(' ' + word for word in sentence.split() if word.lower() not in banned)


def clean_text(text):
    """Return text lowercased, with non-ALPHAMERIC characters as spaces."""
    return ''.join(ch.lower() if ch.lower() in ALPHAMERIC else ' ' for ch in text)


def count_sorted(items):
    """Return [[item, occurrences], ...] ordered by item."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, tally[item]] for item in sorted(tally)]


def main():
    """Prompt for the file and options, then print the report."""
    Filename = input('File name = ')
    # Read the file once; it was previously opened three times, never closed.
    with open(Filename, "r") as file_name:
        lines = file_name.readlines()

    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = a in ['Y', 'y']
    if hashing:
        M = int(input('M = '))

    text = ''.join(lines)
    words = clean_text(text).split()

    # The separator comes first and every line carries its label.
    print('-------------------')
    print('char count =', len(text.replace('\n', '')))
    print('alphanumeric count =', sum(1 for ch in text if ch.lower() in ALPHAMERIC))
    print('line count =', len(lines))
    print('word count =', len(words))

    kept = stopword(' '.join(words)).split()
    if hashing:
        kept = [fhash(word, M) for word in kept]
    print('BoW =', count_sorted(kept))


if __name__ == '__main__':
    main()
# 6330511821 (25.15) 331 (2021-03-20 00:28)
# Bag-of-words report for a text file, with optional feature hashing.
the_list = []       # stop words, filled in by remove_punctual()
string = ''         # becomes the list of words after word_count() runs
list_of_fhash = []  # hashes of the non-stop words


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def char_count(file):
    """Return the number of characters in the lines of file, newlines excluded."""
    return sum(len(line) - line.count('\n') for line in file)


def alphanumeric_count(file):
    """Return the number of alphanumeric characters in the lines of file."""
    return sum(1 for line in file for ch in line if ch.isalnum())


def line_count(file):
    """Return the number of lines in file."""
    return sum(1 for _ in file)


def word_count(file):
    """Store the lowercase words of file in the global string; return their count."""
    global string
    cleaned = ''.join(ch.lower() if ch.isalnum() else ' '
                      for line in file for ch in line)
    string = cleaned.split()
    return len(string)


def _sorted_counts(items):
    """Return sorted [[item, occurrences], ...]."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, seen] for item, seen in tally.items())


def BoW_not_hashing():
    """Return the sorted word counts of the global string, stop words removed."""
    return _sorted_counts(word.lower() for word in string
                          if word.lower() not in the_list)


def remove_punctual():
    """Append the tokens of stopwords.txt to the global the_list."""
    with open('stopwords.txt') as file:
        the_list.extend(file.read().split())


def BoW_but_hashing():
    """Return the sorted hash counts of the non-stop words (uses global M)."""
    list_of_fhash.extend(fhash(word.lower(), M) for word in string
                         if word.lower() not in the_list)
    return _sorted_counts(list_of_fhash)


def main():
    """Prompt for the file and options, then print the report."""
    global M, file_name
    remove_punctual()
    file_name = input("File name = ")
    asking = input("Use feature hashing ? (y,Y,n,N) ")
    # Exact membership: a substring test would accept '' or 'yY'.
    while asking not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        asking = input("Use feature hashing ? (y,Y,n,N) ")
    if asking in ('y', 'Y'):
        M = int(input("M = "))
    print('-------------------')
    with open(file_name, 'r') as file:
        lines = file.readlines()
    print('char count =', char_count(lines))
    print('alphanumeric count =', alphanumeric_count(lines))
    print('line count =', line_count(lines))
    print('word count =', word_count(lines))
    if asking in ('n', 'N'):
        print('BoW =', BoW_not_hashing())
    else:
        print('BoW =', BoW_but_hashing())


if __name__ == '__main__':
    main()
# 6330512421 (24.90) 332 (2021-03-21 21:54)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % int(M)


def repeat(N, data):
    """Return how many elements of data are equal to N."""
    return sum(1 for item in data if item == N)


def split_words(line):
    """Return the lowercase words of line.

    Letters (case-insensitive a-z) and digits form words; every other
    character separates them, so no empty word is ever produced.
    """
    pieces = []
    for ch in line:
        if "a" <= ch.lower() <= "z":
            pieces.append(ch.lower())
        elif "0" <= ch <= "9":
            pieces.append(ch)
        else:
            pieces.append(" ")
    return "".join(pieces).split()


def count_sorted(items):
    """Return sorted [[item, occurrences], ...]."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, seen] for item, seen in tally.items())


def main():
    """Prompt for the file and options, then print the report."""
    file = input("File name = ")
    with open(file, "r") as file_name:
        list_word = [line.rstrip('\n') for line in file_name]

    use_fhash = input("Use feature hashing ? (y,Y,n,N) ")
    # Exact membership: the comma-substring trick accepted inputs like "y,Y".
    while use_fhash not in ("y", "Y", "n", "N"):
        print("Try again.")
        use_fhash = input("Use feature hashing ? (y,Y,n,N) ")
    hashing = use_fhash in ("y", "Y")
    if hashing:
        M = int(input("M = "))

    with open("stopwords.txt") as stop_word:
        stopword_content = set(stop_word.read().split())

    list_content1 = [word for line in list_word for word in split_words(line)]
    list_content2 = [word for word in list_content1 if word not in stopword_content]

    print("-------------------")
    print("char count = ", sum(len(line) for line in list_word))
    print("alphanumeric count = ", sum(len(word) for word in list_content1))
    print("line count = ", len(list_word))
    print("word count = ", len(list_content1))

    if hashing:
        Bow = count_sorted(fhash(word, M) for word in list_content2)
    else:
        Bow = count_sorted(list_content2)
    print("BoW = ", Bow)


if __name__ == "__main__":
    main()
# 6330513021 (17.85) 333 (2021-03-21 22:33)
# Bag-of-words report for a text file, with optional feature hashing.


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def lookBow(s):
    """Sort s in place and return [[item, occurrences], ...] in sorted order.

    Returns [] for an empty list and handles a single-element list.
    """
    s.sort()
    ans_true = []
    for item in s:
        if ans_true and ans_true[-1][0] == item:
            ans_true[-1][1] += 1
        else:
            ans_true.append([item, 1])
    return ans_true


def split_words(text):
    """Lowercase text, turn non-alphanumeric characters into spaces, split."""
    return ''.join(ch.lower() if ch.isalnum() else ' ' for ch in text).split()


def main():
    """Prompt for the file and options, then print the report."""
    file_name = input('File name = ')
    f_hash = input('Use feature hashing ? (y,Y,n,N) ')
    with open(file_name, 'r') as file:
        lines = file.readlines()
    with open('stopwords.txt', 'r') as stopwords:
        str_stop = set(stopwords.read().lower().split())

    # Exact membership: a substring test would accept '' and run both modes.
    while f_hash not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        f_hash = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = f_hash in ('y', 'Y')
    if hashing:
        M = int(input('M = '))
    print('-------------------')

    text = ''.join(lines)
    words = split_words(text)
    print('char count = ', len(text) - text.count('\n'))
    print('alphanumeric count = ', sum(1 for ch in text if ch.isalnum()))
    print('line count = ', len(lines))
    print('word count = ', len(words))

    no_stopwords = [word for word in words if word not in str_stop]
    if hashing:
        bow = lookBow([fhash(word, M) for word in no_stopwords])
    else:
        bow = lookBow(no_stopwords)
    print('BoW =', bow)


if __name__ == '__main__':
    main()
# 6330514721 (30.00) 334 (2021-03-22 01:41)
# Bag-of-words report for a text file, with optional feature hashing.

alnum = '0123456789abcdefghijklmnopqrstuvwxyz'


def fhash(nostop, m):
    """Return the list of sum(ord(w[i]) * 37**i) % int(m) for w in nostop."""
    modulus = int(m)
    return [sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % modulus
            for w in nostop]


def kumsum(p):
    """Sort p in place and return [[item, occurrences], ...] in sorted order.

    No sentinel is appended, so p holds only its own items afterwards.
    """
    p.sort()
    noks = []
    for item in p:
        if noks and noks[-1][0] == item:
            noks[-1][1] += 1
        else:
            noks.append([item, 1])
    return noks


def scan(lines):
    """Return (char_count, alnum_count, words) for an iterable of lines.

    char_count excludes newlines and is measured before lowercasing.
    """
    charcount = 0
    alnumcount = 0
    pieces = []
    for line in lines:
        charcount += len(line) - line.count('\n')
        for ch in line.lower():
            if ch in alnum:
                alnumcount += 1
                pieces.append(ch)
            else:
                pieces.append(' ')
    return charcount, alnumcount, ''.join(pieces).split()


def main():
    """Prompt for the file and options, then print the report."""
    a = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    with open(a, 'r') as c:
        lines = c.readlines()
    charcount, alnumcount, d = scan(lines)

    while b not in ['n', 'N', 'y', 'Y']:
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')

    with open('stopwords.txt', 'r') as u:
        y = set(u.read().split())
    nostop = [word for word in d if word not in y]

    if b == 'y' or b == 'Y':
        m = input('M = ')
        f = kumsum(fhash(nostop, m))
    else:
        f = kumsum(nostop)

    print('-' * 19)
    print('char count =', charcount)
    print('alphanumeric count =', alnumcount)
    print('line count =', len(lines))
    print('word count =', len(d))
    print('BoW =', f)


if __name__ == '__main__':
    main()
# 6330515321 (7.37) 335 (2021-03-22 23:58)
# Prog-08: Bag-of-words
# 6330515321 (7.37) Siriphon Chitkham


def fHash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo M."""
    return sum(ord(char) * 37 ** p for p, char in enumerate(word)) % M


def sentenceClean(sentence, stopWordList):
    """Return (cleanWordList, wordCount, alphanumericCount) for sentence.

    Words are lowercase runs of letters and digits. A word that ends the
    sentence is included even with no trailing separator. cleanWordList
    omits stop words; the two counts include them.
    """
    cleaned = ''.join(char.lower() if char.isalpha() or char.isdigit() else ' '
                      for char in sentence)
    words = cleaned.split()
    cleanWordList = [word for word in words if word not in stopWordList]
    return cleanWordList, len(words), sum(len(word) for word in words)


def main():
    """Prompt for the file and options, then print the report."""
    fileName = input("File name = ")
    fHashOption = input("Use feature hashing ? (y,Y,n,N) ").lower()
    while fHashOption not in ["y", "n"]:
        print("Try again.")
        fHashOption = input("Use feature hashing ? (y,Y,n,N) ").lower()
    useFHash = fHashOption == "y"
    if useFHash:
        # M is only needed, and only asked for, when hashing.
        M = int(input("M = "))

    with open("stopwords.txt", "r") as stopwordsFile:
        stopWordList = set(stopwordsFile.read().lower().split())

    tally = {}
    charCount = 0
    alphanumericCount = 0
    lineCount = 0
    wordCount = 0
    with open(fileName, "r") as f:
        for line in f:
            lineCount += 1
            # Drop only the newline; surrounding spaces count as characters.
            sentence = line.rstrip("\n")
            charCount += len(sentence)
            cleanWordList, wc, alnum = sentenceClean(sentence, stopWordList)
            wordCount += wc
            alphanumericCount += alnum
            for word in cleanWordList:
                key = fHash(word, M) if useFHash else word
                tally[key] = tally.get(key, 0) + 1

    bagOfWord = sorted([key, seen] for key, seen in tally.items())
    print("-------------------")
    print("char count =", charCount)
    print("alphanumeric count =", alphanumericCount)
    print("line count =", lineCount)
    print("word count =", wordCount)
    print("BoW =", bagOfWord)


if __name__ == "__main__":
    main()
# 6330516021 (20.80) 336 (2021-03-21 18:33)
# Bag-of-words report for a text file, with optional feature hashing.


def blank(t):
    """Return t with every non-alphanumeric character replaced by a space."""
    return "".join(ch if ch.isalnum() else " " for ch in t)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M


def count_sorted(items):
    """Return sorted [[item, occurrences], ...]."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return sorted([item, seen] for item, seen in tally.items())


def main():
    """Prompt for the file and options, then print the report."""
    with open('stopwords.txt', 'r') as stopfile:
        stop = set(stopfile.read().split())

    with open(input('File name = '), 'r') as infile:
        lines = infile.readlines()

    order = input('Use feature hashing ? (y,Y,n,N) ')
    while order != 'y' and order != 'Y' and order != 'n' and order != 'N':
        print("Try again.")
        order = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = order == 'Y' or order == 'y'
    if hashing:
        M = int(input('M = '))
    print('-------------------')

    words = []
    char_count = 0
    alphanum_count = 0
    for line in lines:
        char_count += len(line) - line.count('\n')
        # Count genuine alphanumerics rather than "anything not blanked".
        alphanum_count += sum(1 for ch in line if ch.isalnum())
        words += blank(line).lower().split()

    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alphanum_count))
    print('line count = ' + str(len(lines)))
    print('word count = ' + str(len(words)))

    samples = [word for word in words if word not in stop]
    if hashing:
        print('BoW = ' + str(count_sorted(fhash(word, M) for word in samples)))
    else:
        print('BoW = ' + str(count_sorted(samples)))


if __name__ == '__main__':
    main()
# 6330517621 (25.00) 337 (2021-03-22 22:39)
#----------------------------------------------------------------
# Characters that may appear inside a word (ASCII digits and letters).
alnum = list('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')
_ALNUM_SET = set(alnum)

stw = []   # stop words; filled in place by main() from stopwords.txt
M = None   # hash modulus; set by main() when feature hashing is requested


def Re_space(s):
    """Split *s* on single space characters."""
    return s.split(' ')


def _tally(items):
    """Return ``[[item, occurrences], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def lowandre(b):
    """Return every word of file *b* in order, with its original case.

    A word is a maximal run of characters from ``alnum``; every other
    character (punctuation, tabs, newlines, ...) acts as a separator.
    """
    scon = []
    with open(b, 'r') as a:
        for line in a:
            cleaned = ''.join(ch if ch in _ALNUM_SET else ' ' for ch in line)
            scon.extend(cleaned.split())
    return scon


#-----------------------------------------------------------------
def lowresam(b):
    """Return the lower-cased words of file *b* that are not in ``stw``."""
    return [e.lower() for e in lowandre(b) if e.lower() not in stw]


#-----------------------------------------------------------------
def charcount(b):
    """Return the number of characters in file *b*, newlines excluded."""
    total = 0
    with open(b, 'r') as fn:
        for line in fn:
            total += len(line) - 1 if line.endswith('\n') else len(line)
    return total


#-----------------------------------------------------------------
def alnumcount(b):
    """Return the number of alphanumeric characters in file *b*."""
    return sum(len(e) for e in lowandre(b))


#-----------------------------------------------------------------
def linecount(b):
    """Return the number of lines in file *b*."""
    with open(b, 'r') as fn:
        return sum(1 for _ in fn)


#-----------------------------------------------------------------
def wordcount(b):
    """Return the number of words in file *b*, stop words included."""
    return len(lowandre(b))


#-----------------------------------------------------------------
def BoWNn(b):
    """Return the sorted ``[word, count]`` bag of words of file *b*."""
    return _tally(lowresam(b))


#------------------------------------------------------------------
def BoWYy(b):
    """Return the sorted ``[hash, count]`` bag of words of file *b*.

    Each word is hashed with the base-37 polynomial hash modulo the
    module-level ``M``.
    """
    modulus = int(M)
    hashes = []
    for e in lowresam(b):
        value = 0
        for h, m in enumerate(e):
            value += ord(m) * (37 ** h)
        hashes.append(value % modulus)
    return _tally(hashes)


#----------------------------------------------------------------
def main():
    """Load the stop words, prompt for the options and print the report."""
    global M
    #stwbuilding
    with open('stopwords.txt', 'r') as file:
        for line in file:
            stw.extend(line.split())

    file_name = input('File name = ')
    YN = input('Use feature hashing ? (y,Y,n,N) ')
    while YN not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        YN = input('Use feature hashing ? (y,Y,n,N) ')
    if YN in ('y', 'Y'):
        M = input('M = ')
    print('-------------------')
    print('char count = ' + str(charcount(file_name)))
    print('alphanumeric count = ' + str(alnumcount(file_name)))
    print('line count = ' + str(linecount(file_name)))
    print('word count = ' + str(wordcount(file_name)))
    if YN in ('y', 'Y'):
        x = BoWYy(file_name)
    else:
        x = BoWNn(file_name)
    print('BoW = ' + str(x))


if __name__ == "__main__":
    main()
# 6330518221 (30.00) 338 (2021-03-22 15:03)
def charcount(m):
    """Return the total length of the lines in *m* after stripping each."""
    return sum(len(i.strip()) for i in m)


def alphacount(m):
    """Return the number of alphanumeric characters in the lines of *m*."""
    return sum(1 for i in m for j in i.strip() if j.isalnum())


def create(m):
    """Return the whitespace-separated tokens of the lines in *m*."""
    k = list()
    for i in m:
        k.extend(i.strip().split())
    return k


def createall(m):
    """Return the words of the lines in *m*.

    Every non-alphanumeric character is treated as a word separator.
    """
    n = list()
    for i in m:
        allword = ''.join(j if j.isalnum() else ' ' for j in i.strip())
        n.extend(allword.split())
    return n


def fhash(word, M):
    """Return the base-37 polynomial hash of *word* reduced modulo *M*."""
    G = 37
    n = 0
    for i, ch in enumerate(word):
        n += ord(ch) * (G ** i)
    return n % M


def BOW(w, d, M):
    """Return ``[[key, count], ...]`` for the words in *w*.

    When *d* is ``'y'`` the key is ``fhash(word, M)``, otherwise the word
    itself.  Entries appear in first-seen order and *w* is left untouched.
    """
    keys = [fhash(i, M) for i in w] if d == 'y' else w
    counts = {}  # dicts keep insertion order, i.e. first-seen order
    for i in keys:
        counts[i] = counts.get(i, 0) + 1
    return [[i, c] for i, c in counts.items()]


def main():
    """Prompt for the options, analyse the file and print the report."""
    file_name = input('File name = ')
    useh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while useh not in ['y', 'n']:
        print('Try again.')
        useh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    M = -1
    if useh == 'y':
        M = int(input('M = '))
    print('-' * 19)

    with open('stopwords.txt', 'r') as all_stop2:
        lstop = set(create([i.lower() for i in all_stop2]))
    with open(file_name, 'r') as datao:
        data = [i.lower() for i in datao]

    print('char count =', charcount(data))
    print('alphanumeric count =', alphacount(data))
    print('line count =', len(data))
    word = createall(data)
    print('word count =', len(word))
    wordnostop = [i for i in word if i not in lstop]
    print('BoW =', sorted(BOW(wordnostop, useh, M)))


if __name__ == "__main__":
    main()
# 6330520421 (22.44) 339 (2021-03-22 22:28)
#Prog-08 : Bag-of-words
#6330520421 (22.44) Supakorn Na kalasin
def hashing(w, m):
    """Return the base-37 polynomial hash of *w* reduced modulo ``int(m)``."""
    a = 0
    for i, ch in enumerate(w):
        a += ord(ch) * (37 ** i)
    return a % int(m)


def split_words(line):
    """Return the words of *line*; any non-alphanumeric character separates."""
    return ''.join(e if e.isalnum() else ' ' for e in line).split()


def bag(items):
    """Return ``[[item, occurrences], ...]`` sorted by item."""
    counts = {}
    for f in items:
        counts[f] = counts.get(f, 0) + 1
    return sorted([f, n] for f, n in counts.items())


def main():
    """Prompt for the options, analyse the file and print the report."""
    path = input('File name = ')
    a = input("Use feature hashing ? (y,Y,n,N) ")
    while a not in ["y", "Y", "N", "n"]:
        print("Try again.")
        a = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = a in ("y", "Y")
    if use_hash:
        M = int(input("M = "))

    with open('stopwords.txt', 'r') as stopwords:
        ws = set(stopwords.read().split())

    lc = 0
    cc = 0
    wc = 0
    anc = 0
    bb = []
    with open(path, "r") as file_name:
        for line in file_name:
            lc += 1
            # Only a real trailing newline is excluded from the char count,
            # so a final line without one is not undercounted.
            cc += len(line) - 1 if line.endswith('\n') else len(line)
            b = split_words(line)
            bb += b
            wc += len(b)
            anc += sum(len(i) for i in b)

    print('-------------------')
    print('char count =', cc)
    print('alphanumeric count =', anc)
    print('line count =', lc)
    print('word count =', wc)
    wb = [j.lower() for j in bb if j.lower() not in ws]
    if use_hash:
        wb = [hashing(i, M) for i in wb]
    print('BoW =', bag(wb))


if __name__ == "__main__":
    main()
# 6330521021 (26.00) 340 (2021-03-22 12:35)
def gettext(file, t):
    """Join the lines *t* into one lower-cased text.

    Lines are joined with a single space in place of their newline; a newline
    on the last line is kept.  Returns ``(text, ncount)`` where *ncount* is
    the number of lines that end with a newline.  *file* is unused and kept
    only for call compatibility.  An empty *t* yields ``('', 0)``.
    """
    if not t:
        return '', 0
    ncount = sum(1 for line in t if line.endswith('\n'))
    head = [line[:-1] for line in t[:-1]]
    text = ' '.join(head + [t[-1]]).lower()
    return text, ncount


def replacepunc(text):
    """Return *text* with every non-alphanumeric character made a space."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)


def stopw():
    """Return the list of stop words read from ``stopwords.txt``."""
    with open('stopwords.txt', 'r') as stopwords:
        s = stopwords.readlines()
    text, _ = gettext(None, s)
    return text.split()


def fhash(w, m):
    """Return the base-37 polynomial hash of *w* reduced modulo *m*."""
    n = 0
    for i, ch in enumerate(w):
        n += ord(ch) * (37 ** i)
    return n % m


def _tally(items):
    """Return ``[[item, occurrences], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def bow(textlist, stopwords=None):
    """Return the sorted ``[word, count]`` bag of words of *textlist*.

    Words found in *stopwords* are skipped; when *stopwords* is None the
    list is loaded from ``stopwords.txt``.
    """
    allstopw = set(stopw() if stopwords is None else stopwords)
    return _tally(i for i in textlist if i not in allstopw)


def bowm(textlist, m, stopwords=None):
    """Return the sorted ``[hash, count]`` bag of words of *textlist*.

    Hashes are compared as integers, so the result is in numeric order.
    Stop words are handled as in :func:`bow`.
    """
    allstopw = set(stopw() if stopwords is None else stopwords)
    return _tally(fhash(i, m) for i in textlist if i not in allstopw)


def main():
    """Prompt for the options, analyse the file and print the report."""
    fn = input("File name = ")
    m = 0  # 0 means "no feature hashing"
    while True:
        fh = input("Use feature hashing ? (y,Y,n,N) ")
        if fh in ('y', 'Y', 'n', 'N'):
            if fh in ('y', 'Y'):
                m = int(input("M = "))
            break
        print("Try again.")

    with open(fn, 'r') as file:
        t = file.readlines()
        text, ncount = gettext(file, t)

    print('-------------------')
    # The joining spaces and the kept final newline add exactly ncount
    # characters to the text, so subtracting it leaves the real content.
    print('char count =', len(text) - ncount)
    alnum = sum(1 for i in text if i.isalnum())
    print('alphanumeric count =', alnum)
    print('line count =', len(t))
    textlist = replacepunc(text).split()
    print('word count =', len(textlist))
    if m == 0:
        print("BoW =", bow(textlist))
    else:
        print("BoW =", bowm(textlist, m))


if __name__ == "__main__":
    main()
# 6330522721 (30.00) 341 (2021-03-22 22:58)
start = None      # path of the file being analysed; set by run()
stopwords = []    # stop words; filled in place by run() from stopwords.txt
yynn = ['y', 'Y', 'n', 'N']


def main():
    """Print the four count lines for the file named by ``start``."""
    char_count()
    alphanumeric_count()
    line_count()
    word_count()


def char_count():
    """Print the number of characters in ``start``, newlines excluded."""
    with open(start) as file:
        c = sum(1 for line in file for e in line if e != '\n')
    print('char count =', c)


def alphanumeric_count():
    """Print the number of alphanumeric characters in ``start``."""
    with open(start) as file:
        c = sum(1 for line in file for e in line if e.isalnum())
    print('alphanumeric count =', c)


def line_count():
    """Print the number of lines in ``start``."""
    with open(start) as file:
        c = sum(1 for _ in file)
    print('line count =', c)


def remove_punc(p):
    """Return *p* with every non-alphanumeric character made a space."""
    return ''.join(e if e.isalnum() else ' ' for e in p)


def word_count():
    """Print the number of words in ``start``, stop words included."""
    with open(start) as file:
        c = sum(len(remove_punc(line).split()) for line in file)
    print('word count =', c)


#---------------------------
def line_normalize(oldline):
    """Return the words of *oldline* that are not stop words."""
    return [e for e in oldline if e not in stopwords]


def histogram_bin(data):
    """Return the distinct elements of *data* in first-seen order."""
    his_bin = []
    for e in data:
        if e not in his_bin:
            his_bin.append(e)
    return his_bin


def count(data, element):
    """Return how many entries of *data* equal *element*."""
    return sum(1 for e in data if e == element)


def fhash(word, m):
    """Return the base-37 polynomial hash of *word* reduced modulo *m*."""
    t = 0
    for i, ch in enumerate(word):
        t += ord(ch) * 37 ** i
    return t % m


def change_data(data):
    """Zero every count in *data* (in place) and return the distinct pairs."""
    for pair in data:
        pair[1] = 0
    new_data = []
    for e in data:
        if e not in new_data:
            new_data.append(e)
    return new_data


def count_fhash(data, element):
    """Add ``element[1]`` to the count of the pair in *data* with its key."""
    for e in data:
        if element[0] == e[0]:
            data[data.index(e)][1] += element[1]


#------------------------------------------------------------
def run():
    """Load the stop words, prompt for the options and print the report."""
    global start
    with open('stopwords.txt') as stop_file:
        for line in stop_file:
            stopwords.extend(line.split())

    start = input('File name = ').strip()
    while True:
        second = input('Use feature hashing ? (y,Y,n,N) ')
        if second in yynn:
            if second in ['y', 'Y']:
                m = int(input('M = '))
            print('-------------------')
            break
        print('Try again.')
    main()

    # -------------------------
    # Newlines become spaces in remove_punc, so words never merge across
    # lines; an empty file simply yields no words.
    with open(start) as file:
        text = ''.join(remove_punc(line.lower()) for line in file)
    word_counts = {}
    for e in line_normalize(text.split()):
        word_counts[e] = word_counts.get(e, 0) + 1

    if second in ['n', 'N']:
        print('BoW =', sorted([w, f] for w, f in word_counts.items()))
    else:
        hash_counts = {}
        for word, f in word_counts.items():
            key = fhash(word, m)
            hash_counts[key] = hash_counts.get(key, 0) + f
        print('BoW =', sorted([h, f] for h, f in hash_counts.items()))


if __name__ == "__main__":
    run()
# 6330523321 (24.75) 342 (2021-03-21 19:13)
def find_replace(t):
    """Return *t* with every non-alphanumeric character made a space."""
    return "".join(c if c.isalnum() else " " for c in t)


def addwordToBoW(BoW, newword):
    """Count *newword* in *BoW* (a list of ``[key, count]``) and return it."""
    for i in BoW:
        if i[0] == newword:
            i[1] += 1
            return BoW
    BoW.append([newword, 1])
    return BoW


def fhash(word, M):
    """Return the base-37 polynomial hash of *word* reduced modulo *M*."""
    G = 37
    numchar = 0
    for charindex, ch in enumerate(word):
        numchar += ord(ch) * (G ** charindex)
    return numchar % M


def main():
    """Prompt for the options, analyse the file and print the report."""
    filename = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    usehash = False
    while feature not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    if feature in ['y', 'Y']:
        M = int(input('M = '))
        usehash = True
    print('-------------------')

    with open('stopwords.txt', 'r') as stopwords_file:
        stopwordslist = set(stopwords_file.read().split())

    charcount = 0
    alphanumericcount = 0
    linecount = 0
    all_words_list = []
    # One pass over the file gathers every statistic.
    with open(filename, 'r') as file:
        for line in file:
            strip_line = line.strip().lower()
            linecount += 1
            charcount += len(strip_line)
            alphanumericcount += sum(1 for i in strip_line if i.isalnum())
            all_words_list += find_replace(strip_line).split()

    print('char count =', charcount)
    print('alphanumeric count =', alphanumericcount)
    print('line count =', linecount)
    print('word count =', len(all_words_list))

    counts = {}
    for word in all_words_list:
        if word in stopwordslist:
            continue
        key = fhash(word, M) if usehash else word
        counts[key] = counts.get(key, 0) + 1
    print('BoW =', sorted([key, n] for key, n in counts.items()))


if __name__ == "__main__":
    main()
# 6330524021 (26.00) 343 (2021-03-22 16:30)
stopwords = []  # stop words; filled in place by main() from stopwords.txt


def flash(w, M):
    """Return the base-37 polynomial hash of the word *w* modulo *M*."""
    c = 0
    for i, ch in enumerate(w):
        c += ord(ch) * 37 ** i
    return c % M


def clear(messages):
    """Lower-case *messages* and turn non-alphanumeric characters to spaces."""
    return ''.join(i if i.isalnum() else ' ' for i in messages.lower())


def BoW(m):
    """Return ``[[item, count], ...]`` for the list *m* in first-seen order."""
    counts = {}
    for i in m:
        counts[i] = counts.get(i, 0) + 1
    return [[i, c] for i, c in counts.items()]


def cut_stop(w):
    """Return the words of *w* that are not in the global ``stopwords``."""
    return [e for e in w if e not in stopwords]


#start
def main():
    """Prompt for the options, analyse the file and print the report."""
    file_name = input('File name = ').strip()
    while True:
        feature = input('Use feature hashing ? (y,Y,n,N) ').lower().strip()
        if feature == 'n':
            break
        if feature == 'y':
            M = int(input('M = ').strip())
            break
        print('Try again.')
    print('-------------------')

    with open(file_name, 'r') as file:
        alllines = file.readlines()
    with open('stopwords.txt', 'r') as file2:
        for i in file2:
            stopwords.extend(clear(i).split())

    c = sum(1 for i in alllines for e in i if e != '\n')
    print('char count = ' + str(c))
    c1 = sum(1 for i in alllines for e in clear(i) if e != ' ')
    print('alphanumeric count = ' + str(c1))
    print('line count = ' + str(len(alllines)))

    allw = []
    for i in alllines:
        allw.extend(clear(i).split())
    allw.sort()
    print('word count =', len(allw))

    allw = cut_stop(allw)
    if feature == 'y':
        # Sorting first makes the first-seen order of BoW() the sorted order.
        print('BoW =', BoW(sorted(flash(i, M) for i in allw)))
    else:
        print('BoW =', BoW(allw))


if __name__ == "__main__":
    main()
# 6330525621 (19.00) 344 (2021-03-22 22:27)
def remove_punctuation(s):
    """Lower-case *s* and replace everything but ``0-9a-z`` with spaces."""
    out = ''
    for c in s:
        if c.lower() in '0123456789abcdefghijklmnopqrstuvwxyz':
            out += c.lower()
        else:
            out += ' '
    return out


def word_in_file(file):
    """Return the words of every line yielded by iterating *file*."""
    words = []
    for lin in file:
        words.extend(remove_punctuation(lin).split())
    return words


def _run_lengths(items):
    """Return ``[[item, occurrences], ...]`` for an already sorted list."""
    bow = []
    for item in items:
        if bow and bow[-1][0] == item:
            bow[-1][1] += 1
        else:
            bow.append([item, 1])
    return bow


def BoW_no_fea(words, stop):
    """Return the sorted ``[word, count]`` bag of *words*.

    *stop* is an iterable of stop-word lines (e.g. an open file); its words
    are excluded.  *words* itself is not modified.
    """
    stop = set(word_in_file(stop))
    return _run_lengths(sorted(s for s in words if s not in stop))


def fhash(w, M):
    """Return the base-37 polynomial hash of *w* reduced modulo *M*."""
    u = 0
    for i, ch in enumerate(w):
        u += ord(ch) * (37 ** i)
    return u % M


def BoW_w_fea(words, M, stop):
    """Return the sorted ``[hash, count]`` bag of *words* hashed modulo *M*.

    *stop* is an iterable of stop-word lines; its words are excluded before
    hashing.  *words* itself is not modified.
    """
    stop = set(word_in_file(stop))
    return _run_lengths(sorted(fhash(k, M) for k in words if k not in stop))


def main():
    """Prompt for the options, analyse the file and print the report."""
    name = input("File name = ")
    read = input("Use feature hashing ? (y,Y,n,N) ")
    while read.lower() != 'y' and read.lower() != 'n':
        print("Try again.")
        read = input("Use feature hashing ? (y,Y,n,N) ")
    if read.lower() == "y":
        M = int(input("M = "))
    print("-------------------")

    char = 0
    line_count = 0
    alpha = 0
    word_file = []
    with open(name, "r") as file_name:
        for line in file_name:
            line_count += 1
            char += sum(1 for e in line if e != "\n")
            alpha += sum(1 for e in line.lower() if e.isalnum())
            word_file.extend(remove_punctuation(line).split())
    word_count = len(word_file)

    print("char count =", char)
    print("alphanumeric count =", alpha)
    print("line count =", line_count)
    print("word count =", word_count)
    with open("stopwords.txt", "r") as stopword:
        if read.lower() == "n":
            print("BoW =", BoW_no_fea(word_file, stopword))
        else:
            print("BoW =", BoW_w_fea(word_file, M, stopword))


if __name__ == "__main__":
    main()
# 6330526221 (21.40) 345 (2021-03-21 23:57)
def fhash(w, M):
    """Return the base-37 polynomial hash of *w* reduced modulo *M*."""
    c = 0
    for i, ch in enumerate(w):
        c += ord(ch) * (37 ** i)
    return c % M


def remove_punctuation(s):
    """Lower-case *s* and replace everything but ``0-9a-z`` with spaces."""
    out = ''
    for c in s:
        if c.lower() in '0123456789abcdefghijklmnopqrstuvwxyz':
            out += c.lower()
        else:
            out += ' '
    return out


def join_lines(lines):
    """Combine *lines* into one text and count characters and lines.

    Returns ``(text, char_count, line_count)``.  Newlines are excluded from
    *char_count*; in *text* the lines are separated by a space so the last
    word of one line never merges with the first word of the next.
    """
    parts = []
    snow = 0
    plus = 0
    for line in lines:
        body = line[:-1] if line.endswith('\n') else line
        snow += len(body)
        parts.append(body)
        plus += 1
    return ' '.join(parts), snow, plus


def tally(items):
    """Return ``[[item, occurrences], ...]`` sorted by item."""
    counts = {}
    for i in items:
        counts[i] = counts.get(i, 0) + 1
    return sorted([i, n] for i, n in counts.items())


def main():
    """Prompt for the options, analyse the file and print the report."""
    with open(input('File name = '), 'r') as File_name:
        text, snow, plus = join_lines(File_name)
    listt = remove_punctuation(text).split()

    with open('stopwords.txt', 'r') as stop_words:
        xxx = set(stop_words.read().split())
    J = [word for word in listt if word not in xxx]

    while True:
        x = input('Use feature hashing ? (y,Y,n,N) ')
        if x in ('y', 'Y'):
            a = int(input('M = '))
            solution = tally(fhash(word, a) for word in J)
            break
        elif x in ('n', 'N'):
            solution = tally(J)
            break
        else:
            print('Try again.')

    print('-------------------')
    print('char count = ' + str(snow))
    print('alphanumeric count = ' + str(sum(len(token) for token in listt)))
    print('line count = ' + str(plus))
    print('word count = ' + str(len(listt)))
    print('BoW =', solution)


if __name__ == "__main__":
    main()
# 6330527921 (26.00) 346 (2021-03-22 20:23)
def fhash(w, M):
    """Return the base-37 polynomial hash of *w* reduced modulo *M*."""
    f = 0
    for i, ch in enumerate(w):
        f += ord(ch) * (37 ** i)
    return f % M


def group_counts(sorted_items):
    """Return ``[[item, occurrences], ...]`` for an already sorted list.

    Works for any length, including an empty or single-element list.
    """
    BoW = []
    for item in sorted_items:
        if BoW and BoW[-1][0] == item:
            BoW[-1][1] += 1
        else:
            BoW.append([item, 1])
    return BoW


def main():
    """Prompt for the options, analyse the file and print the report."""
    file_name = str(input('File name = '))
    M = None  # None means "no feature hashing"
    while True:
        a = input('Use feature hashing ? (y,Y,n,N) ')
        if a == 'y' or a == 'Y':
            M = int(input('M = '))
            break
        elif a == 'n' or a == 'N':
            break
        else:
            print('Try again.')

    with open('stopwords.txt', 'r') as stopwords:
        lstopwords = set(stopwords.read().split())
    with open(file_name, 'r') as readfile:
        llines = [line.strip() for line in readfile.readlines()]

    linecount = len(llines)
    charcount = len(''.join(llines))
    s = ' '.join(llines)
    words = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in s)
    alnumcount = sum(1 for ch in s if ch.isalnum())
    lwords = words.split()
    wordcount = len(lwords)

    print('-------------------')
    print('char count = ' + str(charcount))
    print('alphanumeric count = ' + str(alnumcount))
    print('line count = ' + str(linecount))
    print('word count = ' + str(wordcount))

    kept = [w for w in lwords if w not in lstopwords]
    if M is None:
        BoW = group_counts(sorted(kept))
    else:
        BoW = group_counts(sorted(fhash(w, M) for w in kept))
    print('BoW = ' + str(BoW))


if __name__ == "__main__":
    main()
# 6330528521 (20.50) 347 (2021-03-22 15:52)
def is_alnum(i):
    """Return True when *i* is an ASCII letter or digit."""
    return 'A' <= i <= 'Z' or 'a' <= i <= 'z' or '0' <= i <= '9'


def split_words(text):
    """Return the lower-cased words of *text*.

    Every character that is not an ASCII letter or digit separates words.
    """
    return ''.join(i.lower() if is_alnum(i) else ' ' for i in text).split()


def fhash(word, M):
    """Return the base-37 polynomial hash of *word* reduced modulo *M*."""
    z = 0
    for x, i in enumerate(word):
        z = z + ord(i) * 37 ** x
    return z % M


def run_lengths(sorted_items):
    """Return ``[[item, occurrences], ...]`` for an already sorted list."""
    rr = []
    for item in sorted_items:
        if rr and rr[-1][0] == item:
            rr[-1][1] += 1
        else:
            rr.append([item, 1])
    return rr


def main():
    """Prompt for the options, analyse the file and print the report."""
    file_name = input('File name = ')
    use = input('Use feature hashing ? (y,Y,n,N) ')
    # Compare against the four valid answers, not against a string: a
    # substring test would also accept '', ',' or 'y,Y'.
    while use not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        use = input('Use feature hashing ? (y,Y,n,N) ')
    hashing = use in ('y', 'Y')
    if hashing:
        M = int(input('M = '))

    count_char = 0
    count_line = 0
    alpha = 0
    c = []
    with open(file_name, 'r') as read:
        for line in read:
            count_line += 1
            count_char += len(line) - 1 if line.endswith('\n') else len(line)
            alpha += sum(1 for i in line if is_alnum(i))
            c.extend(split_words(line))

    with open('stopwords.txt', 'r') as stopword:
        v = set(split_words(stopword.read()))

    print('-------------------')
    print('char count = ' + str(count_char))
    print('alphanumeric count = ' + str(alpha))
    print('line count = ' + str(count_line))
    print('word count = ' + str(len(c)))

    p = [i for i in c if i not in v]
    if hashing:
        rr = run_lengths(sorted(fhash(e, M) for e in p))
    else:
        rr = run_lengths(sorted(p))
    print('BoW =', rr)


if __name__ == "__main__":
    main()
# 6330529121 (30.00) 348 (2021-03-22 22:52)
M = 1  # hash modulus; replaced by main() when feature hashing is requested


def filetolist(x):
    """Return the lower-cased words of the file named *x*.

    Every non-alphanumeric character (including newlines) separates words.
    """
    sentencelist = []
    with open(x, 'r') as stop_words:
        for ch in stop_words:
            cleaned = ''.join(e if e.isalnum() else ' ' for e in ch.lower())
            sentencelist.extend(cleaned.split())
    return sentencelist


def fhashing(w, M):
    """Return the base-37 polynomial hash of *w* reduced modulo *M*."""
    k = 0
    for i, ch in enumerate(w):
        k += ord(ch) * (37 ** i)
    return k % M


def bagofword(x, y):
    """Return the sorted bag of words of the file named *x*.

    Words listed in ``stopwords.txt`` are skipped.  When *y* is true the
    keys are ``fhashing(word, M)`` with the module-level ``M``; otherwise
    they are the words themselves.  The result is ``[[key, count], ...]``.
    """
    maieao = set(filetolist('stopwords.txt'))
    hiyaa = [ch for ch in filetolist(x) if ch not in maieao]
    if y == True:
        hiyaa = [fhashing(ch, M) for ch in hiyaa]
    hiyaa.sort()
    FINAL = []
    for ch in hiyaa:
        if FINAL and FINAL[-1][0] == ch:
            FINAL[-1][1] += 1
        else:
            FINAL.append([ch, 1])
    return FINAL


def main():
    """Prompt for the options, analyse the file and print the report."""
    global M
    file_name = input('File name = ')
    fhashenable = True
    fhash = input('Use feature hashing ? (y,Y,n,N) ')
    while True:
        if fhash.lower() == 'y':
            fhashenable = True
            M = int(input('M = '))
            break
        elif fhash.lower() == 'n':
            fhashenable = False
            break
        else:
            print('Try again.')
            fhash = input('Use feature hashing ? (y,Y,n,N) ')

    # A single pass gathers the three per-line statistics.
    charcount = 0
    numberandalpha = 0
    linecount = 0
    with open(file_name, 'r') as filename1:
        for ch in filename1:
            linecount += 1
            body = ch[:-1] if ch[-1] == '\n' else ch
            charcount += len(body.lower())
            numberandalpha += sum(1 for e in ch if e.isalnum())

    print('-------------------')
    print('char count = ' + str(charcount))
    print('alphanumeric count = ' + str(numberandalpha))
    print('line count = ' + str(linecount))
    print('word count = ' + str(len(filetolist(file_name))))
    print('BoW = ' + str(bagofword(file_name, fhashenable)))


if __name__ == "__main__":
    main()
# 6330530721 (28.00) 349 (2021-03-20 22:18)
_ALNUM = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'


#----------------------------------------------------------
def count_words(s):
    """Return the number of words in *s*.

    Every character outside ASCII letters and digits separates words.
    """
    return len(''.join(ch if ch in _ALNUM else ' ' for ch in s).split())


def sep_words(s):
    """Return the lower-cased words of *s*.

    The result is always lower-cased, also when *s* contains no separator
    character at all (e.g. a final line without a newline).
    """
    cleaned = ''.join(ch if ch.lower() in _ALNUM else ' ' for ch in s)
    return cleaned.lower().split()


def fhash(w, M):
    """Return the base-37 polynomial hash of *w* reduced modulo *M*."""
    c = 0
    for i, ch in enumerate(w):
        c += ord(ch) * (37 ** i)
    return c % M


def _tally(items):
    """Return ``[[item, occurrences], ...]`` sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


#-----------------------------------------------
def main():
    """Prompt for the options, analyse the file and print the report."""
    fname = input('File name = ')
    line_count = 0
    number_of_characters = 0
    number_of_alnum = 0
    number_of_word = 0
    sep_w = []
    with open(fname, 'r') as file_name:
        for line in file_name:
            line_count += 1
            # Exclude only a newline that is really there, so the total is
            # right whether or not the file ends with one.
            number_of_characters += len(line) - line.endswith('\n')
            number_of_alnum += sum(1 for c in line if c.isalnum())
            number_of_word += count_words(line)
            sep_w += sep_words(line)

    sep_st = set()
    with open('stopwords.txt', 'r') as stopwords:
        for li in stopwords:
            sep_st.update(sep_words(li))
    pre_bow = [i for i in sep_w if i not in sep_st]

    choice = input('Use feature hashing ? (y,Y,n,N) ')
    while choice not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        choice = input('Use feature hashing ? (y,Y,n,N) ')
    if choice in ['y', 'Y']:
        M = int(input('M = '))
        bow = _tally(fhash(word, M) for word in pre_bow)
    else:
        bow = _tally(pre_bow)

    print('-------------------')
    print('char count =', number_of_characters)
    print('alphanumeric count =', number_of_alnum)
    print('line count =', line_count)
    print('word count =', number_of_word)
    print('BoW =', bow)


if __name__ == "__main__":
    main()
# 6330531321 (22.00) 350 (2021-03-21 22:33)
G = 37  # base of the polynomial feature hash


def _tokens(lines, lower=False):
    """Return the words in *lines*; non-alphanumeric characters separate."""
    words = []
    for line in lines:
        if lower:
            line = line.lower()
        words.extend("".join(e if e.isalnum() else " " for e in line).split())
    return words


def _content_words(file_name, stopwords_path):
    """Return the lower-cased words of *file_name* minus the stop words."""
    with open(str(file_name), "r") as source:
        beta = _tokens(source, lower=True)
    with open(stopwords_path, "r") as stopword:
        delete1 = set(_tokens(stopword))
    return [e for e in beta if e not in delete1]


def _tally(items):
    """Return ``[[item, occurrences], ...]`` in first-seen order."""
    counts = {}
    for e in items:
        counts[e] = counts.get(e, 0) + 1
    return [[e, total] for e, total in counts.items()]


#---------------------------------------------------------------------------#
def BoW(file_name, stopwords_path="stopwords.txt"):
    """Return the bag of words of the file named *file_name*.

    Words are lower-cased, words found in the file *stopwords_path* are
    skipped, and the result is ``[[word, count], ...]`` in first-seen order.
    """
    return _tally(_content_words(file_name, stopwords_path))


#-------------------------------------------------------------#
def main():
    """Prompt for the options, analyse the file and print the report."""
    file_name2 = input("File name = ")
    char_count = 0
    linecount = 0
    alphanumeric = 0
    wordcount = 0
    with open(file_name2.strip(), "r") as file_name:
        for line in file_name:
            linecount += 1
            x = line.strip("\n")
            char_count += len(x)
            alphanumeric += sum(1 for e in x if e.isalnum())
            # Words are counted per line so that the last word of one line
            # cannot merge with the first word of the next.
            wordcount += len(_tokens([x]))

    hashing = input("Use feature hashing ? (y,Y,n,N) ")
    while hashing not in ("y", "Y", "n", "N"):
        print("Try again.")
        hashing = input("Use feature hashing ? (y,Y,n,N) ")
    if hashing in ("y", "Y"):
        M = int(input("M = "))

    print("-------------------")
    print("char count =", char_count)
    print("alphanumeric count =", alphanumeric)
    print("line count =", linecount)
    print("word count =", wordcount)
    if hashing in ("n", "N"):
        print("BoW =", sorted(BoW(file_name2)))
    else:
        thong = []
        for e in _content_words(file_name2, "stopwords.txt"):
            som = 0
            for i, ch in enumerate(e):
                som += ord(ch) * (G ** i)
            thong.append(som % M)
        print("BoW =", sorted(_tally(thong)))


if __name__ == "__main__":
    main()
# 6330532021 (30.00) 351 (2021-03-22 22:45)
def bow_n(x):
    """Return the sorted list of [item, count] pairs for the items of *x*."""
    counts = {}
    for item in x:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def fhash(x):
    """Return the raw polynomial hash sum(ord(x[i]) * 37**i), without any modulo."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(x))


def bow_y(x, y):
    """Return the sorted [hash, count] pairs of the words in *x* hashed modulo int(y)."""
    modulus = int(y)
    return bow_n([fhash(word) % modulus for word in x])


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input('File name = ')
    with open(file_name, 'r') as r:
        line = r.readlines()
    with open('stopwords.txt') as p:
        q = p.readlines()

    m = None
    while True:
        solu = input('Use feature hashing ? (y,Y,n,N) ')
        if solu == 'y' or solu == 'Y':
            m = input('M = ')
            break
        if solu == 'n' or solu == 'N':
            break
        print('Try again.')

    print('-------------------')
    stripped = [i.strip() for i in line]
    # One trailing space per line keeps words on different lines apart.
    sen_ = ''.join(i.lower() + ' ' for i in stripped)
    print('char count = ' + str(sum(len(i) for i in stripped)))
    print('alphanumeric count = ' + str(sum(1 for ch in sen_ if ch.isalnum())))
    print('line count = ' + str(len(line)))
    y = ''.join(ch if ch.isalnum() else ' ' for ch in sen_).split()
    print('word count = ' + str(len(y)))

    stop = set(' '.join(i.strip().lower() for i in q).split())
    new_sen = [word for word in y if word not in stop]
    ans_bow = bow_n(new_sen) if m is None else bow_y(new_sen, m)
    print('BoW = ' + str(ans_bow))


if __name__ == '__main__':
    main()
# 6330533621 (10.00) 352 (2021-03-20 14:45)
def char_count(sentence):
    """Return the number of characters in *sentence*."""
    return len(sentence)


def al_count(sentence):
    """Return the number of alphanumeric characters in *sentence*."""
    return sum(1 for ch in sentence if ch.isalnum())


def line_count(path='sample.txt'):
    """Return the number of lines in the file at *path*.

    The default keeps the old zero-argument call working; the script now passes
    the file name the user actually entered.
    """
    with open(path, 'r') as handle:
        return sum(1 for _ in handle)


def word_count(a):
    """Return the number of maximal alphanumeric runs in *a*."""
    return len(''.join(ch if ch.isalnum() else ' ' for ch in a).split())


def fhash(word, M):
    """Return sum(ord(word[n]) * 37**n) modulo M."""
    return sum(ord(ch) * 37 ** n for n, ch in enumerate(word)) % M


def new_words(sentence, stopwords):
    """Return the words of *sentence* that are not in *stopwords*, in order."""
    words = ''.join(ch if ch.isalnum() else ' ' for ch in sentence).split()
    return [word for word in words if word not in stopwords]


def count_sorted(items):
    """Return the sorted list of [item, count] pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    In = str(input('File name = '))
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
    M = None
    while True:
        if hashing == 'y' or hashing == 'Y':
            M = int(input('M = '))
            break
        if hashing == 'n' or hashing == 'N':
            break
        print('Try again.')
        hashing = input('Use feature hashing ? (y,Y,n,N) ')
    print('-------------------')

    with open(In, 'r') as file_name:
        text = file_name.read()
    with open('stopwords.txt', 'r') as file:
        stopwords = file.read().lower().split()

    # Newlines are excluded from the character counts only; the tokenisers get
    # the full text so a line break still separates two words.
    sentence = text.replace('\n', '')
    print('char count = ' + str(char_count(sentence)))
    print('alphanumeric count = ' + str(al_count(sentence)))
    print('line count = ' + str(line_count(In)))
    print('word count = ' + str(word_count(text)))

    words = new_words(text.lower(), stopwords)
    if M is not None:
        words = [fhash(word, M) for word in words]
    print('BoW = {}'.format(count_sorted(words)))


if __name__ == '__main__':
    main()
# 6330534221 (19.40) 353 (2021-03-21 20:47)
def load_stopwords(path="stopwords.txt"):
    """Return the whitespace-separated words stored in the file at *path*."""
    with open(path, "r") as stop:
        return stop.read().split()


def bow(sen, stopw=None):
    """Return [word, count] pairs (first-seen order) for the words of *sen*.

    Words listed in *stopw* are skipped; when *stopw* is omitted the list is
    loaded from stopwords.txt.
    """
    if stopw is None:
        stopw = load_stopwords()
    banned = set(stopw)
    counts = {}
    for word in sen.split():
        if word not in banned:
            counts[word] = counts.get(word, 0) + 1
    return [[word, n] for word, n in counts.items()]


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(e) * 37 ** i for i, e in enumerate(w)) % int(M)


def sum_bow(BOW):
    """Merge [key, count] pairs that share a key by adding their counts."""
    totals = {}
    for key, n in BOW:
        totals[key] = totals.get(key, 0) + n
    return [[key, n] for key, n in totals.items()]


def clean_line(line):
    """Strip *line*, lower-case it and turn every non-alphanumeric char into a space."""
    return "".join(e.lower() if e.isalnum() else " " for e in line.strip())


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input("File name = ")
    M = None
    while True:
        hashing = input("Use feature hashing ? (y,Y,n,N) ")
        if hashing in ["y", "Y", "n", "N"]:
            if hashing in ["y", "Y"]:
                M = input("M = ")
            break
        print("Try again.")

    with open(file_name, "r") as file:
        lines = [clean_line(line) for line in file]

    sent = "".join(lines)          # characters only: no separators added
    spaced = " ".join(lines)       # separator keeps words of adjacent lines apart
    alco = sum(1 for e in sent if e.isalnum())

    print("-------------------")
    print("char count =", len(sent))
    print("alphanumeric count =", alco)
    print("line count =", len(lines))
    print("word count =", len(spaced.split()))

    BOW = bow(spaced)
    if M is not None:
        BOW = sum_bow([[fhash(word, M), n] for word, n in BOW])
    BOW.sort()
    print("BoW =", BOW)


if __name__ == "__main__":
    main()
# 6330535921 (24.67) 354 (2021-03-22 02:46)
ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def remove_punctuation(s):
    """Lower-case *s* and replace every character outside 0-9/a-z by a space."""
    return ''.join(c.lower() if c.lower() in ALNUM else ' ' for c in s)


def count_alpha(w):
    """Return how many characters of *w* are in 0-9/a-z."""
    return sum(1 for i in w if i in ALNUM)


def fhash(word, m):
    """Return sum(ord(word[i]) * 37**i) modulo m."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % m


def bow(lis, ans):
    """Append a [item, count] pair to *lis* for each distinct item of *ans*.

    Pairs are appended in first-seen order, pairs already present in *lis* are
    not duplicated, and *lis* itself is returned.
    """
    counts = {}
    for item in ans:
        counts[item] = counts.get(item, 0) + 1
    for item, n in counts.items():
        if [item, n] not in lis:
            lis.append([item, n])
    return lis


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    with open(input('File name = ')) as file:
        lines = [line.strip() for line in file]

    # Exact membership: a substring test would also accept an empty answer.
    while True:
        x = input('Use feature hashing ? (y,Y,n,N) ')
        if x in ('y', 'Y', 'n', 'N'):
            break
        print('Try again.')

    with open('stopwords.txt') as stop_f:
        sf = set(stop_f.read().split())

    word_str = remove_punctuation(' '.join(lines))
    words = word_str.split()
    kept = [i for i in words if i not in sf]
    if x in ('y', 'Y'):
        m = int(input('M = '))
        kept = [fhash(i, m) for i in kept]
    ans = bow([], sorted(kept))

    print('-------------------')
    print('char count =', sum(len(line) for line in lines))
    print('alphanumeric count =', count_alpha(word_str))
    print('line count =', len(lines))
    print('word count =', len(words))
    print('BoW =', ans)


if __name__ == '__main__':
    main()
# 6330536521 (30.00) 355 (2021-03-22 20:13)
def read_text(a):
    """Return the whole content of the file named *a*."""
    with open(a, "r") as x:
        return x.read()


def split_alnum(text):
    """Split *text* into maximal alphanumeric runs (case is preserved)."""
    return "".join(ch if ch.lower().isalnum() else " " for ch in text).split()


def count_chars(text):
    """Return the number of characters of *text* that are not newlines."""
    return sum(1 for ch in text if ch != "\n")


def tally(items):
    """Return the sorted list of [item, count] pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def stop(a):
    """Return the alphanumeric words of the file named *a*."""
    return split_alnum(read_text(a))


def stop2(a):
    """Return the lower-cased words of file *a* that are not in stopwords.txt."""
    ex = set(read_text("stopwords.txt").split())
    return [e.lower() for e in stop(a) if e.lower() not in ex]


def fhash(a, b):
    """Return sum(ord(a[i]) * 37**i) modulo b."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(a)) % b


def line(a):
    """Print the number of lines of the file named *a*."""
    with open(a, "r") as x:
        n = sum(1 for _ in x)
    return print("line count = " + str(n))


def char(a):
    """Print the number of non-newline characters of the file named *a*.

    Counting the characters directly is correct whether or not the file ends
    with a newline, and gives 0 for an empty file.
    """
    return print("char count = " + str(count_chars(read_text(a))))


def word(a):
    """Print the number of words of the file named *a*."""
    print("word count = " + str(len(stop(a))))


def alpha(a):
    """Print the number of alphanumeric characters of the file named *a*."""
    print("alphanumeric count = " + str(sum(len(e) for e in stop(a))))


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input("File name = ")
    while True:
        al = input("Use feature hashing ? (y,Y,n,N) ")
        if al.lower() in ("y", "n"):
            break
        print("Try again.")
    m = int(input("M = ")) if al.lower() == "y" else None

    print("""-------------------""")
    char(file_name)
    alpha(file_name)
    line(file_name)
    word(file_name)
    n = stop2(file_name)
    if m is not None:
        n = [fhash(e, m) for e in n]
    print("Bow = " + str(tally(n)))


if __name__ == "__main__":
    main()
# 6330537121 (18.58) 356 (2021-03-22 16:05)
def fhash(w, M):
    """Return sum(ord(w[e]) * 37**e) modulo M."""
    return sum(ord(ch) * 37 ** e for e, ch in enumerate(w)) % M


def split_words(text):
    """Return the lower-cased maximal alphanumeric runs of *text*."""
    return ''.join(ch.lower() if ch.isalnum() else ' ' for ch in text).split()


def analyse(lines, stopwords, M=None):
    """Return (char count, alphanumeric count, line count, word count, BoW).

    *lines* is a list of text lines (trailing newlines are not counted as
    characters). Words are compared with *stopwords* as whole words, not as
    substrings. The bag is a sorted list of [key, count] pairs, where key is
    the word itself, or fhash(word, M) when *M* is given.
    """
    banned = set(stopwords)
    char_count = alphanumeric_count = word_count = 0
    counts = {}
    for raw in lines:
        text = raw.rstrip('\n')
        char_count += len(text)
        alphanumeric_count += sum(1 for ch in text if ch.isalnum())
        words = split_words(text)
        word_count += len(words)
        for word in words:
            if word in banned:
                continue
            key = word if M is None else fhash(word, M)
            counts[key] = counts.get(key, 0) + 1
    bow = sorted([key, n] for key, n in counts.items())
    return char_count, alphanumeric_count, len(lines), word_count, bow


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    with open(input('File name = '), 'r') as file_name:
        lines = file_name.readlines()

    g = input('Use feature hashing ? (y,Y,n,N) ')
    while g not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        g = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if g.lower() == 'y' else None

    with open('stopwords.txt', 'r') as fn:
        stopwords = split_words(fn.read())

    char_count, alphanumeric_count, line_count, word_count, BoW3 = analyse(
        lines, stopwords, M)
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', BoW3)


if __name__ == '__main__':
    main()
# 6330538821 (16.20) 357 (2021-03-20 16:06)
# Built-in stop-word list, used when no other list is supplied.
stop = ['it', 'they', 'the', 'a', 'an', 'of', 'on', 'in', 'at', 'is', 'am', 'are', 'was', 'were']

ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def remove_punctuation(w):
    """Lower-case *w* and replace every character outside 0-9/a-z by a space."""
    return ''.join(c.lower() if c.lower() in ALNUM else ' ' for c in w)


def character_count(s):
    """Return the number of characters of *s* that are not newlines."""
    return sum(1 for char in s if char != '\n')


def af_count(s):
    """Return the number of characters of *s* whose lower-case form is in 0-9/a-z."""
    return sum(1 for j in s if j.lower() in ALNUM)


def word_c(s):
    """Return the number of words (alphanumeric runs) in *s*."""
    return len(remove_punctuation(s).split())


def load_stopwords(path='stopwords.txt'):
    """Return the lower-cased words of *path*, or the built-in list if it is missing."""
    try:
        with open(path, 'r') as handle:
            return handle.read().lower().split()
    except FileNotFoundError:
        return list(stop)


def tally(items):
    """Return the sorted list of [item, count] pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def bow(s, stopwords=None):
    """Return the sorted [word, count] bag of *s* without stop words.

    *stopwords* defaults to the module-level ``stop`` list.
    """
    banned = set(stop if stopwords is None else stopwords)
    return tally(i for i in remove_punctuation(s).split() if i not in banned)


def flash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m (the modulo is applied once, at the end)."""
    g = 37
    return sum(ord(ch) * g ** i for i, ch in enumerate(w)) % m


def flash_bow(s, m, stopwords=None):
    """Return the sorted [hash, count] bag of *s* without stop words.

    *stopwords* defaults to the module-level ``stop`` list.
    """
    banned = set(stop if stopwords is None else stopwords)
    return tally(flash(j, m) for j in remove_punctuation(s).split() if j not in banned)


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input('File name = ')
    feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')
    # Exact membership: a substring test would also accept '' or 'ny'.
    while feature_hashing not in ('n', 'y', 'N', 'Y'):
        print('Try again.')
        feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')

    with open(file_name, 'r') as file:
        lines = file.readlines()
    m = int(input('M = ')) if feature_hashing.lower() == 'y' else None

    stopwords = load_stopwords()
    z = ''.join(lines)
    print('-------------------')
    print('char count =', character_count(z))
    print('alphanumeric count =', af_count(z))
    print('line count =', len(lines))
    print('word count =', word_c(z))
    if m is None:
        print('BoW =', bow(z, stopwords))
    else:
        print('BoW =', flash_bow(z, m, stopwords))


if __name__ == '__main__':
    main()
# 6330539421 (26.00) 358 (2021-03-22 22:35)
# function
# ----------------------------------------
def wordddd(line, i):
    """Return the lower-cased alphanumeric words of *line*, scanning from index *i*."""
    found = []
    karm = ''
    for ch in line[i:]:
        if ch.isalnum():
            karm += ch
        elif karm:
            found.append(karm.lower())
            karm = ''
    if karm:  # word that runs up to the end of the line
        found.append(karm.lower())
    return found


def countword(line):
    """Return the lower-cased words of *line*, skipping leading non-alphanumerics."""
    i = 0
    while i < len(line) and not line[i].isalnum():
        i += 1
    return wordddd(line, i)


def hashing(wordddddd, M):
    """Return sum(ord(word[e]) * 37**e) modulo M."""
    G = 37
    return sum(ord(ch) * (G ** e) for e, ch in enumerate(wordddddd)) % M


def choicee():
    """Ask until a valid hashing answer is given; return (feature, M).

    M is the integer modulus when hashing was requested, otherwise None.
    """
    while True:
        choice = input('Use feature hashing ? (y,Y,n,N) ').strip()
        # Exact membership: a substring test would also accept '' or 'yN'.
        if choice in ('Y', 'y'):
            return True, int(input('M = ').strip())
        if choice in ('N', 'n'):
            return False, None
        print('Try again.')


# ----------------------------------------------
def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input('File name = ').strip()
    feature, M = choicee()

    with open('stopwords.txt', 'r') as stop:
        stopwords = set(stop.read().split())

    count = alnum = word = lenght = 0
    tally = {}
    with open(file_name, 'r') as file:
        for line in file:
            lenght += 1
            count += sum(1 for ch in line if ch != '\n')
            alnum += sum(1 for ch in line if ch.isalnum())
            for karm in countword(line.strip()):
                word += 1
                if karm in stopwords:
                    continue
                key = hashing(karm, M) if feature else karm
                tally[key] = tally.get(key, 0) + 1

    Bow = sorted([key, n] for key, n in tally.items())
    print('-------------------')
    print('char count =', count)
    print('alphanumeric count =', alnum)
    print('line count =', lenght)
    print('word count =', word)
    print('BoW =', Bow)


if __name__ == '__main__':
    main()
# 6330540021 (23.90) 359 (2021-03-21 21:03)
G = 37  # base of the polynomial feature hash


def fhash(w, M):
    """Return sum(ord(w[i]) * G**i) modulo M."""
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


def addToBoW(word, BoW, M):
    """Count *word* in the list *BoW* of [key, count] pairs, in place.

    With M == -1 the key is the word itself, otherwise it is fhash(word, M).
    """
    key = word if M == -1 else fhash(word, M)
    for entry in BoW:
        if entry[0] == key:
            entry[1] += 1
            return
    BoW.append([key, 1])


def parse_stopwords(lines):
    """Return every whitespace-separated word of *lines*.

    Lines holding a single stop word are kept too; only blank lines add nothing.
    """
    words = []
    for line in lines:
        words.extend(line.split())
    return words


def clean_words(line):
    """Return the lower-cased alphanumeric words of *line*."""
    return "".join(c.lower() if c.isalnum() else " " for c in line).split()


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    print("File name = ", end="")
    file_name = input()
    stopword_file_name = "stopwords.txt"
    M = -1  # -1 means "no feature hashing"

    hashing_enable = None
    while hashing_enable is None:
        print("Use feature hashing ? (y,Y,n,N) ", end="")
        u_input = input().lower()
        if u_input == "y":
            hashing_enable = True
        elif u_input == "n":
            hashing_enable = False
        else:
            print("Try again.")
    if hashing_enable:
        print("M = ", end="")
        M = int(input())

    with open(stopword_file_name, "r") as stopword_file:
        stopword_list = set(parse_stopwords(stopword_file))
    # Read the input once; every statistic below is computed from these lines.
    with open(file_name, "r") as input_file:
        lines = input_file.readlines()

    print("-------------------")
    print("char count =", sum(1 for line in lines for c in line if c != "\n"))
    print("alphanumeric count =", sum(1 for line in lines for c in line if c.isalnum()))
    print("line count =", len(lines))
    words = [word for line in lines for word in clean_words(line)]
    print("word count =", len(words))

    BoW = []
    for word in words:
        if word not in stopword_list:
            addToBoW(word, BoW, M)
    BoW.sort()
    print("BoW =", BoW)


if __name__ == "__main__":
    main()
# 6330541621 (11.83) 360 (2021-03-22 20:56)
def fhash(w, M):
    """Return sum(ord(w[a]) * 37**a) modulo int(M)."""
    G = 37
    return sum(ord(ch) * (G ** a) for a, ch in enumerate(w)) % int(M)


def split_words(text):
    """Return the lower-cased maximal alphanumeric runs of *text*."""
    return ''.join(ch.lower() if ch.isalnum() else ' ' for ch in text).split()


def tally(items):
    """Return the sorted list of [item, count] pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def summarize(lines):
    """Return (char count, alphanumeric count, line count, words) for *lines*.

    Only a trailing newline is removed from each line, so a last line without
    one keeps its final character. Every non-alphanumeric character separates
    words.
    """
    charcount = alphanumeric = 0
    words = []
    for raw in lines:
        text = raw.rstrip('\n')
        charcount += len(text)
        alphanumeric += sum(1 for ch in text if ch.isalnum())
        words.extend(split_words(text))
    return charcount, alphanumeric, len(lines), words


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    fn = input('File name = ')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
    # Exact membership: a substring test would also accept '' or 'yY'.
    while ufh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        ufh = input('Use feature hashing ? (y,Y,n,N) ')
    M = input('M = ') if ufh in ('y', 'Y') else None
    print('-------------------')

    with open('stopwords.txt', 'r') as file2:
        stopwords = set(file2.read().split())
    with open(fn, 'r') as file1:
        charcount, alphanumeric, linecount, words = summarize(file1.readlines())

    kept = [word for word in words if word not in stopwords]
    if M is not None:
        kept = [fhash(word, M) for word in kept]

    print('char count =', charcount)
    print('alphanumeric count =', alphanumeric)
    print('line count =', linecount)
    print('word count =', len(words))
    print('BoW =', tally(kept))


if __name__ == '__main__':
    main()
# 6330542221 (30.00) 361 (2021-03-22 21:58)
ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def sam(file_name):
    """Return the lines of *file_name* with surrounding whitespace stripped."""
    with open(file_name, 'r') as filesim:
        return [line.strip() for line in filesim]


def sentencechar(file_name):
    """Return all stripped lines of *file_name* concatenated."""
    return ''.join(sam(file_name))


def normalize(lines):
    """Join *lines* (each prefixed by a space), lower-cased, non 0-9/a-z as spaces."""
    text = ''.join(' ' + line.lower() for line in lines)
    return ''.join(ch if ch in ALNUM else ' ' for ch in text)


def sentencealpha(file_name):
    """Return the normalized text of *file_name* (see normalize)."""
    return normalize(sam(file_name))


def alphanumeric_count(file_name):
    """Return the number of 0-9/a-z characters in *file_name* after lower-casing."""
    return len(sentencealpha(file_name).replace(' ', ''))


def load_stopwords():
    """Return the lower-cased words of stopwords.txt as a set."""
    with open('stopwords.txt', 'r') as filestp:
        return set(filestp.read().lower().split())


def filter_words(text, stopwords):
    """Return the whitespace-separated words of *text* that are not in *stopwords*."""
    return [word for word in text.split() if word not in stopwords]


def tally(items):
    """Return the sorted list of [item, count] pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def BoW(file_name):
    """Return the sorted [word, count] bag of *file_name* without stop words."""
    return tally(filter_words(sentencealpha(file_name), load_stopwords()))


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


def feature_hashing(file_name, modulus=None):
    """Return the sorted [hash, count] bag of *file_name* without stop words.

    *modulus* is the hash modulus; when omitted, a module-level ``M`` is used,
    as older callers expect.
    """
    if modulus is None:
        modulus = M  # legacy: fall back to a global M set by the caller
    kept = filter_words(sentencealpha(file_name), load_stopwords())
    return tally(fhash(word, modulus) for word in kept)


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input('File name = ')
    modulus = None
    while True:
        fh = input('Use feature hashing ? (y,Y,n,N) ')
        if fh.lower() == 'y':
            modulus = int(input('M = '))
            break
        if fh.lower() == 'n':
            break
        print('Try again.')

    print('-------------------')
    # Read the input once and derive every statistic from the same lines.
    lines = sam(file_name)
    text = normalize(lines)
    print('char count =', len(''.join(lines)))
    print('alphanumeric count =', len(text.replace(' ', '')))
    print('line count =', len(lines))
    print('word count =', len(text.split()))
    kept = filter_words(text, load_stopwords())
    if modulus is not None:
        kept = [fhash(word, modulus) for word in kept]
    print('BoW =', tally(kept))


if __name__ == '__main__':
    main()
# 6330543921 (18.90) 362 (2021-03-21 14:05)
ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def read_file(file_name):
    """Return the lines of *file_name* (UTF-8) with surrounding whitespace stripped."""
    with open(file_name, encoding='utf-8') as file:
        return [line.strip() for line in file]


def split_words(text):
    """Return the lower-cased words of *text*; any non 0-9/a-z char is a separator."""
    return ''.join(c.lower() if c.lower() in ALNUM else ' ' for c in text).split()


def read_stopwords():
    """Return the words listed in stopwords.txt."""
    return split_words(' '.join(read_file('stopwords.txt')))


def alpha(s):
    """Return *s* lower-cased with every character outside 0-9/a-z removed."""
    return ''.join(c.lower() for c in s if c.lower() in ALNUM)


def alpha_joinlist(j):
    """Apply alpha() to every element of the list *j* in place and return it."""
    for i in range(len(j)):
        j[i] = alpha(j[i])
    return j


def fhash(ws, m):
    """Return sum(ord(ws[i]) * 37**i) modulo int(m)."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(ws)) % int(m)


def remove_stopwords(joinlists, stop_words):
    """Return the sorted non-empty words of *joinlists* that are not stop words."""
    banned = set(stop_words)
    return sorted(word for word in joinlists if word and word not in banned)


def tally(items):
    """Return the sorted list of [item, count] pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def bow_word(join_list, stop_words=None):
    """Return the sorted [word, count] bag of *join_list* without stop words.

    *stop_words* defaults to the content of stopwords.txt. Counting goes
    through a dict, so the first word is kept even when it equals the last.
    """
    if stop_words is None:
        stop_words = read_stopwords()
    return tally(remove_stopwords(alpha_joinlist(join_list), stop_words))


def bow_fhash(join_list, m, stop_words=None):
    """Return the sorted [hash, count] bag of *join_list* without stop words.

    *stop_words* defaults to the content of stopwords.txt.
    """
    if stop_words is None:
        stop_words = read_stopwords()
    removes = remove_stopwords(alpha_joinlist(join_list), stop_words)
    return tally(fhash(remove, m) for remove in removes)


def output(file_name, hash, m):
    """Print the statistics of *file_name* and return its bag.

    *hash* is the user's y/n answer; *m* is the modulus used when it is 'y'.
    """
    listfromfile = read_file(file_name)
    joinlist = ''.join(listfromfile)
    join_list = split_words(' '.join(listfromfile))
    print('-------------------')
    print('char count =', len(joinlist))
    print('alphanumeric count =', len(alpha(joinlist)))
    print('line count =', len(listfromfile))
    print('word count =', len(join_list))
    if hash.lower() == 'n':
        return bow_word(join_list)
    return bow_fhash(join_list, m)


def name():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input('File name = ')
    while True:
        answer = input('Use feature hashing ? (y,Y,n,N) ')
        if answer.lower() in ('y', 'n'):
            m = input('M = ') if answer.lower() == 'y' else ''
            print('BoW =', output(file_name, answer, m))
            return
        print('Try again.')


if __name__ == '__main__':
    name()
# 6330544521 (22.90) 363 (2021-03-21 10:58)
ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def fhash(ws, m):
    """Return sum(ord(ws[i]) * 37**i) modulo int(m)."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(ws)) % int(m)


def alpha(s):
    """Return *s* lower-cased with every character outside 0-9/a-z removed."""
    return ''.join(c.lower() for c in s if c.lower() in ALNUM)


def ajoinlist(jlist):
    """Apply alpha() to every element of the list *jlist* in place and return it."""
    for i in range(len(jlist)):
        jlist[i] = alpha(jlist[i])
    return jlist


def words_of(text):
    """Return the lower-cased words of *text*; any non 0-9/a-z char is a separator."""
    return ''.join(c.lower() if c.lower() in ALNUM else ' ' for c in text).split()


def read_file(file_name):
    """Return the lines of *file_name* (UTF-8) with surrounding whitespace stripped."""
    with open(file_name, encoding='utf-8') as file:
        return [line.strip() for line in file]


def read_stopwords():
    """Return the words listed in stopwords.txt."""
    return words_of(' '.join(read_file('stopwords.txt')))


def remove_stopwords(joinlists, stop_words):
    """Return the sorted non-empty words of *joinlists* that are not stop words."""
    banned = set(stop_words)
    return sorted(word for word in joinlists if word and word not in banned)


def count_sorted(items):
    """Return the sorted list of [item, count] pairs for *items*."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, n] for item, n in counts.items())


def bow_word(join_list, stop_words=None):
    """Return the sorted [word, count] bag of *join_list* without stop words.

    *stop_words* defaults to the content of stopwords.txt. Counting goes
    through a dict, so the first word is kept even when it equals the last.
    """
    if stop_words is None:
        stop_words = read_stopwords()
    return count_sorted(remove_stopwords(ajoinlist(join_list), stop_words))


def bow_fhash(join_list, m, stop_words=None):
    """Return the sorted [hash, count] bag of *join_list* without stop words.

    *stop_words* defaults to the content of stopwords.txt.
    """
    if stop_words is None:
        stop_words = read_stopwords()
    rmvs = remove_stopwords(ajoinlist(join_list), stop_words)
    return count_sorted(fhash(rmv, m) for rmv in rmvs)


def inputfirst():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input('File name = ')
    while True:
        answer = input('Use feature hashing ? (y,Y,n,N) ')
        if answer.lower() in ('y', 'n'):
            break
        print('Try again.')

    listfromfile = read_file(file_name)
    joinlist = ''.join(listfromfile)
    join_list = words_of(' '.join(listfromfile))
    m = input('M = ') if answer.lower() == 'y' else None

    print('-------------------')
    print('char count =', len(joinlist))
    print('alphanumeric count =', len(alpha(joinlist)))
    print('line count =', len(listfromfile))
    print('word count =', len(join_list))
    if m is None:
        bow = bow_word(join_list)
    else:
        bow = bow_fhash(join_list, m)
    print('BoW =', bow)


if __name__ == '__main__':
    inputfirst()
# 6330545121 (24.00) 364 (2021-03-22 02:49)
def initial(n):
    """Read file *n* and stopwords.txt, ask for the hashing mode, then run main()."""
    with open(n, 'r') as file:
        lines = file.readlines()
    numLine = len(lines)
    Word, nonAlnum = OprLine(lines)
    with open('stopwords.txt', 'r') as filestop:
        WordStop = filestop.read().split()

    # Loop instead of recursing, so a retry keeps the data that is already loaded.
    while True:
        print('Use feature hashing ? (y,Y,n,N) ', end='')
        a = input()
        if a in ['y', 'Y', 'n', 'N']:
            break
        print('Try again.')
    main(numLine, Word, a.lower(), nonAlnum, WordStop)


def OprLine(lines):
    """Return [text, nonAlnum] for *lines*.

    text is every line (newline removed) prefixed by one space, with each
    non-alphanumeric character replaced by a space; nonAlnum is the number of
    non-alphanumeric characters, the added spaces included.
    """
    joined = ''.join(' ' + line.strip('\n') for line in lines)
    nonAlnum = sum(1 for ch in joined if not ch.isalnum())
    cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in joined)
    return [cleaned, nonAlnum]


def fhash(w, M):
    """Return sum(ord(lower(w[n])) * 37**n) modulo int(M)."""
    G = 37
    return sum(ord(ch.lower()) * (G ** n) for n, ch in enumerate(w)) % int(M)


def countBoW(Data, WordStop, Opr, M=None):
    """Return the sorted bag of *Data* without the words of *WordStop*.

    Opr 'n' gives [word, count] pairs, Opr 'y' gives [hash, count] pairs; any
    other value gives an empty list. For Opr 'y' the modulus is *M*, or is
    read from the user when *M* is None. *Data* is not modified.
    """
    if Opr not in ('y', 'n'):
        return []
    banned = set(WordStop)
    kept = [word for word in Data if word not in banned]
    if Opr == 'y':
        if M is None:
            print('M = ', end='')
            M = input()
        kept = [fhash(word, M) for word in kept]
    counts = {}
    for key in kept:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main(numLine, Word, Opr, nonAlnum, WordStop):
    """Print the statistics and the bag for the text *Word* built by OprLine()."""
    Data0 = Word.split()
    Data = [i.lower() for i in Data0]
    BoW = countBoW(Data, WordStop, Opr)
    print('-------------------')
    # Word holds one extra leading space per line, hence the subtraction.
    print('char count =', len(Word) - numLine)
    print('alphanumeric count =', len(Word) - nonAlnum)
    print('line count =', numLine)
    print('word count =', len(Data0))
    print('BoW =', BoW)


# -------------------------------------------------------------------------------
if __name__ == '__main__':
    print('File name =', end=' ')
    initial(input())
# 6330547421 (26.90) 365 (2021-03-22 03:46)
def is_alnum(i):
    """Return True when the character *i* is an ASCII letter or digit."""
    return 'A' <= i <= 'Z' or 'a' <= i <= 'z' or '0' <= i <= '9'


def split_words(line):
    """Return the ASCII-alphanumeric words of *line*; anything else separates words."""
    return ''.join(i if is_alnum(i) else ' ' for i in line).split()


def char_count(f):
    """Return the total length of the stripped lines of the iterable *f*."""
    return sum(len(line.strip()) for line in f)


def alnum_count(f):
    """Return the number of ASCII letters and digits in the lines of *f*."""
    return sum(1 for line in f for i in line if is_alnum(i))


def line_count(f):
    """Return the number of lines in the iterable *f*."""
    return sum(1 for _ in f)


def word_count(f):
    """Return the number of words in the lines of *f*."""
    return sum(len(split_words(line)) for line in f)


def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


def bow(f, sw=(), M=None):
    """Return the sorted bag of the lines of *f* without the stop words *sw*.

    Words are lower-cased and split exactly like word_count() does. The keys
    are the words themselves, or fhash(word, M) when *M* is given.
    """
    banned = set(sw)
    counts = {}
    for line in f:
        for word in split_words(line.lower()):
            if word in banned:
                continue
            key = word if M is None else fhash(word, M)
            counts[key] = counts.get(key, 0) + 1
    return sorted([key, k] for key, k in counts.items())


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    file_name = input('File name = ')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
    while ufh not in ['n', 'N', 'y', 'Y']:
        print('Try again.')
        ufh = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if ufh in ['y', 'Y'] else None

    with open('stopwords.txt', "r") as stop_word:
        sw = stop_word.read().split()

    print('-------------------')
    # Read once; the helpers accept any iterable of lines.
    with open(file_name, "r") as f:
        lines = f.readlines()
    print('char count =', char_count(lines))
    print('alphanumeric count =', alnum_count(lines))
    print('line count =', line_count(lines))
    print('word count =', word_count(lines))
    print('BoW =', bow(lines, sw, M))


if __name__ == '__main__':
    main()
# 6330548021 (18.00) 366 (2021-03-21 12:59)
ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def remove_an(s):
    """Return *s* lower-cased with every character outside 0-9/a-z removed."""
    return ''.join(e.lower() for e in s if e.lower() in ALNUM)


def remove_an1(a):
    """Return *a* lower-cased with every character outside 0-9/a-z turned into a space."""
    return ''.join(e.lower() if e.lower() in ALNUM else ' ' for e in a)


def flhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M


def remove_stop_word(h, sw):
    """Return the items of *h* that are not in *sw*, in order."""
    banned = set(sw)
    return [e for e in h if e not in banned]


def count_w(G):
    """Return the sorted list of [item, count] pairs for *G* ([] when *G* is empty)."""
    counts = {}
    for item in G:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, coun] for item, coun in counts.items())


def main():
    """Prompt for the file and hashing mode, then print the statistics and the bag."""
    f = input('File name = ')
    uf = input('Use feature hashing ? (y,Y,n,N) ')

    with open('stopwords.txt', 'r') as file1:
        th = [u for line in file1 for u in remove_an1(line.strip()).split()]
    with open(f, 'r') as file:
        lines = file.read().splitlines()

    lines1 = [line.strip() for line in lines]
    ch = sum(len(c) for c in lines)
    al = sum(len(remove_an(e)) for e in lines1)
    ws = [u for t in lines1 for u in remove_an1(t).split()]
    ws01 = remove_stop_word(ws, th)

    while uf not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        uf = input('Use feature hashing ? (y,Y,n,N) ')

    if uf in ['y', 'Y']:
        M = int(input('M = '))
        bag = count_w([flhash(g, M) for g in ws01])
    else:
        bag = count_w(ws01)

    print('-------------------')
    print('char count =', ch)
    print('alphanumeric count =', al)
    print('line count =', len(lines))
    print('word count =', len(ws))
    print('BoW =', bag)


if __name__ == '__main__':
    main()
# 6330549721 (30.00) 367 (2021-03-20 18:08)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def read_lines(path):
    """Return the lines of the file at *path*, closing it afterwards."""
    with open(path) as handle:
        return handle.readlines()


def scan_line(raw):
    """Return ``(char_count, alnum_count, words)`` for one raw line.

    The line is stripped and lower-cased first; a character counts as
    alphanumeric when ``isalpha()`` or ``isnumeric()`` holds, and every
    other character separates words.
    """
    text = raw.strip().lower()
    flags = [ch.isalpha() or ch.isnumeric() for ch in text]
    spaced = ''.join(ch if ok else ' ' for ch, ok in zip(text, flags))
    return len(text), sum(flags), spaced.split()


def build_bow(words, stopword=(), M=None):
    """Return ``[[key, count], ...]`` sorted by key for non-stop *words*.

    The key is the word itself, or ``fhash(word, M)`` when *M* is given.
    """
    counts = {}
    for word in words:
        if word in stopword:
            continue
        key = word if M is None else fhash(word, M)
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input("File name = ")
    while True:
        isHashing = input("Use feature hashing ? (y,Y,n,N) ").lower()
        if isHashing in ("n", "y"):
            break
        print("Try again.")

    stopword = {
        word
        for entry in read_lines("stopwords.txt")
        for word in entry.strip().split()
    }

    line = read_lines(file_name)
    charcount = ancount = 0
    words = []
    for raw in line:
        n_chars, n_alnum, found = scan_line(raw)
        charcount += n_chars
        ancount += n_alnum
        words.extend(found)

    # M is requested only after the file has been read, as before.
    M = int(input("M = ")) if isHashing == "y" else None
    BoW = build_bow(words, stopword, M)

    print("-------------------")
    print("char count =", charcount)
    print("alphanumeric count =", ancount)
    print("line count =", len(line))
    print("word count =", len(words))
    print("BoW =", BoW)


if __name__ == "__main__":
    main()
# 6330550221 (30.00) 368 (2021-03-22 00:51)
# Text statistics plus a sorted bag of words, optionally feature-hashed.

ALNUM = frozenset('0123456789abcdefghijklmnopqrstuvwxyz')


def fhash(words, M):
    """Return the base-37 polynomial hash of string *words* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(words)) % M


def alnum_total(text):
    """Return how many characters of *text* are ASCII letters or digits."""
    return sum(1 for ch in text if ch.lower() in ALNUM)


def split_words(text):
    """Lower-case *text* and split it on every non-``isalnum`` character."""
    return ''.join(e.lower() if e.isalnum() else ' ' for e in text).split()


def count_items(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    a = input('File name = ')
    b = input('Use feature hashing ? (y,Y,n,N) ')
    while b not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = b not in ['n', 'N']
    if use_hash:
        # Convert once here instead of calling int(M) for every word.
        M = int(input('M = '))
    print('-------------------')

    with open('stopwords.txt', 'r') as stop_words:
        stop_w = {w for line in stop_words for w in line.split()}

    # One pass over the file replaces the three separate open() calls.
    with open(a, 'r') as file_name:
        lines = file_name.readlines()
    text = ''.join(lines)

    ch_test = sum(1 for ch in text if ch != '\n')
    al = alnum_total(text)
    ln = len(lines)
    f_NN = split_words(text)
    wc = len(f_NN)
    kept = [w for w in f_NN if w not in stop_w]
    if use_hash:
        kept = [fhash(w, M) for w in kept]

    print('char count = ' + str(ch_test))
    print('alphanumeric count = ' + str(al))
    print('line count = ' + str(ln))
    print('word count = ' + str(wc))
    print('BoW =', count_items(kept))


if __name__ == '__main__':
    main()
# 6330551921 (19.60) 369 (2021-03-22 23:02)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def is_alnum(ch):
    """Return True for a lower-case ASCII letter or any digit 0-9."""
    # The range starts at '0'; starting at '1' silently dropped the digit zero.
    return 'a' <= ch <= 'z' or '0' <= ch <= '9'


def char_count(doc):
    """Return the total number of characters in the lines of *doc*."""
    return sum(len(line) for line in doc)


def alphanumeric_count(doc):
    """Return how many letters and digits the lines of *doc* contain."""
    return sum(1 for e in ''.join(doc).lower() if is_alnum(e))


def line_count(doc):
    """Return the number of lines in *doc*."""
    return len(doc)


def word_find(doc):
    """Return the lower-cased words of *doc*, split on non-alphanumerics."""
    text = ' '.join(doc).lower()
    return ''.join(e if is_alnum(e) else ' ' for e in text).split()


def flash(word, m):
    """Return the base-37 polynomial hash of *word* modulo *m*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % m


def bow(doc, m, stop_doc=()):
    """Return the bag of words of *doc* as sorted ``[key, count]`` pairs.

    Words found in *stop_doc* (a list of stop-word lines) are skipped.
    When *m* is the string ``' '`` the words themselves are the keys;
    otherwise each word is replaced by ``flash(word, m)``.
    """
    # Tokenise the stop words once, not once per word of the text.
    stop = set(word_find(stop_doc))
    counts = {}
    for word in word_find(doc):
        if word in stop:
            continue
        key = word if m == ' ' else flash(word, m)
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def read_doc(path):
    """Return the lines of the file at *path* without newline characters."""
    with open(path, 'r') as handle:
        return [line.replace('\n', '') for line in handle.readlines()]


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input('File name = ')
    doc = read_doc(file_name)
    stop_doc = read_doc('stopwords.txt')
    while True:
        hashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
        if hashing in ('y', 'n'):
            m = int(input('M = ')) if hashing == 'y' else ' '
            print('-------------------')
            print('char count =', char_count(doc))
            print('alphanumeric count =', alphanumeric_count(doc))
            print('line count =', line_count(doc))
            print('word count =', len(word_find(doc)))
            print('BoW =', bow(doc, m, stop_doc))
            break
        print('Try again.')


if __name__ == '__main__':
    main()
# 6330552521 (0.00) 370 (2021-03-22 14:51)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def alnum_count(lines):
    """Return how many characters in *lines* satisfy ``str.isalnum``."""
    return sum(1 for x in ''.join(lines) if x.isalnum())


def bow(lss):
    """Return ``[[item, count], ...]`` for *lss* in first-occurrence order."""
    counts = {}
    for x in lss:
        counts[x] = counts.get(x, 0) + 1
    return [[key, n] for key, n in counts.items()]


def split_words(lines):
    """Return the lower-cased words of *lines*, split on non-alphanumerics."""
    words = []
    for line in lines:
        spaced = ''.join(c if c.isalnum() else ' ' for c in line.lower())
        words.extend(spaced.split())
    return words


def read_lines(path):
    """Return the lines of the file at *path* without trailing newlines."""
    with open(path, 'r') as handle:
        return [line.rstrip('\n') for line in handle]


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input('File name = ')
    # Ask again inside the loop; printing without re-prompting never ended.
    f_h = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while f_h not in ('y', 'n'):
        print('Try again.')
        f_h = input('Use feature hashing ? (y,Y,n,N) ').lower()
    M = int(input('M = ')) if f_h == 'y' else None

    print('-' * 19)
    sw_list = {w for line in read_lines('stopwords.txt') for w in line.split()}

    # Use the file the user named rather than a hard-coded 'sample.txt'.
    w_list1 = [line.lower() for line in read_lines(file_name)]
    print('char count =', len(''.join(w_list1)))
    print('alphanumeric count =', alnum_count(w_list1))
    print('line count =', len(w_list1))

    w_list = split_words(w_list1)
    print('word count =', len(w_list))

    w_l = [x for x in w_list if x not in sw_list]
    if M is not None:
        w_l = [fhash(x, M) for x in w_l]
    print('BoW =', sorted(bow(w_l)))


if __name__ == '__main__':
    main()
# 6330553121 (18.00) 371 (2021-03-22 22:55)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def cut(li):
    """Return the individual lower-cased words of the strings in *li*.

    Every character that is not ``isalnum`` acts as a word separator.
    """
    list_of_words = []
    for string in li:
        spaced = ''.join(w if w.isalnum() else ' ' for w in string.lower())
        list_of_words += spaced.split()
    return list_of_words


def f_count(nt):
    """Return ``[[value, count], ...]`` for *nt*, sorted by value.

    Returns ``[]`` for an empty input and leaves *nt* unmodified (the
    earlier version indexed ``nt[0]`` and sorted the caller's list).
    """
    counts = {}
    for item in nt:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input('File name = ')
    com = input('Use feature hashing ? (y,Y,n,N) ')
    while com not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        com = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if com in ['y', 'Y'] else None

    sens = []
    chc = 0
    linec = 0
    with open(file_name, 'r') as lines:
        for line in lines:
            if line.endswith('\n'):
                line = line[:-1]
            chc += len(line)
            sens.append(line.strip())
            linec += 1

    with open('stopwords.txt', 'r') as stopwords_file:
        stopwords = set(cut([e.strip() for e in stopwords_file]))

    textwords = cut(sens)
    alpc = sum(len(word) for word in textwords)
    wordc = len(textwords)
    norm_textwords = [w for w in textwords if w not in stopwords]
    if M is not None:
        norm_textwords = [fhash(st, M) for st in norm_textwords]

    print('-------------------')
    print('char count =', chc)
    print('alphanumeric count =', alpc)
    print('line count =', linec)
    print('word count =', wordc)
    print('BoW =', f_count(norm_textwords))


if __name__ == '__main__':
    main()
# 6330554821 (17.05) 372 (2021-03-21 20:09)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def split_words(text):
    """Split *text* into words on every non-``isalnum`` character."""
    return ''.join(c if c.isalnum() else ' ' for c in text).split()


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def count_sorted(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input('File name = ')
    use = input('Use feature hashing ? (y,Y,n,N) ')
    while use not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        use = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if use in ('Y', 'y') else None

    with open(file_name, 'r') as file:
        lines = [e.strip().lower() for e in file]
    line_count = len(lines)
    char_count = sum(len(line) for line in lines)

    # Tokenise line by line so the last word of one line cannot merge with
    # the first word of the next one.
    word = [w for line in lines for w in split_words(line)]
    alpha_count = sum(len(w) for w in word)
    word_count = len(word)

    print('-' * len('Use feature hashing'))
    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alpha_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(word_count))

    with open('stopwords.txt', 'r') as stop:
        stop_word = {w for e in stop for w in e.split()}
    word1 = [w for w in word if w not in stop_word]
    if M is not None:
        word1 = [fhash(w, M) for w in word1]
    print('BoW = ' + str(count_sorted(word1)))


if __name__ == '__main__':
    main()
# 6330555421 (16.47) 373 (2021-03-22 16:35)
# Text statistics plus a sorted bag of words, optionally feature-hashed.

# Full ASCII alphabet; the earlier constant had 'S'/'s' where 'X'/'x' belong.
alpha_num = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def split_words(lines):
    """Return the words of *lines*, split on every non-alphanumeric char.

    A word that ends a line without a trailing newline is still returned.
    """
    words = []
    for line in lines:
        spaced = ''.join(ch if ch in alpha_num else ' ' for ch in line)
        words.extend(spaced.split())
    return words


def count_sorted(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    with open(input('File name = ').strip(), 'r') as file_name:
        x = file_name.readlines()

    with open('stopwords.txt', 'r') as stopword_file:
        stopword = {w for line in stopword_file for w in line.split()}

    ans = input('Use feature hashing ? (y,Y,n,N) ')
    while ans not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        ans = input('Use feature hashing ? (y,Y,n,N) ')
    M = int(input('M = ')) if ans in ['y', 'Y'] else None

    print('-------------------')
    t = [line[:-1] if line.endswith('\n') else line for line in x]
    print('char count =', sum(len(line) for line in t))
    print('alphanumeric count =',
          sum(1 for line in t for c in line if c in alpha_num))
    print('line count =', len(x))

    lst_words = split_words(x)
    print('word count =', len(lst_words))

    # Lower-case the kept words so 'Cat' and 'cat' share one BoW entry.
    txt = [w.lower() for w in lst_words if w.lower() not in stopword]
    if M is not None:
        txt = [fhash(e, M) for e in txt]
    print('BoW =', count_sorted(txt))


if __name__ == '__main__':
    main()
# 6330556021 (24.90) 374 (2021-03-20 16:26)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def split_words(line):
    """Split *line* into words on every non-``isalnum`` character.

    Punctuation separates words instead of being deleted, so "don't"
    yields two words and a token made only of punctuation yields none.
    """
    return ''.join(k if k.isalnum() else ' ' for k in line).split()


def count_sorted(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input("File name = ")
    while True:
        choice = input("Use feature hashing ? (y,Y,n,N) ")
        if choice in ["y", "Y", "n", "N"]:
            break
        print("Try again.")

    with open("stopwords.txt", "r") as file2:
        stopwords = {j for i in file2 for j in i.lower().strip().split()}
    with open(file_name, "r") as file1:
        y = [i.lower().strip() for i in file1]

    words = [j for i in y for j in split_words(i)]
    char_count = len("".join(y))
    alphanumeric_count = len("".join(words))
    line_count = len(y)
    word_count = len(words)

    kept = [i for i in words if i not in stopwords]
    if choice in ["N", "n"]:
        show = count_sorted(kept)
    else:
        M = int(input("M = "))
        show = count_sorted([fhash(i, M) for i in kept])

    print("-------------------")
    print("char count =", char_count)
    print("alphanumeric count =", alphanumeric_count)
    print("line count =", line_count)
    print("word count =", word_count)
    print("BoW =", show)


if __name__ == "__main__":
    main()
# 6330557721 (23.55) 375 (2021-03-22 13:42)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def split_words(line):
    """Lower-case *line* and split it on every non-``isalnum`` character."""
    return ''.join(u if u.isalnum() else ' ' for u in line.lower()).split()


def count_sorted(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input('File name = ')
    findB = input('Use feature hashing ? (y,Y,n,N) ')
    while findB not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        findB = input('Use feature hashing ? (y,Y,n,N) ')

    with open('stopwords.txt', 'r') as stw:
        stopwod = {w for line in stw for w in line.strip().split()}

    with open(file_name, 'r') as rfile:
        lines = rfile.readlines()

    li = len(lines)
    # Count characters per line without its newline; subtracting a guessed
    # number of newlines was wrong for empty files and a final newline.
    aka = sum(len(line.rstrip('\n')) for line in lines)
    aln = sum(1 for line in lines for ch in line if ch.isalnum())
    words1 = [w for line in lines for w in split_words(line)]
    wo = len(words1)
    kept = [w for w in words1 if w not in stopwod]

    if findB in ['y', 'Y']:
        M = int(input('M = '))
        # Keep hash keys as integers so they sort numerically.
        kept = [fhash(w, M) for w in kept]

    print('-------------------')
    print('char count =', aka)
    print('alphanumeric count =', aln)
    print('line count =', li)
    print('word count =', wo)
    print('BoW = ' + str(count_sorted(kept)))


if __name__ == '__main__':
    main()
# 6330558321 (30.00) 376 (2021-03-22 00:43)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def read_stopwords(fd):
    """Return every whitespace-separated word of the file at path *fd*."""
    with open(fd, 'r') as f:
        return [word for line in f for word in line.split()]


def ask_hash():
    """Ask until the user answers y/Y (return True) or n/N (return False)."""
    # A loop, not recursion, so repeated bad answers cannot exhaust the stack.
    while True:
        hs = input("Use feature hashing ? (y,Y,n,N) ")
        if hs in ['y', 'Y']:
            return True
        if hs in ['n', 'N']:
            return False
        print("Try again.")


def hash_word(word, m):
    """Return the base-37 polynomial hash of *word* modulo *m*."""
    return sum(ord(c) * (37 ** i) for i, c in enumerate(word)) % m


def count_bow(bow):
    """Return ``[[item, count], ...]`` for *bow* in first-occurrence order.

    *bow* may be any iterable, including a one-shot iterator.
    """
    counts = {}
    for word in bow:
        counts[word] = counts.get(word, 0) + 1
    return [[word, n] for word, n in counts.items()]


def main():
    """Prompt for the file and hashing option, then print the report."""
    stop = set(read_stopwords("stopwords.txt"))
    fs = input("File name = ")

    nlines, nwords, nchars, nalphanum = 0, 0, 0, 0
    all_words = []
    with open(fs, 'r') as f:
        for line in f:
            line = line.strip('\n')
            nlines += 1
            nchars += len(line)
            nalphanum += sum(c.isalnum() for c in line)
            spaced = ''.join(c.lower() if c.isalnum() else " " for c in line)
            words = spaced.split()
            all_words.extend(words)
            nwords += len(words)

    bow = [word for word in all_words if word not in stop]
    if ask_hash():
        m = int(input("M = "))
        bow = [hash_word(word, m) for word in bow]

    # Separator line printed by the other submissions before the results.
    print("-------------------")
    print(f"char count = {nchars}")
    print(f"alphanumeric count = {nalphanum}")
    print(f"line count = {nlines}")
    print(f"word count = {nwords}")
    print(f"BoW = {sorted(count_bow(bow), key=lambda t: t[0])}")


if __name__ == "__main__":
    main()
# 6330559021 (27.00) 377 (2021-03-21 23:52)
# Text statistics plus a sorted bag of words, optionally feature-hashed.

ALNUM = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"


def stopword(s):
    """Return every whitespace-separated word of the file at path *s*."""
    with open(s) as stop:
        return [w for entry in stop for w in entry.strip().split()]


def charcount(s):
    """Return the number of non-newline characters in the file at *s*."""
    with open(s) as x:
        return sum(1 for row in x for e in row if e != "\n")


def alphanumeric(s):
    """Return the number of ASCII letters and digits in the file at *s*."""
    with open(s) as x:
        return sum(1 for row in x for e in row if e in ALNUM)


def split_words(text):
    """Lower-case *text* and split it on every non-alphanumeric character."""
    return "".join(e.lower() if e in ALNUM else " " for e in text).split()


def words(s):
    """Return the lower-cased alphanumeric words of the file at *s*."""
    with open(s) as x:
        return split_words(x.read())


def line(s):
    """Return the number of lines in the file at *s*."""
    with open(s) as x:
        return sum(1 for _ in x)


def count_sorted(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input("File name = ")
    feature = input("Use feature hashing ? (y,Y,n,N) ").lower()
    # Tuple membership: a substring test against "yn" accepted "" and "yn".
    while feature not in ("y", "n"):
        print("Try again.")
        feature = input("Use feature hashing ? (y,Y,n,N) ").lower()
    p = int(input("M = ")) if feature == "y" else None

    print("-------------------")
    print("char count =", charcount(file_name))
    print("alphanumeric count =", alphanumeric(file_name))
    print("line count =", line(file_name))
    x = words(file_name)
    print("word count =", len(x))

    # Load the stop words once instead of re-reading the file per word.
    stop = set(words("stopwords.txt"))
    k = [e for e in x if e not in stop]
    if p is not None:
        k = [sum(ord(c) * (37 ** i) for i, c in enumerate(e)) % p for e in k]
    print("BoW =", count_sorted(k))


if __name__ == "__main__":
    main()
# 6330560521 (30.00) 378 (2021-03-21 22:19)
# Text statistics plus a sorted bag of words, optionally feature-hashed.

alnum_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
              'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
              '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
ALNUM_SET = frozenset(alnum_list)


def split_line(line):
    """Lower-case and strip *line*, then split on non-alphanumerics."""
    cleaned = line.lower().strip()
    return ''.join(c if c in ALNUM_SET else ' ' for c in cleaned).split()


def load_stopwords(path='stopwords.txt'):
    """Return the whitespace-separated words of the stop-word file."""
    with open(path, 'r') as stopwords:
        return [e for entry in stopwords for e in entry.split()]


def char_count(file_name):
    """Return the number of non-newline characters in *file_name*."""
    with open(file_name, 'r') as txt:
        return sum(1 for row in txt for c in row if c != '\n')


def alnum_count(file_name):
    """Return the number of ASCII letters and digits in *file_name*."""
    with open(file_name, 'r') as txt:
        return sum(1 for row in txt for c in row.lower() if c in ALNUM_SET)


def line_count(file_name):
    """Return the number of lines in *file_name*."""
    with open(file_name, 'r') as txt:
        return sum(1 for _ in txt)


def word_count(file_name):
    """Return the number of alphanumeric words in *file_name*."""
    with open(file_name, 'r') as txt:
        return sum(len(split_line(row)) for row in txt)


def real_txt(file_name, stop_words=None):
    """Return the sorted words of *file_name* with stop words removed.

    *stop_words* defaults to the contents of ``stopwords.txt``.
    """
    if stop_words is None:
        stop_words = load_stopwords()
    stop_words = set(stop_words)
    with open(file_name, 'r') as txt:
        list_words = [
            e for row in txt for e in split_line(row) if e not in stop_words
        ]
    list_words.sort()
    return list_words


def get_unique(words):
    """Sort *words* in place and return its distinct values in order."""
    words.sort()
    unique_words = []
    for item in words:
        if not unique_words or unique_words[-1] != item:
            unique_words.append(item)
    return unique_words


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def count_sorted(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def run():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input('File name = ')
    while True:
        use_fh = input('Use feature hashing ? (y,Y,n,N) ')
        if use_fh in ['y', 'Y']:
            M = int(input('M = '))
            break
        if use_fh in ['n', 'N']:
            M = None
            break
        print('Try again.')

    print('-------------------')
    print('char count = ' + str(char_count(file_name)))
    print('alphanumeric count = ' + str(alnum_count(file_name)))
    print('line count = ' + str(line_count(file_name)))
    print('word count = ' + str(word_count(file_name)))
    items = real_txt(file_name)
    if M is not None:
        items = [fhash(e, M) for e in items]
    print('BoW = ' + str(count_sorted(items)))


if __name__ == '__main__':
    run()
# 6330561121 (16.70) 379 (2021-03-18 14:40)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def split_words(line):
    """Lower-case *line* and split it on every non-``isalnum`` character.

    Blank lines give an empty list, and punctuation inside a token splits
    it, so a stop word followed by a comma is still recognised later.
    """
    return ''.join(c if c.isalnum() else ' ' for c in line.lower()).split()


def count_sorted(items):
    """Return ``[[item, count], ...]`` for *items*, sorted by item."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([key, n] for key, n in counts.items())


def main():
    """Prompt for the file and hashing option, then print the report."""
    file_name = input("File name = ")
    while True:
        choice = input("Use feature hashing ? (y,Y,n,N) ")
        if choice in ('y', 'Y'):
            M = int(input("M = "))
            break
        if choice in ('n', 'N'):
            M = None
            break
        print("Try again.")
    print("-------------------")

    with open("stopwords.txt", "r") as stop:
        stpw = {w for entry in stop for w in entry.strip().split()}

    charcount = alphancount = lincount = wordc = 0
    tempword = []
    with open(file_name, "r") as file:
        for row in file:
            body = row.strip("\n")
            charcount += len(body)
            alphancount += sum(1 for c in body if c.isalnum())
            found = split_words(body)
            wordc += len(found)
            # Filter stop words after punctuation has been removed.
            tempword.extend(w for w in found if w not in stpw)
            lincount += 1

    if M is not None:
        tempword = [fhash(w, M) for w in tempword]

    print("char count =", charcount)
    print("alphanumeric count =", alphancount)
    print("line count =", lincount)
    print("word count =", wordc)
    print("BoW =", count_sorted(tempword))


if __name__ == "__main__":
    main()
# 6330562821 (30.00) 380 (2021-03-21 15:59)
# Text statistics plus a sorted bag of words, optionally feature-hashed.


def line_to_words(s):
    """Return the words of *s*, split on every non-``isalnum`` character.

    Always returns a list (empty when *s* holds no word).
    """
    return ''.join(e if e.isalnum() else ' ' for e in s).split()


def fhash(w, M):
    """Return the base-37 polynomial hash of string *w* modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def Bag_of_words(w, M):
    """Merge ``[word, count]`` pairs of *w* by ``fhash(word, M)``.

    Returns ``[[hash, total], ...]`` sorted by hash.  Totals are kept in a
    dict keyed by the hashes that actually occur, so memory does not grow
    with *M*.
    """
    totals = {}
    for word, count in w:
        key = fhash(word, M)
        totals[key] = totals.get(key, 0) + count
    return sorted([key, n] for key, n in totals.items() if n != 0)


def main():
    """Prompt for the file and hashing option, then print the report."""
    with open('stopwords.txt', 'r') as fn:
        stopw = {
            e.lower() for entry in fn for e in line_to_words(entry.strip())
        }

    file = input('File name = ').strip()
    while True:
        key = input('Use feature hashing ? (y,Y,n,N) ')
        if key.upper() == 'Y' or key.upper() == 'N':
            break
        print('Try again.')
    if key.upper() == 'Y':
        M = int(input('M = '))

    counts_ch = counts_alnum = counts_line = counts_word = 0
    tally = {}
    with open(file, 'r') as fn:
        for row in fn:
            counts_ch += sum(1 for e in row if e != '\n')
            counts_alnum += sum(1 for e in row if e.isalnum())
            counts_line += 1
            x = line_to_words(row)
            counts_word += len(x)
            for e in x:
                low = e.lower()
                if low not in stopw:
                    tally[low] = tally.get(low, 0) + 1
    words = sorted([word, n] for word, n in tally.items())

    print('-' * 19)
    print('char count =', counts_ch)
    print('alphanumeric count =', counts_alnum)
    print('line count =', counts_line)
    print('word count =', counts_word)
    if key.upper() == 'Y':
        print('BoW =', Bag_of_words(words, M))
    else:
        print('BoW =', words)


if __name__ == '__main__':
    main()
# 6330563421 (30.00) 381 (2021-03-22 20:25)
_LOWER_ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'
_WORD_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + _LOWER_ALNUM


def fhash(w, M):
    """Return the base-37 polynomial hash of string w, reduced modulo M."""
    fsum = 0
    for i, ch in enumerate(w):
        fsum += ord(ch) * (37 ** i)
    return fsum % M


def alphanume_count(x):
    """Count the characters of x that are ASCII letters or digits."""
    return sum(1 for k in x.lower() if k in _LOWER_ALNUM)


def word_count(x):
    """Count the maximal runs of ASCII letters/digits in x."""
    return len(''.join(k if k in _WORD_CHARS else ' ' for k in x).split())


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input('File name = ')
    M = None  # None means "no feature hashing"
    while True:
        ask = input('Use feature hashing ? (y,Y,n,N) ')
        if ask in ('y', 'Y'):
            M = int(input('M = '))
            break
        if ask in ('n', 'N'):
            break
        print('Try again.')
    print('-------------------')

    with open('stopwords.txt', 'r') as sw:
        stop_word = set(sw.read().lower().split())

    lines = []
    with open(file_name, 'r') as fn:
        for line in fn:
            # Drop only the newline: leading/trailing blanks are characters too.
            lines.append(line.rstrip('\n'))
    char_count = sum(len(line) for line in lines)
    sample = ' '.join(lines)

    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alphanume_count(sample)))
    print('line count = ' + str(len(lines)))
    print('word count = ' + str(word_count(sample)))

    cleaned = ''.join(k if k in _LOWER_ALNUM else ' ' for k in sample.lower())
    counts = {}
    for k in cleaned.split():
        if k in stop_word:
            continue
        key = k if M is None else fhash(k, M)
        counts[key] = counts.get(key, 0) + 1
    print('BoW = ' + str(sorted([key, total] for key, total in counts.items())))


if __name__ == '__main__':
    main()
# 6330564021 (30.00) 382 (2021-03-22 01:34)
#----------------------------------------------------------
def _is_alnum(c):
    """Return True when c is a lower-case ASCII letter or a digit."""
    return 'a' <= c <= 'z' or '0' <= c <= '9'


def _tally(items):
    """Return [[item, count], ...] in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


def _kept_words(file_name, stop_name):
    """Return the words of file_name that are not listed in stop_name."""
    stops = set(stop_words(stop_name))
    return [word for word in words(file_name) if word not in stops]


def char_count(file_name):
    """Count the characters of the file, line terminators excluded."""
    with open(file_name, 'r') as fn:
        return sum(len(line[:-1]) if line.endswith('\n') else len(line) for line in fn)


def alp_count(file_name):
    """Count the ASCII letters and digits in the file."""
    with open(file_name, 'r') as fn:
        return sum(1 for line in fn for c in line.lower() if _is_alnum(c))


def line_count(file_name):
    """Count the lines of the file."""
    with open(file_name, 'r') as fn:
        return sum(1 for _ in fn)


def stop_words(stop_name):
    """Return the lower-cased, whitespace-separated words of the stop-word file."""
    k = []
    with open(stop_name, 'r') as fn:
        for line in fn:
            k += line.lower().strip().split()
    return k


def words(file_name):
    """Return every lower-cased word (run of ASCII letters/digits) of the file."""
    k = []
    with open(file_name, 'r') as fn:
        for line in fn:
            k += ''.join(c if _is_alnum(c) else ' ' for c in line.lower()).split()
    return k


def BoW(file_name, stop_name):
    """Return [[word, count], ...] for the non-stop words, in first-seen order."""
    return _tally(_kept_words(file_name, stop_name))


def f_hashing(file_name, stop_name, M):
    """Return [[hash, count], ...] for the non-stop words, ordered by hash."""
    hashed = []
    for word in _kept_words(file_name, stop_name):
        n = 0
        for i, ch in enumerate(word):
            n += ord(ch) * (37 ** i)
        hashed.append(n % M)
    return _tally(sorted(hashed))
#----------------------------------------------------------


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    stop_name = 'stopwords.txt'
    file_name = input('File name = ')
    while True:
        t = input('Use feature hashing ? (y,Y,n,N) ').lower()
        if t == 'y' or t == 'n':
            break
        print('Try again.')
    if t == 'y':
        M = int(input('M = '))
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alp_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', len(words(file_name)))
    if t == 'y':
        print('BoW =', f_hashing(file_name, stop_name, M))
    else:
        print('BoW =', BoW(file_name, stop_name))


if __name__ == '__main__':
    main()
# 6330565721 (4.00) 383 (2021-03-22 21:11)
# Lower-cased words of the input file; main() fills this list in place so that
# count_words() can keep its original single-argument form.
h = []


def fhash(o, p):
    """Return the base-37 polynomial hash of string o, reduced modulo p."""
    f = 0
    for i, ch in enumerate(o):
        f = (f + ord(ch) * 37 ** i) % p
    return f


def count_words(o, words=None):
    """Return how many times o occurs in words (default: the module list h)."""
    if words is None:
        words = h
    return words.count(o)


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input('File name = ')
    x = input("Use feature hashing ? (y,Y,n,N) ")
    while x not in ('n', 'N', 'y', 'Y'):
        print('Try again.')
        # The answer must be stored back into x, otherwise the loop never ends.
        x = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = x in ('y', 'Y')
    if use_hash:
        y = int(input('M = '))
    print('-------------------')

    with open('stopwords.txt', 'r') as s:
        ss = set(s.read().split())
    with open(file_name, 'r') as z:
        lines = [line.rstrip('\n') for line in z]

    ct1 = sum(len(line) for line in lines)
    ct2 = sum(1 for line in lines for ch in line if ch.isalnum())
    ct3 = len(lines)
    h[:] = [
        word
        for line in lines
        for word in ''.join(ch if ch.isalnum() else ' ' for ch in line.lower()).split()
    ]
    ct4 = len(h)

    print('char count =', ct1)
    print('alphanumeric count =', ct2)
    print('line count =', ct3)
    print('word count =', ct4)

    counts = {}
    for word in h:
        if word in ss:
            continue
        key = fhash(word, y) if use_hash else word
        counts[key] = counts.get(key, 0) + 1
    BW = sorted([key, total] for key, total in counts.items())
    print('BoW =', BW)


if __name__ == '__main__':
    main()
# 6330566321 (30.00) 384 (2021-03-22 22:29)
def get_words(txt):
    """Return the lower-cased runs of ASCII letters/digits found in txt."""
    words = []
    tmp = ''
    for c in txt.lower():
        if ('a' <= c <= 'z') or ('0' <= c <= '9'):
            tmp += c
        elif tmp != '':
            words.append(tmp)
            tmp = ''
    if tmp != '':
        words.append(tmp)
    return words


def fhash(w, M):
    """Return the base-37 polynomial hash of string w, reduced modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def BoW(words, M):
    """Return the sorted [[item, count], ...] bag; items are hashed when M != 0."""
    if M != 0:
        words = [fhash(word, M) for word in words]
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return sorted([word, total] for word, total in counts.items())


def remove_stop(words, stopwords):
    """Return the words that are not in stopwords, keeping their order."""
    return [word for word in words if word not in stopwords]


def _valid_modulus(text):
    """Return True when text is a non-empty ASCII digit string naming a non-zero int."""
    if text == '' or not all('0' <= c <= '9' for c in text):
        return False
    return int(text) != 0


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    M = 0
    file = input('File name = ')
    useFhash = input('Use feature hashing ? (y,Y,n,N) ')
    while useFhash not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        useFhash = input('Use feature hashing ? (y,Y,n,N) ')
    if useFhash in ['y', 'Y']:
        M = input('M = ')
        # Check each character: comparing the whole string let values such as
        # '1a' or '' through to int().
        while not _valid_modulus(M):
            print('Try again.')
            M = input('M = ')
        M = int(M)

    with open(file) as f:
        raw = f.read()
    txt = raw.lower()
    charCount = len(raw) - raw.count('\n')
    # A trailing newline ends the last line; it does not start a new one.
    lineCount = raw.count('\n') + (1 if raw and not raw.endswith('\n') else 0)
    alphanumCount = sum(1 for c in txt if ('a' <= c <= 'z') or ('0' <= c <= '9'))
    words = get_words(txt)

    print('-------------------')
    print('char count =', charCount)
    print('alphanumeric count =', alphanumCount)
    print('line count =', lineCount)
    print('word count =', len(words))

    with open('stopwords.txt') as f:
        stopwords = f.read().lower().split()
    print('BoW =', BoW(remove_stop(words, stopwords), M))


if __name__ == '__main__':
    main()
# 6330567021 (22.75) 385 (2021-03-20 21:59)
# Stop words shared by gogoy()/gogon(); main() fills this list in place.
word_2_stop = []


def no_sign(slice):
    """Return slice with every non-alphanumeric character replaced by a space.

    Covering all punctuation (not a fixed subset) keeps marks such as '!' or
    '-' from staying glued to words.
    """
    return ''.join(i if i.isalnum() else ' ' for i in slice)


def list_2_string(l):
    """Concatenate the string form of every element of l."""
    return ''.join(str(i) for i in l)


def fhash(list_word, M):
    """Return the base-37 polynomial hash of list_word, reduced modulo M."""
    p = 0
    for i, ch in enumerate(list_word):
        p += ord(ch) * (37 ** i)
    return p % M


def no_repeat(slice):
    """Return the distinct elements of slice in order of first appearance."""
    once_word = []
    for item in slice:
        if item not in once_word:
            once_word.append(item)
    return once_word


def bow_y_1(slice, m):
    """Return [[hash, count], ...] for the words in slice, ordered by hash."""
    counts = {}
    for word in slice:
        key = fhash(word, m)
        counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]


###################
def bow_n_coop(slice):
    """Return [[word, count], ...] for the words in slice, in first-seen order."""
    counts = {}
    for word in slice:
        counts[word] = counts.get(word, 0) + 1
    return [[word, total] for word, total in counts.items()]


####################
def list_present_y(duo_list):
    """Format [[hash, count], ...] as text, skipping entries whose count is 0."""
    parts = ['[' + str(i[0]) + ', ' + str(i[1]) + ']' for i in duo_list if i[1] != 0]
    return '[' + ', '.join(parts) + ']'


def list_present_n(bow_of_n):
    """Format [[word, count], ...] as text with quoted words, skipping zero counts."""
    parts = ["['" + str(i[0]) + "', " + str(i[1]) + ']' for i in bow_of_n if i[1] != 0]
    return '[' + ', '.join(parts) + ']'


def string_from_list(slice):
    """Join every word of a list of word lists, each followed by one space."""
    return ''.join(j + ' ' for i in slice for j in i)


###################
def splt_list_to_list(slice):
    """Flatten a list of lists into one list."""
    return [j for i in slice for j in i]


def clear_list_but_repeat_nosign_nospace_small_letter(slice):
    """Return all lower-cased, punctuation-free words of a list of word lists."""
    return no_sign(string_from_list(slice).lower()).split()


def stop_words_layer(all_word_list, word_2_stop):
    """Return the words of all_word_list that are not in word_2_stop."""
    return [i for i in all_word_list if i not in word_2_stop]


def _report(all_word_list, each_line, bow_text):
    """Print the separator, the four counts and the already formatted BoW."""
    print('-------------------')
    print('char count = ' + str(len(list_2_string(each_line))))
    print('alphanumeric count = ' + str(len(list_2_string(all_word_list))))
    print('line count = ' + str(len(each_line)))
    print('word count = ' + str(len(all_word_list)))
    print('BoW = ' + bow_text)


def gogoy(word_list, m, each_line):
    """Print the report using feature hashing with modulus m."""
    all_word_list = clear_list_but_repeat_nosign_nospace_small_letter(word_list)
    kept = stop_words_layer(all_word_list, word_2_stop)
    _report(all_word_list, each_line, list_present_y(bow_y_1(kept, m)))


def gogon(word_list, each_line):
    """Print the report with plain word counts (no hashing)."""
    all_word_list = clear_list_but_repeat_nosign_nospace_small_letter(word_list)
    kept = stop_words_layer(all_word_list, word_2_stop)
    _report(all_word_list, each_line, list_present_n(bow_n_coop(kept)))


def main():
    """Load the stop words and the input file, prompt, then print the report."""
    with open('stopwords.txt', 'r') as file_stopper:
        word_2_stop[:] = splt_list_to_list([jline.split() for jline in file_stopper])
    file_name = input('File name = ').strip()
    with open(file_name, 'r') as file:
        # Drop only the newline so that blanks still count as characters.
        each_line = [line.rstrip('\n') for line in file]
    word_list = [line.split() for line in each_line]

    choice = input('Use feature hashing ? (y,Y,n,N) ').strip()
    # Compare against the four answers: `choice in 'yYnN'` also accepted ''.
    while choice not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        choice = input('Use feature hashing ? (y,Y,n,N) ').strip()
    if choice in ('y', 'Y'):
        m_order = int(input('M = '))
        gogoy(word_list, m_order, each_line)
    else:
        gogon(word_list, each_line)


if __name__ == '__main__':
    main()
# 6330568621 (21.40) 386 (2021-03-21 14:41)
def fhash(w, M):
    """Return the base-37 polynomial hash of string w, reduced modulo M."""
    s = 0
    for idx, char in enumerate(w):
        s += ord(char) * 37 ** idx
    return s % M


def bowList(word_list, stopwords, isDo, M):
    """Build the bag of words for word_list, ignoring the given stopwords.

    With isDo true the result is [[hash, count], ...] ordered by hash (modulus
    M); otherwise it is [[word, count], ...] in order of first appearance.
    """
    word_freq = {}
    for word in word_list:
        if word not in stopwords:
            word_freq[word] = word_freq.get(word, 0) + 1
    if not isDo:
        return [[word, total] for word, total in word_freq.items()]
    buckets = {}
    for word, total in word_freq.items():
        key = fhash(word, M)
        buckets[key] = buckets.get(key, 0) + total
    return [[key, buckets[key]] for key in sorted(buckets) if buckets[key] != 0]


def count(text):
    """Return (alphanumeric count, word count, word list) for text.

    Words are the lower-cased runs of letters/digits of text.
    """
    alp_count = 0
    cleaned = []
    for char in text.lower():
        if char.isalpha() or char.isdigit():
            alp_count += 1
            cleaned.append(char)
        else:
            cleaned.append(" ")
    word_list = "".join(cleaned).split()
    return alp_count, len(word_list), word_list


def read_file(fname):
    """Return (char count, alnum count, line count, word count, words) of fname."""
    with open(fname, "r") as f:
        lines = [line.rstrip("\n") for line in f]
    char_count = sum(len(line) for line in lines)
    # Keep a separator between lines; gluing them together fused the last word
    # of one line with the first word of the next.
    alp_count, word_count, word_list = count("\n".join(lines))
    return char_count, alp_count, len(lines), word_count, word_list


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input("File name = ")
    BoW = input("Use feature hashing ? (y,Y,n,N) ")
    while BoW not in ("N", "n", "Y", "y"):
        print("Try again.")
        BoW = input("Use feature hashing ? (y,Y,n,N) ")

    stopwords = []
    with open("stopwords.txt") as stop_file:
        for line in stop_file:
            stopwords += line.strip().split()

    char_count, alp_count, line_count, word_count, word_list = read_file(file_name)
    if BoW.upper() == "Y":
        M = int(input("M = "))
        BowList = bowList(word_list, stopwords, True, M)
    else:
        BowList = bowList(word_list, stopwords, False, 0)

    print("-------------------")
    print("char count =", char_count)
    print("alphanumeric count =", alp_count)
    print("line count =", line_count)
    print("word count =", word_count)
    print("BoW =", BowList)


if __name__ == "__main__":
    main()
# 6330570821 (25.15) 387 (2021-03-21 18:24)
#===========================(DefFunc)====================================#
# Module-level defaults used by Bownorm()/Bowflash() when the optional
# arguments are omitted.
stopword = []
M = None


def flash(w, M):
    """Return the base-37 polynomial hash of string w, reduced modulo M."""
    x = 0
    for t, e in enumerate(w):
        x += ord(e) * (37 ** t)
    return x % M


def _file_words(filename):
    """Return the lower-cased alphanumeric words of the file, in order."""
    with open(filename, 'r') as file:
        text = file.read()
    return ''.join(ch if ch.isalnum() else ' ' for ch in text).lower().split()


def _tally(items):
    """Return [[item, count], ...] in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


def char_count(filename):
    """Count the characters of the file, line terminators excluded."""
    with open(filename, 'r') as file:
        return sum(len(line[:-1]) if line.endswith('\n') else len(line) for line in file)


def alphanumeric_count(filename):
    """Count the alphanumeric characters of the file."""
    with open(filename, 'r') as file:
        return sum(1 for line in file for ch in line if ch.isalnum())


def line_count(filename):
    """Count the lines of the file."""
    with open(filename, 'r') as file:
        return sum(1 for _ in file)


def word_count(filename):
    """Count the alphanumeric words of the file."""
    return len(_file_words(filename))


def Bownorm(filename, stopwords=None):
    """Return [[word, count], ...] for the non-stop words, in first-seen order.

    Words are lower-cased before counting so that 'It' and 'it' share an entry.
    """
    stops = stopword if stopwords is None else stopwords
    return _tally([e for e in _file_words(filename) if e not in stops])


def Bowflash(filename, stopwords=None, modulus=None):
    """Return [[hash, count], ...] for the lower-cased non-stop words."""
    stops = stopword if stopwords is None else stopwords
    mod = M if modulus is None else modulus
    return _tally([flash(e, mod) for e in _file_words(filename) if e not in stops])
#===========================(DefFunc)====================================#


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input('File name = ').strip()
    hashingcmd = input('Use feature hashing ? (y,Y,n,N) ')
    while hashingcmd not in ['y', 'Y', 'n', 'N']:
        print('''Try again.''')
        hashingcmd = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = hashingcmd.lower() == 'y'
    if use_hash:
        modulus = int(input('M = '))

    stops = []
    with open('stopwords.txt', 'r') as stop:
        for line in stop:
            stops += line.strip().split()

    #-------------------------------(count)----------------------------------#
    print('-------------------')
    print('char count =', str(char_count(file_name)))
    print('alphanumeric count =', str(alphanumeric_count(file_name)))
    print('line count =', str(line_count(file_name)))
    print('word count =', str(word_count(file_name)))
    #================================(BoW)===================================#
    if use_hash:
        BoW = Bowflash(file_name, stops, modulus)
    else:
        BoW = Bownorm(file_name, stops)
    print('BoW =', BoW)


if __name__ == '__main__':
    main()
# 6330571421 (17.78) 388 (2021-03-21 16:29)
#Prog-08: Bag-of-words
#6330571421 (17.78) Name Anik Romyanon
_ALNUM = '1234567890abcdefghijklmnopqrstuvwxyz'


def feature_hashing(word, M):
    """Return the base-37 polynomial hash of word, reduced modulo int(M)."""
    value = 0
    for i, ch in enumerate(word):
        value += ord(ch) * (37 ** i)
    return value % int(M)


def cleaning(wordlist):
    """Split every entry of wordlist at non-alphanumeric characters.

    Case is preserved; the pieces of all entries are returned in one flat list.
    """
    cleaned = []
    for word in wordlist:
        newword = ''.join(e if e.lower() in _ALNUM else ' ' for e in word)
        cleaned.extend(newword.split())
    return cleaned


# bool selects whether the words are returned with symbols removed
def reading(filename, bool):
    """Read filename into words and lines.

    Returns (cleaned words, raw lines) when bool is true, otherwise only the
    whitespace-separated words.
    """
    wordlist = []
    linelist = []
    with open(filename, 'r') as file:
        for line in file:
            wordlist.extend(line.split())
            linelist.append(line)
    if bool:
        return cleaning(wordlist), linelist
    return wordlist


def counting(filename):
    """Count the characters of the file, line terminators excluded."""
    with open(filename, 'r') as file:
        # Strip only the newline; slicing also dropped real characters.
        return sum(len(line.rstrip('\n')) for line in file)


def alph_counting(filename):
    """Count the ASCII letters and digits of the file."""
    with open(filename, 'r') as file:
        return sum(1 for line in file for e in line.rstrip('\n') if e.lower() in _ALNUM)


def bag_of_words(wordsinbag, bool, M=None):
    """Return [[item, count], ...] in first-seen order.

    When bool is true the words are replaced by their hashes; the modulus is
    taken from M, or prompted for when M is None.
    """
    data = wordsinbag
    if bool:
        if M is None:
            M = input('M = ')
        data = [feature_hashing(e, M) for e in wordsinbag]
    counts = {}
    for e in data:
        counts[e] = counts.get(e, 0) + 1
    return [[e, c] for e, c in counts.items()]


#---------------------------------------------
def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input("File name = ")
    choice = input('Use feature hashing ? (y,Y,n,N) ')
    while choice not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        choice = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = choice in ('y', 'Y')
    # Ask for M before any output so the prompt does not follow the counts.
    M = int(input('M = ')) if use_hash else None

    stopwordlist = reading('stopwords.txt', False)  # read the stop words
    filewordlist, linelist = reading(file_name, True)  # read the input file

    print('-------------------')
    print('char count = ' + str(counting(file_name)))
    print('alphanumeric count = ' + str(alph_counting(file_name)))
    print('line count = ' + str(len(linelist)))
    print('word count = ' + str(len(filewordlist)))

    # second cleaning pass: lower-case and drop the stop words
    interestedwords = [e.lower() for e in filewordlist if e.lower() not in stopwordlist]
    bagofwords = bag_of_words(interestedwords, use_hash, M)
    print('BoW = ' + str(sorted(bagofwords)))


if __name__ == '__main__':
    main()
# 6330572021 (30.00) 389 (2021-03-22 15:09)
def fhash(w, m):
    """Return the base-37 polynomial hash of string w, reduced modulo m."""
    p = 0
    for i, ch in enumerate(w):
        p += ord(ch) * (37 ** i)
    return p % m


def bow(sen):
    """Return [[item, count], ...] for the items of sen, in first-seen order."""
    counts = {}
    for e in sen:
        counts[e] = counts.get(e, 0) + 1
    return [[e, c] for e, c in counts.items()]


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input('File name=')
    f = input('Use feature hashing ? (y,Y,n,N)')
    while f not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        f = input('Use feature hashing ? (y,Y,n,N)')
    use_hash = f in ('y', 'Y')
    if use_hash:
        m = int(input('M='))

    with open('stopwords.txt', 'r') as stopw:
        stop = set(stopw.read().lower().split())

    char = 0
    al = 0
    l = 0
    pieces = []
    with open(file_name, 'r') as file:
        for line in file:
            l += 1
            for c in line:
                if c.isalnum():
                    al += 1
                    pieces.append(c)
                else:
                    pieces.append(' ')
                # Count everything except the newline itself; `char-l+1` was
                # only right for files without a trailing newline.
                if c != '\n':
                    char += 1
    s = ''.join(pieces).lower().split()
    sent = [p for p in s if p not in stop]

    print('-------------------')
    print('char count = ', char)
    print('alphanumeric count = ', al)
    print('line count = ', l)
    print('word count = ', len(s))
    if use_hash:
        BoW = bow([fhash(q, m) for q in sent])
    else:
        BoW = bow(sent)
    BoW.sort()
    print('BoW = ', BoW)


if __name__ == '__main__':
    main()
# 6330573721 (13.00) 390 (2021-03-21 17:07)
def char_count(w):
    """Count the characters of text w, newline characters excluded."""
    return len(w) - w.count('\n')


def alphanumeric_count(w):
    """Count the alphanumeric characters of text w."""
    return sum(1 for i in w if i.isalnum())


def line_count(w):
    """Count the lines of text w, including a last line without a newline."""
    return w.count('\n') + (1 if w and not w.endswith('\n') else 0)


def word_count(w):
    """Return the number of items in the iterable w."""
    return sum(1 for _ in w)


def fhash_calc(w, m):
    """Return the base-37 hash of the alphanumeric characters of w, modulo m.

    w is a list of characters (or a string); an input with no alphanumeric
    character hashes to 0 instead of raising IndexError.
    """
    G = 37
    chars = [i for i in w if i.isalnum()]
    return sum(ord(c) * (G ** times) for times, c in enumerate(chars)) % m


def _tally(items):
    """Return [[item, count], ...] in order of first appearance."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return [[item, total] for item, total in counts.items()]


def fhash(w, m):
    """Return [[hash, count], ...] for the list of strings w, in first-seen order."""
    return _tally([fhash_calc(list(word), m) for word in w])


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input("File name = ")
    while True:
        feature = input("Use feature hashing ? (y,Y,n,N) ").lower()
        if feature in ['y', 'n']:
            break
        print("Try again.")
    with open('stopwords.txt', 'r') as stop_file:
        stop_words = set(stop_file.read().split())
    if feature == 'y':
        M = int(input("M = "))
    print('-------------------')

    with open(file_name, 'r') as f:
        text = f.read()
    # Split at every non-alphanumeric character, not only at whitespace.
    tokens = ''.join(c if c.isalnum() else ' ' for c in text.lower()).split()

    print("char count = " + str(char_count(text)))
    print("alphanumeric count = " + str(alphanumeric_count(text)))
    print("line count = " + str(line_count(text)))
    print("word count = " + str(word_count(tokens)))

    words = [token for token in tokens if token not in stop_words]
    res = fhash(words, M) if feature == 'y' else _tally(words)
    print("BoW = " + str(sorted(res)))


if __name__ == "__main__":
    main()
# 6330574321 (0.00) 391 (2021-03-22 22:28)
def fhash(j, p):
    """Return the base-37 polynomial hash of string j, reduced modulo p."""
    c = 0
    for e, ch in enumerate(j):
        c = c + ord(ch) * (37 ** e)
    return int(c % p)


def _words(text):
    """Return the runs of ASCII lower-case letters/digits found in text."""
    found = []
    current = ""
    for ch in text:
        if "0" <= ch <= "9" or "a" <= ch <= "z":
            current += ch
        elif current != "":
            found.append(current)
            current = ""
    if current != "":
        found.append(current)
    return found


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file = input("File name = ")
    fea = input("Use feature hashing ? (y,Y,n,N) ")
    while fea not in ("y", "Y", "n", "N"):
        print("Try again.")
        fea = input("Use feature hashing ? (y,Y,n,N) ")
    use_hash = fea in ("y", "Y")
    if use_hash:
        M = int(input("M = "))

    # Open the file the user named rather than a hard-coded sample file.
    with open(file, "r") as src:
        raw = src.read()
    with open("stopwords.txt", "r") as src:
        s = set(_words(src.read().lower()))

    f1 = raw.lower()
    k = len(raw) - raw.count("\n")
    n = sum(1 for i in f1 if "0" <= i <= "9" or "a" <= i <= "z")
    l = raw.count("\n") + (1 if raw and not raw.endswith("\n") else 0)
    w = _words(f1)

    print("-------------------")
    print("char count = " + str(k))
    print("alphanumeric count = " + str(n))
    print("line count = " + str(l))
    print("word count = " + str(len(w)))

    counts = {}
    for e in w:
        if e in s:
            continue
        key = fhash(e, M) if use_hash else e
        counts[key] = counts.get(key, 0) + 1
    o = sorted([key, total] for key, total in counts.items())
    print("BoW = " + str(o))


if __name__ == "__main__":
    main()
# 6330575021 (30.00) 392 (2021-03-21 18:03)
# Module-level defaults used by alc(), wc() and rbof() when called without
# arguments.
file = None
M = None


def dosam(file):
    """Return the lower-cased alphanumeric words of the given file."""
    with open(file, 'r', errors='ignore') as src:
        a = src.read().lower()
    return ''.join(i if i.isalnum() else ' ' for i in a).split()


#-------------------------------------------------
def dostop():
    """Return the whitespace-separated words of stopwords.txt."""
    with open('stopwords.txt', 'r', errors='ignore') as src:
        return src.read().split()


#-------------------------------------------------
def alc(path=None):
    """Return the total number of characters in the words of the file."""
    # Read the file once instead of once per word.
    return sum(len(word) for word in dosam(file if path is None else path))


#-------------------------------------------------
def cc(file):
    """Count the characters of the file, line terminators excluded."""
    with open(file, 'r', errors='ignore') as a:
        return sum(len(b) - 1 if b.endswith('\n') else len(b) for b in a)


#-------------------------------------------------
def lc(file):
    """Count the lines of the file."""
    with open(file, 'r', errors='ignore') as a:
        return sum(1 for _ in a)


#-------------------------------------------------
def wc(path=None):
    """Return the number of words in the file."""
    return len(dosam(file if path is None else path))


#-------------------------------------------------
def _kept(path):
    """Return the sorted words of path that are not stop words."""
    x = set(dostop())
    return sorted(word for word in dosam(path) if word not in x)


def bownofh(file):
    """Return [[word, count], ...] for the non-stop words, ordered by word."""
    return arrange(_kept(file))


#-------------------------------------------------
def arrange(sen):
    """Group equal neighbours of the sorted list sen into [[str(item), count], ...].

    An empty input gives an empty list.
    """
    groups = []
    for item in sen:
        if groups and groups[-1][0] == str(item):
            groups[-1][1] += 1
        else:
            groups.append([str(item), 1])
    return groups


#-------------------------------------------------
def rbof(path=None, modulus=None):
    """Return [[hash, count], ...] for the non-stop words, ordered by hash."""
    path = file if path is None else path
    modulus = M if modulus is None else modulus
    l = sorted(fhash(word, modulus) for word in _kept(path))
    # No length test here: a genuine two-bucket result must not be reported
    # as an empty bag.
    return [[int(key), total] for key, total in arrange(l)]


#-------------------------------------------------
def fhash(w, M):
    """Return the base-37 polynomial hash of string w, reduced modulo M."""
    G = 37
    d = 0
    for i, ch in enumerate(w):
        d += ord(ch) * (G ** i)
    return d % M


#-------------------------------------------------
def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    path = input('File name = ')
    x = input('Use feature hashing ? (y,Y,n,N) ')
    while x not in ('n', 'N', 'y', 'Y'):
        print('Try again' + '.')
        x = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = x in ('y', 'Y')
    if use_hash:
        modulus = int(input("M = "))
    print('-------------------')
    print('char count =', cc(path))
    print('alphanumeric count =', alc(path))
    print('line count =', lc(path))
    print('word count =', wc(path))
    if use_hash:
        print('BoW =', rbof(path, modulus))
    else:
        print('BoW =', bownofh(path))


if __name__ == '__main__':
    main()
# 6330576621 (22.99) 393 (2021-03-21 23:33)
_ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def remove_pun(s):
    """Return (words, alnum_count) for text s.

    words are the lower-cased runs of ASCII letters/digits, including a final
    run that reaches the end of s; alnum_count is the number of such characters.
    """
    out = ""
    i = 0
    b = []
    for c in s:
        if c.lower() in _ALNUM:
            out += c.lower()
            i += 1
        elif out != "":
            b.append(out)
            out = ""
    if out != "":
        # Text that ends on a letter/digit still owes us its last word.
        b.append(out)
    return b, i


def fhash_list(bow, M):
    """Expand [[word, count], ...] into a flat list holding each hash count times."""
    hashed = []
    for word, count in bow:
        value = sum(ord(c) * (37 ** j) for j, c in enumerate(word)) % M
        hashed.extend([value] * count)
    return hashed


def _tally_sorted(items):
    """Return the sorted [[item, count], ...] list for items."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, total] for item, total in counts.items())


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input("File name = ")
    while True:
        feature = input("Use feature hashing ? (y,Y,n,N) ")
        # Compare against the answers: `feature in "y,Y"` also accepted ''.
        if feature in ("y", "Y"):
            M = int(input("M = "))
            a = 1
            break
        if feature in ("n", "N"):
            a = 0
            break
        print("Try again.")

    with open(file_name, "r") as fn:
        line = fn.readlines()
    char = "".join(line)
    with open("stopwords.txt", "r") as s:
        st = set(s.read().split())

    x, y = remove_pun(char)
    bownew = _tally_sorted(e for e in x if e not in st)

    print("-------------------")
    print("char count = " + str(len(char) - char.count("\n")))
    print("alphanumeric count = " + str(y))
    print("line count = " + str(len(line)))
    print("word count = " + str(len(x)))
    if a == 1:
        print("BoW = " + str(_tally_sorted(fhash_list(bownew, M))))
    else:
        print("BoW = " + str(bownew))


if __name__ == "__main__":
    main()
# 6330577221 (30.00) 394 (2021-03-22 02:52)
#Prog-08: Bag-of-Words
#6330577221 (30.00) Name Akrachai Kovittayanun
def fhash(w, M):
    """Return the base-37 polynomial hash of string w, reduced modulo M."""
    allord = 0
    for i, ch in enumerate(w):
        allord += ord(ch) * 37 ** i
    return allord % M


def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    filename = input('File name = ')
    bow = input('Use feature hashing ? (y,Y,n,N) ')
    while bow not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        bow = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = bow in ('y', 'Y')
    if use_hash:
        m = int(input('M = '))
    print('-------------------')

    with open('stopwords.txt', 'r') as s_file:
        stopwords = set(s_file.read().split())

    with open(filename, 'r') as file:
        # Drop only the newline so that blanks still count as characters.
        lines = [line.rstrip('\n') for line in file]
    text = ' '.join(lines)
    char = sum(len(line) for line in lines)
    alnum = sum(1 for e in text if e.isalnum())
    wordslist = ''.join(e.lower() if e.isalnum() else ' ' for e in text).split()

    print('char count =', char)
    print('alphanumeric count =', alnum)
    print('line count =', len(lines))
    print('word count =', len(wordslist))

    # A dict keeps first-seen order and counts each item in a single pass.
    counts = {}
    for e in wordslist:
        if e in stopwords:
            continue
        key = fhash(e, m) if use_hash else e
        counts[key] = counts.get(key, 0) + 1
    output = [[key, point] for key, point in counts.items()]
    print('BoW =', output)


if __name__ == '__main__':
    main()
# 6330578921 (25.10) 395 (2021-03-20 14:28)
def remove_punc(t):
    """Return t without the punctuation characters in the fixed list below."""
    punctuation = ['(', ')', '-', '_', '[', ']', '"', "'", ';', ':', '>', '<', '.', ',', '\\', '/']
    return ''.join(e for e in t if e not in punctuation)


def remove_stopword(t, s):
    """Concatenate the elements of t that are not contained in s."""
    return ''.join(e for e in t if e not in s)


def countitem(l):
    """Return [[item, count], ...] for the items of l, in first-seen order."""
    counts = {}
    for i in l:
        counts[i] = counts.get(i, 0) + 1
    return [[i, total] for i, total in counts.items()]


def fhashword(w, M):
    """Return the base-37 polynomial hash of string w, reduced modulo M."""
    fh = 0
    for n, ch in enumerate(w):
        fh += ord(ch) * 37 ** n
    return fh % M


def _tokenize(text):
    """Lower-case text and return its maximal runs of alphanumeric characters."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()


#--------------------------------------------------------------------------
def main():
    """Prompt for the file and hashing option, then print counts and the BoW."""
    file_name = input('File name = ')
    wantfh = input('Use feature hashing ? (y,Y,n,N) ')
    while wantfh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        wantfh = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = wantfh in ('y', 'Y')
    if use_hash:
        M = int(input('M = '))
    # The separator follows the last prompt; it used to precede 'M = '.
    print('-------------------')

    with open(file_name, 'r') as file:
        lines = [line.rstrip('\n') for line in file]
    with open('stopwords.txt', 'r') as file:
        stopword = set(file.read().split())

    words = _tokenize(' '.join(lines))
    kept = [word for word in words if word not in stopword]
    if use_hash:
        bag = countitem(sorted(fhashword(word, M) for word in kept))
    else:
        bag = countitem(kept)

    print('char count =', sum(len(line) for line in lines))
    print('alphanumeric count =', sum(1 for line in lines for ch in line if ch.isalnum()))
    print('line count =', len(lines))
    print('word count =', len(words))
    print('BoW =', bag)


if __name__ == '__main__':
    main()
# 6330579521 (30.00) 396 (2021-03-22 12:12)
"""Bag-of-words summary of a text file, with optional feature hashing."""


def fhash(w, m):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *m*."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % m


def alnum_only(a):
    """Return the maximal runs of alphanumeric characters in *a*, in order."""
    cut = []
    run = ''
    for ch in a:
        if ch.isalnum():
            run += ch
        elif run:
            cut.append(run)
            run = ''
    if run:
        cut.append(run)
    return cut


def count_in_order(items):
    """Return ``[item, count]`` pairs in order of first appearance."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, n] for item, n in tally.items()]


def hashed_bow(bow, M):
    """Fold a ``[word, count]`` list into ``[hash, count]`` pairs sorted by hash.

    Totals are accumulated per hash instead of scanning range(M), so the cost
    no longer grows with the size of M.
    """
    totals = {}
    for word, n in bow:
        h = fhash(word, M)
        totals[h] = totals.get(h, 0) + n
    return [[h, totals[h]] for h in sorted(totals)]


def read_lowered_lines(path):
    """Return the stripped, lower-cased lines of the file at *path*."""
    with open(path, 'r') as f:
        return [line.strip().lower() for line in f]


def ask_options():
    """Prompt for the file name and hashing choice; return (file_name, M).

    M is None when feature hashing was declined.
    """
    print('File name = ', end='')
    file_name = input()
    print('Use feature hashing ? (y,Y,n,N) ', end='')
    while True:
        yesno = input()
        if yesno in ['y', 'Y', 'n', 'N']:
            break
        print('Try again.')
        print('Use feature hashing ? (y,Y,n,N) ', end='')
    M = None
    if yesno in ('y', 'Y'):
        print('M = ', end='')
        M = int(input())
    return file_name, M


def main():
    """Run the prompts, then print the counts and the bag of words."""
    file_name, M = ask_options()
    print('-------------------')
    stopwords = set()
    for line in read_lowered_lines('stopwords.txt'):
        stopwords.update(line.split())
    # lower-casing is applied right from the moment the file is read
    listf2 = read_lowered_lines(file_name)
    charcount = sum(len(line) for line in listf2)
    alnumcount = sum(1 for line in listf2 for ch in line if ch.isalnum())
    list_word = [word for line in listf2 for word in alnum_only(line)]
    print('char count =', charcount)
    print('alphanumeric count =', alnumcount)
    print('line count =', len(listf2))
    print('word count =', len(list_word))
    bow = count_in_order(w for w in list_word if w not in stopwords)
    if M is None:
        print('BoW =', bow)
    else:
        print('BoW =', hashed_bow(bow, M))


if __name__ == '__main__':
    main()
# 6330580021 (26.67) 397 (2021-03-22 03:48)
"""Bag-of-words summary of a text file, with optional feature hashing."""

ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def tokenize(fout):
    """Return the lower-cased ASCII alphanumeric words found in *fout*.

    (Formerly named ``list``, which shadowed the builtin.)
    """
    out = ''.join(ch.lower() if ch.lower() in ALNUM else ' ' for ch in fout)
    return out.split()


def deleate_stopword(fout, stopword):
    """Return the words of *fout* that do not appear in *stopword*."""
    return [word for word in fout if word not in stopword]


def _sorted_counts(items):
    """Return ``[item, count]`` pairs sorted by item."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, tally[item]] for item in sorted(tally)]


def BoW(real_list):
    """Return the sorted ``[word, count]`` bag of words for *real_list*."""
    return _sorted_counts(real_list)


def fhash(w, M):
    """Return the sorted ``[hash, count]`` bag for the list of words *w*.

    Each word hashes to sum(ord(word[j]) * 37**j) modulo *M*.
    """
    hashes = [sum(ord(ch) * 37 ** j for j, ch in enumerate(word)) % M
              for word in w]
    return _sorted_counts(hashes)


def main():
    """Read the inputs, prompt for the hashing option and print the report."""
    with open(input('File name = '), 'r') as file:
        fout = file.read()
    # NOTE(review): other submissions read 'stopwords.txt' - confirm the name.
    with open('stopword.txt', 'r') as stop:
        stopword = stop.read().split()

    line_list = fout.split('\n')
    if line_list[-1] == '':
        # a trailing newline does not start another line
        line_list = line_list[:-1]
    real_fout = ''.join(line_list)
    listfile = tokenize(fout)

    a = input('Use feature hashing ? (y,Y,n,N) ')
    while a not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    if a in ['y', 'Y']:
        M = int(input('M = '))

    print('-------------------')
    print('char count = ', len(real_fout))
    print('alphanumeric count = ', len(''.join(listfile)))
    print('line count = ', len(line_list))
    print('word count = ', len(listfile))
    kept = deleate_stopword(listfile, stopword)
    if a in ['y', 'Y']:
        print('BoW = ', fhash(kept, M))
    else:
        print('BoW = ', BoW(kept))


if __name__ == '__main__':
    main()
# 6330583021 (28.00) 398 (2021-03-21 21:07)
"""Bag-of-words summary of a text file, with optional feature hashing."""


def is_alnum(ch):
    """Return True when *ch* is an ASCII letter or digit."""
    return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'


def tokenize(text):
    """Return the lower-cased ASCII alphanumeric words of *text*."""
    spaced = ''.join(ch.lower() if is_alnum(ch) else ' ' for ch in text)
    return spaced.split()


def cutstop(file_name):
    """Return the words of *file_name* that are not listed in stopwords.txt."""
    with open('stopwords.txt', 'r') as f:
        stopwords = set(f.read().split())
    with open(file_name, 'r') as t:
        words = tokenize(t.read())
    return [word for word in words if word not in stopwords]


def four(file):
    """Print the separator and the char/alphanumeric/line/word counts."""
    with open(file, 'r') as f:
        lines = f.readlines()
    s1 = sum(len(line) - line.count('\n') for line in lines)
    s2 = sum(1 for line in lines for ch in line if is_alnum(ch))
    s3 = len(lines)
    s5 = len(tokenize(''.join(lines)))
    print('-------------------')
    print('char count = ' + str(s1))
    print('alphanumeric count = ' + str(s2))
    print('line count = ' + str(s3))
    print('word count = ' + str(s5))


def count_in_order(items):
    """Return ``[item, count]`` pairs in order of first appearance."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, n] for item, n in tally.items()]


def hashed_bow(words, m):
    """Return ``[hash, count]`` pairs sorted by hash; ``[]`` for no words."""
    tally = {}
    for word in words:
        h = sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % m
        tally[h] = tally.get(h, 0) + 1
    return [[h, tally[h]] for h in sorted(tally)]


def main():
    """Prompt for the options and print the report."""
    print('File name =', end=' ')
    file_name = input()
    while True:
        print('Use feature hashing ? (y,Y,n,N)', end=' ')
        met = input()
        # Exact match: a substring test against 'nNyY' also accepted '' and
        # 'yY', after which neither branch ran and nothing was printed.
        if met in ('y', 'Y', 'n', 'N'):
            met = met.lower()
            break
        print('Try again.')
    if met == 'n':
        four(file_name)
        print('BoW =', count_in_order(cutstop(file_name)))
    else:
        print('M = ', end='')
        m = int(input())
        four(file_name)
        print('BoW =', hashed_bow(cutstop(file_name), m))


if __name__ == '__main__':
    main()
# 6330585221 (30.00) 399 (2021-03-20 21:06)
"""Bag-of-words summary of a text file, with optional feature hashing."""


def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M


def edit_string(file):
    """Return the lower-cased alphanumeric words of the file at path *file*."""
    with open(file) as fo:
        text = fo.read()
    spaced = ''.join(e.lower() if e.isalnum() else ' ' for e in text.lower())
    return spaced.split()


def sameword(sample):
    """Return the distinct values of *sample* in sorted order."""
    return sorted(set(sample))


def frequency(list_):
    """Return the occurrence count of each distinct value, in sorted order.

    An empty input yields ``[]`` (the previous version returned ``''``).
    """
    tally = {}
    for item in list_:
        tally[item] = tally.get(item, 0) + 1
    return [tally[item] for item in sorted(tally)]


def bag_of_words(words, stop_words, feature, M):
    """Return the sorted bag of words for *words*, skipping *stop_words*.

    With *feature* ``'n'``/``'N'`` the keys are the words themselves;
    otherwise they are integer feature hashes, ordered numerically (they used
    to be compared as strings, which put 10 before 9).
    """
    stop = set(stop_words)
    edited = [e.lower() for e in words if e.lower() not in stop]
    if feature.lower() == 'n':
        keys = edited
    else:
        keys = [fhash(e, M) for e in edited]
    return [[key, n] for key, n in zip(sameword(keys), frequency(keys))]


def BoW(sample, stopwords, feature, M):
    """Return the bag of words for file *sample* using stop-word file *stopwords*.

    Each file is read exactly once.
    """
    return bag_of_words(edit_string(sample), edit_string(stopwords), feature, M)


def main():
    """Prompt for the options and print the report."""
    file_name = input('File name = ')
    M = 0
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    while feature.lower() != 'n' and feature.lower() != 'y':
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
    if feature.lower() == 'y':
        M = int(input('M = '))
    c_alnum = 0
    c_char = 0
    c_line = 0
    with open(file_name) as fn:
        for line in fn:
            c_alnum += sum(1 for e in line if e.lower().isalnum())
            c_char += len(line) - line.count('\n')
            c_line += 1
    print('-' * 19)
    print('char count =', c_char)
    print('alphanumeric count =', c_alnum)
    print('line count =', c_line)
    print('word count =', len(edit_string(file_name)))
    print('BoW =', BoW(file_name, 'stopwords.txt', feature, M))


if __name__ == '__main__':
    main()
# 6330586921 (24.67) 400 (2021-03-22 23:29)
"""Bag-of-words summary of a text file, with optional feature hashing."""

ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def remove_punctuation(s):
    """Return str(*s*) lower-cased, with non ASCII-alphanumerics as spaces."""
    return ''.join(c.lower() if c.lower() in ALNUM else ' ' for c in str(s))


def read_file(f):
    """Return the lines of the open file (or iterable of lines) *f* as a list."""
    return [line for line in f]


def char_count(f):
    """Count the characters in the lines *f*, excluding newlines."""
    return sum(1 for line in f for e in line if e != "\n")


def alphabet_count(f):
    """Count the alphanumeric characters in the lines *f*."""
    return sum(1 for line in f for e in line if e.isalnum())


def word_count(f):
    """Count the alphanumeric words in the lines *f*."""
    return sum(len(remove_punctuation(e).split()) for e in f)


def read_stopword(stopword):
    """Return every whitespace-separated stop word, lower-cased."""
    return [word.lower() for line in stopword for word in line.split()]


def count(a):
    """Return sorted ``[item, count]`` pairs for the items of *a*.

    String items are re-tokenised with remove_punctuation first.  Other items
    (such as integer hashes) are counted as they are; they used to be turned
    into strings, which produced string keys in lexicographic order.
    """
    tally = {}
    for item in a:
        parts = remove_punctuation(item).split() if isinstance(item, str) else [item]
        for part in parts:
            tally[part] = tally.get(part, 0) + 1
    return [[key, tally[key]] for key in sorted(tally)]


def remove_stopword(line, stopword):
    """Return the words of *line* not in *stopword*, each followed by a space."""
    return ''.join(word + ' ' for word in line.split() if word not in stopword)


def kept_words(File, stopword):
    """Return the non-stop words of the lines *File*, in reading order."""
    text = ''
    for line in File:
        text += remove_stopword(remove_punctuation(line), stopword) + ' '
    return text.split()


def BoW_not_have_fhash(File, stopword):
    """Return the sorted ``[word, count]`` bag of words."""
    return count(kept_words(File, stopword))


def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *M*."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def BoW_have_fhash(File, stopword, M):
    """Return the sorted ``[hash, count]`` bag of hashed words."""
    return count([fhash(word, M) for word in kept_words(File, stopword)])


def main():
    """Read the inputs, prompt for the hashing option and print the report."""
    file_name = input("File name = ")
    with open(file_name) as handle:
        f = read_file(handle)
    # NOTE(review): other submissions read 'stopwords.txt' - confirm the name.
    with open('stopword.txt') as handle:
        stopword = read_stopword(handle)
    while True:
        input_it = input("Use feature hashing ? (y,Y,n,N) ")
        if input_it == 'Y' or input_it == 'y':
            M = int(input("M = "))
            break
        elif input_it == 'N' or input_it == 'n':
            # None (not False) so that an entered M of 0 is not mistaken
            # for "no hashing"
            M = None
            break
        else:
            print("Try again.")
    print('-------------------')
    print("char count = " + str(char_count(f)))
    print("alphanumeric count = " + str(alphabet_count(f)))
    print("line count = " + str(len(f)))
    print("word count = " + str(word_count(f)))
    if M is None:
        print("BoW = " + str(BoW_not_have_fhash(f, stopword)))
    else:
        print("BoW = " + str(BoW_have_fhash(f, stopword, M)))


if __name__ == '__main__':
    main()
# 6330587521 (26.00) 401 (2021-03-22 14:53)
#Prog-08: Bag-of-words
#6330587521 (26.00) Aunchisa Suwanchatree
#-----------------------------------------------------------------------------------------------------
def choose(ans):
    """Return True for a y/Y answer and False for n/N, re-prompting otherwise."""
    ans = ans.lower()
    while ans not in ('y', 'n'):
        print('Try again.')
        ans = input('Use feature hashing ? (y,Y,n,N) ').lower()
    return ans == 'y'


def remove_punc(text):
    """Return *text* with every non-alphanumeric character replaced by a space."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)


def stopword():
    """Return the words of stopwords.txt, lower-cased and punctuation-free."""
    result = ''
    with open('stopwords.txt', 'r') as sw:
        for line in sw:
            result += remove_punc(line.lower().strip()) + ' '
    return result.split()


def delete_stopword(listofword):
    """Return the words of *listofword* that are not stop words.

    The stop-word file is read once instead of once per word.
    """
    stops = set(stopword())
    return [e for e in listofword if e not in stops]


def fhash(w, m):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *m*."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % m


def count_sorted(items):
    """Return ``[item, count]`` pairs sorted by item; ``[]`` for no items.

    Replaces the sentinel-append run-length loops, which raised IndexError
    when every word had been removed as a stop word.
    """
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, tally[item]] for item in sorted(tally)]


#-----------------------------------------------------------------------------------------------------
def main():
    """Prompt for the options and print the report."""
    file_name = input('File name = ')
    choosing = choose(input('Use feature hashing ? (y,Y,n,N) '))
    if choosing:
        m = int(input('M = '))
    wordcount = 0
    linecount = 0
    chcount = 0
    aln = 0
    text = ''
    with open(file_name, 'r') as lines:
        for line in lines:
            linecount += 1
            # count the characters (newlines excluded)
            chcount += sum(1 for e in line if e != '\n')
            # count only digits and letters
            aln += sum(1 for e in line if e.isalnum())
            # word count
            wordcount += len(remove_punc(line).split())
            # text for the bag of words; the newline becomes a separator
            text += remove_punc(line.lower())
    words = delete_stopword(text.split())
    print('-' * 19)
    print('char count =', chcount)
    print('alphanumeric count =', aln)
    print('line count =', linecount)
    print('word count =', wordcount)
    keys = [fhash(e, m) for e in words] if choosing else words
    print('BoW =', count_sorted(keys))


if __name__ == '__main__':
    main()
# 6330588121 (30.00) 402 (2021-03-22 23:45)
"""Bag-of-words summary of a text file, with optional feature hashing."""

ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def low_er(file_name, file_st):
    """Return the lower-cased non-stop words of the text.

    *file_name* and *file_st* are iterables of lines (e.g. open files) for
    the text and the stop words.  Always returns a list (the previous version
    returned ``''`` when nothing was left).
    """
    st = {e for line in file_st for e in line.strip().split()}
    t = ''
    for line in file_name:
        for i in line:
            t += i.lower() if i.lower() in ALNUM else ' '
    return [n for n in t.split() if n not in st]


def char_count(f):
    """Count the characters, excluding newlines, of the file at path *f*."""
    with open(f, 'r') as file_name:
        return sum(1 for line in file_name for i in line if i != '\n')


def alphanumeric_count(f):
    """Count the alphanumeric characters of the file at path *f*."""
    with open(f, 'r') as file_name:
        return sum(1 for line in file_name for t in line if t.isalnum())


def line_count(f):
    """Count the lines of the file at path *f*."""
    with open(f, 'r') as file_name:
        return sum(1 for _ in file_name)


def word_count(f):
    """Count the alphanumeric words of the file at path *f*."""
    with open(f, 'r') as file_name:
        text = file_name.read()
    return len(''.join(ss if ss.isalnum() else ' ' for ss in text).split())


def count(data, element):
    """Return ``[e, occurrences of e in data]`` for every e in *element*."""
    tally = {}
    for e in data:
        tally[e] = tally.get(e, 0) + 1
    return [[i, tally.get(i, 0)] for i in element]


def bow1(file_name, file_st):
    """Return the sorted ``[word, count]`` bag of words.

    Works for any number of words; a single word used to raise IndexError.
    """
    low = low_er(file_name, file_st)
    return count(low, sorted(set(low)))


def outcome(f):
    """Print the separator and the four counts for the file at path *f*."""
    print('-' * 19)
    print('char count =', char_count(f))
    print('alphanumeric count =', alphanumeric_count(f))
    print('line count =', line_count(f))
    print('word count =', word_count(f))


def fhash(M, f, file_st):
    """Return the sorted ``[hash, count]`` bag for the text lines *f*.

    Uses the *f* argument itself; the previous version ignored it and read a
    module-level file object instead.
    """
    hashes = [sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M
              for w in low_er(f, file_st)]
    return sorted(count(hashes, sorted(set(hashes))))


def main():
    """Prompt for the options and print the report."""
    f = input('File name = ')
    h = input('Use feature hashing ? (y,Y,n,N) ')
    while h not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        h = input('Use feature hashing ? (y,Y,n,N) ')
    if h in ['y', 'Y']:
        M = int(input('M = '))
    outcome(f)
    with open(f) as file_name, open('stopwords.txt', 'r') as file_st:
        if h in ['n', 'N']:
            print('BoW =', bow1(file_name, file_st))
        else:
            print('BoW =', fhash(M, file_name, file_st))


if __name__ == '__main__':
    main()
# 6330589821 (27.00) 403 (2021-03-19 17:17)
"""Bag-of-words summary of a text file, with optional feature hashing."""


def is_alnum(e):
    """Return True when *e* is an ASCII letter or digit (case-insensitive)."""
    return (e.lower() in 'abcdefghijklmnopqrstuvwxyz' or
            e.lower() in '0123456789')


def alnum_spaced(text):
    """Return *text* with every non-alphanumeric character replaced by ' '."""
    return ''.join(e if is_alnum(e) else ' ' for e in text)


def hash_word(word, m):
    """Return the feature hash of *word*: sum(ord(c) * 37**i) modulo *m*."""
    return sum(ord(c) * 37 ** i for i, c in enumerate(word)) % m


def group_counts(items):
    """Return ``[item, count]`` pairs sorted by item; ``[]`` for no items."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, tally[item]] for item in sorted(tally)]


def ask_options():
    """Prompt for the file name and hashing choice; return (f_name, m).

    The answer is validated before M is requested, so M is always defined
    when hashing was chosen (it used to be skipped after a retry).  m is None
    when hashing was declined.
    """
    f_name = input('File name = ')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    m = int(input('M = ')) if fh in ('y', 'Y') else None
    return f_name, m


def main():
    """Prompt for the options and print the report."""
    f_name, m = ask_options()
    print('-------------------')
    #---------------------------------------------------
    # char count, alphanumeric count, line count, word count
    with open(f_name) as file:
        lines = file.readlines()
    char_count = sum(len(line) - line.count('\n') for line in lines)
    line_count = len(lines)
    text = ''.join(lines)
    alpha_count = sum(1 for e in text if is_alnum(e))
    word_count = len(alnum_spaced(text).split())
    print('char count = ' + str(char_count))
    print('alphanumeric count = ' + str(alpha_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(word_count))
    #---------------------------------------------------
    # stop words (tokenised like the text, case left as written in the file)
    with open('stopwords.txt', 'r') as stop:
        stop_word = set(alnum_spaced(stop.read()).split())
    #---------------------------------------------------
    # remove stop words
    s_word = alnum_spaced(text).lower().split()
    sig_word = [w for w in s_word if w not in stop_word]
    #---------------------------------------------------
    # BoW
    if m is None:
        print('BoW =', group_counts(sig_word))
    else:
        print('BoW =', group_counts(hash_word(e, m) for e in sig_word))


if __name__ == '__main__':
    main()
# 6330591021 (22.00) 404 (2021-03-22 16:21)
"""Bag-of-words summary of a text file, with optional feature hashing."""

ALNUM = "1234567890abcdefghijklmnopqrstuvwxyz"

# Built-in stop-word list.  "ours" and "ourselves" used to be merged into the
# single entry "ours ourselves", so neither word was ever filtered.
# NOTE(review): other submissions load stop words from 'stopwords.txt' -
# confirm whether this hard-coded list is intended.
STOPWORDS = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "aren't", "as", "at", "be", "because", "been",
    "before", "being", "below", "between", "both", "but", "by", "can't",
    "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't",
    "doing", "don't", "down", "during", "each", "few", "for", "from",
    "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
    "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor",
    "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our",
    "ours", "ourselves", "out", "over", "own", "same", "shan't", "she",
    "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
    "then", "there", "there's", "these", "they", "they'd", "they'll",
    "they're", "they've", "this", "those", "through", "to", "too", "under",
    "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're",
    "we've", "were", "weren't", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's",
    "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're",
    "you've", "your", "yours", "yourself", "yourselves",
]


def fhash(w, m):
    """Return the feature hash of *w* modulo int(*m*) (*m* may be a string)."""
    return sum(ord(ch) * 37 ** n for n, ch in enumerate(w)) % int(m)


def summarize(lines):
    """Return (char count, alphanumeric count, line count, sorted words).

    Newlines are excluded from the char count whether or not the last line
    ends with one.  Words are lower-cased ASCII alphanumeric runs.
    """
    char_count = 0
    al_count = 0
    word = ""
    for x in lines:
        char_count += len(x) - x.count("\n")
        for i in x:
            if i.lower() in ALNUM:
                al_count += 1
                word += i
            else:
                word += " "
    return char_count, al_count, len(lines), sorted(word.lower().split())


def _sorted_counts(items):
    """Return ``[item, count]`` pairs sorted by item."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, tally[item]] for item in sorted(tally)]


def word_bow(a):
    """Return sorted ``[word, count]`` pairs for the non-stop words of *a*."""
    stop = set(STOPWORDS)
    return _sorted_counts(i for i in a if i not in stop)


def hashed_bow(a, m):
    """Return sorted ``[hash, count]`` pairs for the non-stop words of *a*."""
    stop = set(STOPWORDS)
    return _sorted_counts(fhash(i, m) for i in a if i not in stop)


def main():
    """Prompt for the options and print the report."""
    print("File name =", end=" ")
    file_name = input()
    with open(file_name, 'r') as file:
        lines = file.readlines()
    char_count, al_count, line_count, a = summarize(lines)
    print("Use feature hashing ? (y,Y,n,N)", end=" ")
    n = input()
    while n not in ("y", "Y", "n", "N"):
        print("Try again.")
        # repeat the prompt so the user can see what is being asked
        print("Use feature hashing ? (y,Y,n,N)", end=" ")
        n = input()
    m = None
    if n in ("y", "Y"):
        print("M = ", end="")
        m = input()
    print("-------------------")
    print("char count =", char_count)
    print("alphanumeric count =", al_count)
    print("line count =", line_count)
    print("word count =", len(a))
    if m is None:
        print("BoW =", word_bow(a))
    else:
        print("BoW =", hashed_bow(a, m))


if __name__ == '__main__':
    main()
# 6330592621 (26.00) 405 (2021-03-21 19:49)
"""Bag-of-words summary of a text file, with optional feature hashing."""

ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def remove_punc(word):
    """Return *word* lower-cased, with non ASCII-alphanumerics as spaces."""
    return ''.join(ch.lower() if ch.lower() in ' ' + ALNUM else ' '
                   for ch in word)


def char_count(word):
    """Return the number of characters in *word*."""
    return len(word)


def word_count(word):
    """Return the number of alphanumeric words in *word*."""
    return len([k for k in remove_punc(word).split(" ") if k != ''])


def alphanum_count(words):
    """Return the number of ASCII letters and digits in *words*."""
    return sum(1 for ch in words if ch.lower() in ALNUM)


def cut_stop(words):
    """Return the words of *words* not in stopwords.txt, space-terminated."""
    with open('stopwords.txt', 'r') as stopwords:
        stop_w = set(stopwords.read().split())
    return ''.join(w + ' ' for w in words.split() if w not in stop_w)


def count_in_order(items):
    """Return ``[item, count]`` pairs in order of first appearance."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return [[item, n] for item, n in tally.items()]


def _kept_words(file):
    """Return the non-stop words of the file at path *file*, in order."""
    words = ''
    with open(file, 'r') as handle:
        for line in handle:
            words += remove_punc(line)
    return cut_stop(words).split()


def bow(file):
    """Return ``[word, count]`` pairs in order of first appearance.

    NOTE(review): the result is not sorted - confirm whether the assignment
    expects sorted output.
    """
    return count_in_order(_kept_words(file))


def fhash(w, m):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *m*."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % m


def bow_2(file, m):
    """Return ``[hash, count]`` pairs in order of first appearance."""
    return count_in_order(fhash(w, m) for w in _kept_words(file))


def sum_func(file):
    """Return (char count, alphanumeric count, word count, line count).

    The char count excludes newlines, whether or not the file ends with one.
    """
    char_countt = 0
    word_countt = 0
    alpnum_countt = 0
    line_countt = 0
    with open(file, 'r') as a:
        for lines in a:
            char_countt += char_count(lines.rstrip('\n'))
            alpnum_countt += alphanum_count(lines)
            word_countt += word_count(lines)
            line_countt += 1
    return char_countt, alpnum_countt, word_countt, line_countt


def main():
    """Prompt for the options and print the report."""
    file_name = input('File name = ')
    with open(file_name, 'r'):
        pass  # fail early if the file cannot be opened
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    while fh.lower() != 'y' and fh.lower() != 'n':
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    a, b, c, d = sum_func(file_name)
    if fh.lower() == 'y':
        m = int(input('M = '))
    print('-------------------')
    print('char count =', a)
    print('alphanumeric count =', b)
    print('line count =', d)
    print('word count =', c)
    if fh.lower() == 'n':
        print('BoW =', bow(file_name))
    else:
        print('BoW =', bow_2(file_name, m))


if __name__ == '__main__':
    main()
# 6330593221 (26.67) 406 (2021-03-22 19:00)
"""Bag-of-words summary of a text file, with optional feature hashing."""

ALNUM = '0123456789abcdefghijklmnopqrstuvwxyz'


def read_stopwords(stopwords):
    """Return every whitespace-separated stop word, lower-cased."""
    return [word.lower() for line in stopwords for word in line.split()]


def read_file(file):
    """Return the lines of the open file (or iterable of lines) as a list."""
    return [line for line in file]


def remove_punctuation(s):
    """Return *s* lower-cased, with non ASCII-alphanumerics as spaces."""
    return ''.join(c.lower() if c.lower() in ALNUM else ' ' for c in s)


def remove_stopwords(line, stopwords):
    """Return *line* with the words found in *stopwords* dropped."""
    return ' '.join(word for word in line.split() if word not in stopwords)


def char_count(File):
    """Count the characters in the lines *File*, excluding newlines."""
    return sum(1 for line in File for i in line if i != '\n')


def alphanumeric_count(File):
    """Count the ASCII letters and digits in the lines *File*."""
    return sum(1 for line in File for i in line if i.lower() in ALNUM)


def word_count(File):
    """Count the alphanumeric words in the lines *File*."""
    return sum(len(remove_punctuation(line).split()) for line in File)


def count(data, element):
    """Return ``[e, occurrences of e in data]`` for every e in *element*."""
    tally = {}
    for item in data:
        tally[item] = tally.get(item, 0) + 1
    return [[e, tally.get(e, 0)] for e in element]


def _kept_words(File, stopword):
    """Return the non-stop words of the lines *File*, in reading order."""
    text = ''
    for line in File:
        text += remove_stopwords(remove_punctuation(line), stopword) + " "
    return text.split()


def _unique(items):
    """Return the distinct items in order of first appearance."""
    return list(dict.fromkeys(items))


def BoW_with_fhash(File, stopword, m):
    """Return the sorted ``[hash, count]`` bag of hashed words."""
    hashes = [fhash(word, m) for word in _kept_words(File, stopword)]
    return sorted(count(hashes, _unique(hashes)))


def BoW(File, stopword):
    """Return the sorted ``[word, count]`` bag of words."""
    words = _kept_words(File, stopword)
    return sorted(count(words, _unique(words)))


def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *M*."""
    G = 37
    return sum(ord(ch) * G ** i for i, ch in enumerate(w)) % M


def read_Use_feature_hashing_():
    """Ask whether to hash; return M as an int, or False when declined."""
    a = input("Use feature hashing ? (y,Y,n,N) ")
    while a not in ("Y", "y", "n", "N"):
        a = input("Try again.\nUse feature hashing ? (y,Y,n,N) ")
    if a == "Y" or a == "y":
        return int(input("M = "))
    return False


#---------------------------------------------------------------------------
def main():
    """Read the inputs, prompt for the hashing option and print the report."""
    file = input('File name = ').strip()
    # NOTE(review): other submissions read 'stopwords.txt' - confirm the name.
    with open('stopword.txt') as stop_word, open(file) as handle:
        fn = read_file(handle)
        stopwords = read_stopwords(stop_word)
    m = read_Use_feature_hashing_()
    print('-------------------')
    print('char count =', char_count(fn))
    print('alphanumeric count =', alphanumeric_count(fn))
    print('line count =', len(fn))
    print('word count =', word_count(fn))
    # identity test: an entered M of 0 compares equal to False with ==
    if m is False:
        print("BoW =", BoW(fn, stopwords))
    else:
        print('BoW =', BoW_with_fhash(fn, stopwords, m))


#--------------------------------------------------------------------------
if __name__ == '__main__':
    main()
# 6330594921 (0.00) 407 (2021-03-22 19:09)
"""Bag-of-words summary of a text file, with optional feature hashing."""


def summary_count(file_name='sample.txt'):
    """Print the separator and the char/alphanumeric/line/word counts."""
    print('-' * 19)
    with open(file_name) as open_file:
        lines = open_file.readlines()
    line = ''.join(lines)
    char_count = sum(1 for e in line if e != '\n')
    print('char count = ', char_count)
    alphanumeric_count = sum(1 for e in line if e.isalnum())
    print('alphanumeric count = ', alphanumeric_count)
    # an empty file has 0 lines (the old readline-then-count gave 1)
    print('line count = ', len(lines))
    print('word count = ', len(line.split()))


#---------------------------------------------------------------------------------------------------
def count(data, element):
    """Return the count of the given element in the given data."""
    return sum(1 for e in data if e == element)


#---------------------------------------------------------------------------------------------------
def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(str(ch)) * (37 ** i) for i, ch in enumerate(w)) % M


def clean_words(text):
    """Return the sorted, lower-cased words of *text* without , . ' or "."""
    stripped = ''.join(ch for ch in text.lower() if ch not in ',.\'\"')
    return sorted(stripped.split())


def word_bow(words, stopwords):
    """Return ``[word, count]`` pairs for the non-stop words, sorted by word."""
    tally = {}
    for word in words:
        if word not in stopwords:
            tally[word] = tally.get(word, 0) + 1
    return [[word, tally[word]] for word in sorted(tally)]


def hashed_bow(words, stopwords, M):
    """Return ``[hash, count]`` pairs for the non-stop words, sorted by hash."""
    tally = {}
    for word in words:
        if word not in stopwords:
            h = fhash(word, M)
            tally[h] = tally.get(h, 0) + 1
    return [[h, tally[h]] for h in sorted(tally)]


def _load(file_name):
    """Return (cleaned words of *file_name*, set of lower-cased stop words)."""
    with open(file_name) as file:
        words = clean_words(file.read())
    # NOTE(review): other submissions read 'stopwords.txt' - confirm the name.
    with open('stopword.txt') as stop:
        stopwords = set(stop.read().lower().split())
    return words, stopwords


#---------------------------------------------------------------------------------------------------
def no_hashing_BoW(file_name='sample.txt'):
    """Print the bag of words of *file_name* without feature hashing."""
    words, stopwords = _load(file_name)
    print('BoW = ', word_bow(words, stopwords))


#---------------------------------------------------------------------------------------------------
def hashing_BoW(M, file_name='sample.txt'):
    """Print the bag of words of *file_name* using feature hashing modulo *M*."""
    words, stopwords = _load(file_name)
    print('BoW = ', hashed_bow(words, stopwords, M))


#---------------------------------------------------------------------------------------------------
def show():
    """Prompt for the options and print the report for the entered file."""
    inp = input('File name = ')  # ask for the file name
    while True:
        inp1 = input('Use feature hashing ? (y,Y,n,N) ')
        if inp1 in ('y', 'Y'):  # feature hashing requested
            M = int(input('M = '))
            # then carry on with the processing
            summary_count(inp)
            hashing_BoW(M, inp)
            break
        if inp1 in ('n', 'N'):  # feature hashing not requested
            summary_count(inp)
            no_hashing_BoW(inp)
            break
        # anything else (including an empty answer) is invalid input
        print('Try again.')


#---------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    show()
# 6330595521 (18.00) 408 (2021-03-22 22:00)
"""Bag-of-words summary of a text file, with optional feature hashing."""

FILT = "0123456789abcdefghijklmnopqrstuvwxyz"


def split_words(text):
    """Return (alphanumeric words of *text* in original case, alnum char count)."""
    words = []
    word = ""
    ascii_count = 0
    for ch in text:
        if ch.lower() in FILT:
            ascii_count += 1
            word += ch
        elif word:
            words.append(word)
            word = ""
    if word:
        words.append(word)
    return words, ascii_count


def analyze(file):
    """Read *file* and return (non-stop words, lines, alnum, chars, words).

    (Formerly named ``format``, which shadowed the builtin.)  The char count
    excludes exactly the newline characters present, so a last line without a
    trailing newline is no longer undercounted by one.
    """
    with open(file) as f:
        raw = f.readlines()
    # NOTE(review): other submissions read 'stopwords.txt' - confirm the name.
    with open('stopword.txt') as handle:
        stopword = " ".join(handle.read().splitlines()).split(" ")
    text = "".join(raw)
    all_count = len(text) - text.count("\n")
    ver, ascii_count = split_words(text)
    result = [w.lower() for w in ver if w.lower() not in stopword]
    return result, len(raw), ascii_count, all_count, len(ver)


def fhash(w, M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *M*."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M


def bowtext(text):
    """Return ``[word, count]`` pairs in order of first appearance."""
    tally = {}
    for item in text:
        tally[item] = tally.get(item, 0) + 1
    return [[item, n] for item, n in tally.items()]


def filterhars(bow, M):
    """Return ``[hash, count]`` pairs for the words *bow*, in first-seen order."""
    return bowtext(fhash(word, M) for word in bow)


def main(file, mode):
    """Return the report for *file* with feature hashing modulo *mode*."""
    bow, lines, textcount, nonedigit, wordcount = analyze(file)
    return filterhars(bow, mode), lines, textcount, nonedigit, wordcount


def mainly(file):
    """Return the report for *file* without feature hashing."""
    bow, lines, textcount, nonedigit, wordcount = analyze(file)
    return bowtext(bow), lines, textcount, nonedigit, wordcount


if __name__ == "__main__":
    textfile = input("File name = ")
    while True:
        hars = input("Use feature hashing ? (y,Y,n,N) ")
        if hars.lower() == 'y':
            m = int(input('M = '))
            bow, lines, textcount, paucount, word = main(textfile, m)
            bow = sorted(bow, key=lambda x: x[0])
            break
        elif hars.lower() == 'n':
            bow, lines, textcount, paucount, word = mainly(textfile)
            break
        else:
            print('Try again.')
    print("-------------------")
    print('char count = {:}'.format(paucount))
    print('alphanumeric count = {:}'.format(textcount))
    print('line count = {:}'.format(lines))
    print('word count = {:}'.format(word))
    print('BoW = {:}'.format(bow))