0.0% ≤ diff ≤ 50.0%

18 clusters, 50 submissions
ALL: cluster #1 (2)
# 6330303621  (2021-03-21 22:13) %diff = 12.08

file_name = input('File name = ',)
a = input("Use feature hashing ? (y,Y,n,N) ",)
while a != 'n' and a != 'N' and a != 'y' and a != 'Y' :
        print('Try again.')
        a = input("Use feature hashing ? (y,Y,n,N) ",)
if a == 'y' or a == 'Y' :
    b = int(input('M = ',))
print('-------------------')

def fhash(w, M) :
    n = 0
    for i in range(len(w)) :
        n = n+(ord(w[i])*37**i)
    n %= M
    return n
def count_words(w) :
    count5 = 0
    for i in range(len(g)):
        if g[i] == w :
            count5 += 1
        else :
            count5 += 0
    return count5

sw = open('stopwords.txt', 'r')
f = open(file_name, 'r')
ff1 = f.read().strip()
ff = ff1.split()
fff = " ".join(ff)
sww = sw.read().strip().split()

count1 = 0
for line in ff1 :
    if line == '\n' :
        count1+=0
    else :
        count1 += len(line)
print('char count =', count1)

count2 = 0
x=[]
z=[]
for line in ff :
    for i in range(len(line)) :
        if line[i].isalnum()==True :
            count2 += 1
            x.append(line[i])
            y = ''.join(x)
        else :
            continue
    z.append(y)
    x=[]
print('alphanumeric count =', count2)

count3 = 0
f = open(file_name, 'r')
for line in f :
    count3 += 1
print('line count =', count3)

count4 = 0
for i in range(len(fff)) :
    if fff[i]==fff[0] :
        continue
    if fff[i].isalnum()==False and fff[i].isalnum() != fff[i-1].isalnum() :
        count4 += 1
    else :
        continue
print('word count =', count4)

BoW = []
BoW0 = []
BoW1 = []
BoW2 = []
g = " ".join(z).lower().split()
if a == 'y' or a == 'Y' :
    for i in range(len(g)) :
        if g[i] not in sww and g[i] not in BoW0 :
            BoW0.append(g[i])
            BoW1.append([fhash(g[i],b), count_words(g[i])])
    k = sorted(BoW1)
    for i in range(len(k)) :
        if i < len(k)-1 :
            for j in range(i+1,len(k)) :
                if k[i][0]==k[j][0] :
                    k[i][1]+=k[j][1]
    for i in range(len(k)) :
        if k[i][0]==k[i-1][0] :
            continue
        else :
            BoW.append(k[i])
    print('BoW =', BoW)
elif a == 'n' or a == 'N' :
    for i in range(len(g)) :
        if g[i] not in sww :
            BoW2.append([g[i], count_words(g[i])])
    k = sorted(BoW2)
    for i in range(len(k)) :
        if k[i][0]==k[i-1][0] :
            continue
        else :
            BoW.append(k[i])
    print('BoW =', BoW)
f.close()
sw.close() # 6330565721  (2021-03-22 21:11) %diff = 12.08
file_name = input('File name = ',)
x = input("Use feature hashing ? (y,Y,n,N) ",)
while x != 'n' and x != 'N' and x != 'y' and x != 'Y' :
        print('Try again.')
        a = input("Use feature hashing ? (y,Y,n,N) ",)
if x == 'y' or x == 'Y' :
    y = int(input('M = ',))
print('-------------------')
def fhash(o, p) :
    f = 0
    for i in range(len(o)) :
        f = f+(ord(o[i])*37**i)
    f %= p
    return f
def count_words(o) :
    ct5 = 0
    for i in range(len(h)):
        if h[i] == o :
            ct5 += 1
        else :
            ct5 += 0
    return ct5
s = open('stopwords.txt', 'r') ; z = open(file_name, 'r') ; z2 = z.read().strip() ; z3 = z2.split() ; z4 = " ".join(z3)
ss = s.read().strip().split()
ct1 = 0
for line in z2 :
    if line == '\n' :
        ct1+=0
    else :
        ct1 += len(line)
print('char count =', ct1)
ct2 = 0 ; a=[] ; b=[]
for line in ff :
    for i in range(len(line)) :
        if line[i].isalnum()==True :
            ct2 += 1
            a.append(line[i])
            b = ''.join(a)
        else :
            continue
    c.append(b)
    a=[]
print('alphanumeric count =', ct2)
ct3 = 0
z = open(file_name, 'r')
for line in z :
    ct3 += 1
print('line count =', ct3)
ct4 = 0 ; b=[]
for i in range(len(z4)) :
    if z4[i]==z4[0] :
        b.append(z4[i])
        continue
    if z4[i].isalnum()==False and z4[i].isalnum() != z4[i-1].isalnum() :
        ct4 += 1
    else :
        b.append(z4[i])
        continue
print('word count =', ct4)
BW = [] ; BW0 = [] ; BW1 = [] ; BW2 = []
h = " ".join(c).lower().split()
if x == 'y' or x == 'Y' :
    for i in range(len(h)) :
        if h[i] not in ss and h[i] not in BW0 :
            BW0.append(h[i])
            BW1.append([fhash(h[i],b), count_words(h[i])])
    q = sorted(BW1)
    for i in range(len(q)) :
        if i < len(q)-1 :
            for j in range(i+1,len(q)) :
                if q[i][0]==q[j][0] :
                    q[i][1]+=q[j][1]
    for i in range(len(q)) :
        if q[i][0]==q[i-1][0] :
            continue
        else :
            BW.append(q[i])
    print('BoW =', BW)
elif x == 'n' or x == 'N' :
    for i in range(len(h)) :
        if h[i] not in ss :
            BW2.append([h[i], count_words(h[i])])
    q = sorted(BoW2)
    for i in range(len(q)) :
        if q[i][0]==q[i-1][0] :
            continue
        else :
            BoW.append(k[i])
    print('BoW =', BW)
z.close()
s.close()
ALL: cluster #2 (2)
# 6330426121  (2021-03-22 18:01) %diff = 13.88

def fhash(w,M):
    x = 0
    for i in range(len(w)):
        x += ord(w[i])*37**i
    return x%M

def to_alpha(s):
    i = 0
    for c in s:
        if c.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
            i += 1
    return i

def check_word(s):
    x = []
    w = ''
    for c in s:
        if c.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
            w += c
        else:
            if w != '':
                x.append(w)
                w = ''
    if w != '':
        x.append(w)
    return x

file_name = input('File name = ')
checkfh = input('Use feature hashing ? (y,Y,n,N) ')
while True:
    if checkfh == 'y' or checkfh == 'Y':
        M = int(input('M = '))
        checkfh = True
        break
    elif checkfh == 'n' or checkfh == 'N':
        checkfh = False
        break
    else:
        print('Try again.')
        checkfh = input('Use feature hashing ? (y,Y,n,N) ')
print('-------------------')

stopword = open('stopwords.txt', 'r')
t = open(file_name, 'r')
stw = []
for line in stopword:
    for e in line.strip().split():
        stw.append(e.lower())
stopword.close()

count_line = 0
count_c = 0
count_alp = 0
count_word = 0
for line in t:
    count_line += 1
    count_c += len(line)
    count_alp += to_alpha(line)
    count_word += len(check_word(line))
count_c -= count_line-1
t.close()

t = open(file_name, 'r')
BoW = []
if checkfh == True:
    W_in_BoW = []
    for line in t:
        for e in check_word(line):
            if e.lower() not in stw:
                if fhash(e.lower(),M) not in W_in_BoW:
                    W_in_BoW.append(fhash(e.lower(),M))
                    BoW.append([fhash(e.lower(),M),1])
                else:
                    for i in range(len(BoW)):
                        if BoW[i][0] == fhash(e.lower(),M):
                            BoW[i][1] += 1
    BoW.sort()
else:
    W_in_BoW = []
    for line in t:
        for e in check_word(line):
            if e.lower() not in stw:
                if e.lower() not in W_in_BoW:
                    W_in_BoW.append(e.lower())
                    BoW.append([e.lower(),1])
                else:
                    for i in range(len(BoW)):
                        if BoW[i][0] == e.lower():
                            BoW[i][1] += 1
t.close()

print('char count =',count_c)
print('alphanumeric count =',count_alp)
print('line count =',count_line)
print('word count =',count_word)
print('BoW =',BoW) # 6330433521  (2021-03-22 16:52) %diff = 13.88

def fhash(w,M):
    nword = []
    f = 0
    for e in w:
        nword.append(e[:len(e)+1])
    for i in range(len(nword)):
        f += (ord(nword[i])*37**i)
    return f%M

def alphabet(t):
    c = 0
    for e in t:
        if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
            c += 1
    return c

def chword(t):
    x = []
    w = ''
    for e in t:
        if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
            w += e
        else:
            if w != '':
                x.append(w)
                w = ''
    return x

file_name = input('File name = ')
feature = input('Use feature hashing ? (y,Y,n,N) ')
while True:
    if feature == 'y' or feature == 'Y':
        M = int(input('M = '))
        feature = True
        break
    elif feature == 'n' or feature == 'N':
        feature = False
        break
    else:
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')
print('-------------------')


stopword = open('stopwords.txt', 'r')
fle = open(file_name,'r')
stop = []
for line in stopword:
    for e in line.strip().split():
        stop.append(e.lower())
stopword.close

chcount = 0
alcount = 0
lincount = 0
wcount = 0
for line in fle:
    lincount += 1
    chcount += len(line)
    alcount += alphabet(line)
    wcount += len(chword(line))
chcount = chcount - lincount +1
fle.close()

fle = open(file_name, 'r')
bow = []
if feature == True:
    wbow = []
    for line in fle:
        for e in chword(line):
            if e.lower() not in stop:
                if fhash(e.lower(),M) not in wbow:
                    wbow.append(fhash(e.lower(),M))
                    bow.append([fhash(e.lower(),M), 1])
                else:
                    for i in range(len(bow)):
                        if bow[i][0] == fhash(e.lower(),M):
                            bow[i][1] += 1
    bow.sort()
else:
    wbow = []
    for line in fle:
        for e in chword(line):
            if e.lower() not in stop:
                if e.lower() not in wbow:
                    wbow.append(e.lower())
                    bow.append([e.lower(),1])
                else:
                    for i in range(len(bow)):
                        if bow[i][0] == e.lower():
                            bow[i][1] += 1
fle.close()

print('char count = ', chcount)
print('alphanumeric count = ', alcount)
print('line count = ', lincount)
print('word count = ', wcount)
print('BoW = ', bow)
ALL: cluster #3 (5)
# 6330362621  (2021-03-22 00:39) %diff = 26.74
filename=input('File name = ')
feature=input('Use feature hashing ? (y,Y,n,N) ')
while feature not in 'yYnN':
    print('Try again.')
    feature=input('Use feature hashing ? (y,Y,n,N) ')
if feature in "yY":
    M=int(input("M = "))
print('-------------------')
file= open(filename).read().lower().strip('\n')
file2= open('stopwords.txt').read().lower()

charcount =0
alphanumericcount=0
linecount=1
word=[]
word2=[]
a=''
b=''
def fhash(w,M):
    ans=0
    for i in range(len(w)):
        ans+=ord(w[i])*(37**i)
    return ans%M
for e in file:
    if e!='\n':
        charcount+=1
    else:
        linecount+=1
    if 'a'<=e<='z' or '0'<= e<='9':
        alphanumericcount+=1
        a+=e
    else:
        if a!='':
            word.append(a)
            a=''
print('char count =',charcount)
print('alphanumeric count =',alphanumericcount)
print('line count =',linecount)
if a!='':
    word.append(a)
wordcount=len(word)
print('word count =',wordcount)
for x in file2:
    if 'a'<=x<='z' or '0'<=x<='9':
        b+=x
    else:
        if b!='':
            word2.append(b)
            b=''
if b!='':
    word2.append(b)


for e in word2:
    for i in range(word.count(e)):
        word.remove(e)

if feature in "yY":
    for i in range(len(word)):
        word[i]=fhash(word[i],M)


BoW =[]
a=[]
for e in word:
    if e not in a:
        BoW.append([e,word.count(e)])
        a.append(e)


BoW.sort()
print('BoW =',BoW) # 6330375821  (2021-03-22 22:25) %diff = 26.74
file_name=input('File name = ')
ft=input('Use feature hashing ? (y,Y,n,N) ')
while ft not in 'yYnN':
    print('Try again.')
    ft=input('Use feature hashing ? (y,Y,n,N) ')
if ft in "yY":
    M=int(input("M = "))
print('-------------------')

file= open(file_name).read().lower().strip('\n')
file2= open('stopwords.txt').read().lower()

Ch_c,Ap_c,L_c,word,word2,A,B =0,0,1,[],[],'',''



def fhash(w,M):
    ans=0
    for i in range(len(w)):
        ans+=ord(w[i])*(37**i)
    return ans%M
def PrBow(word):

    Bow,A =[],[]

    for e in word:
        if e not in A:
            Bow.append([e,word.count(e)])
            A.append(e)
    Bow.sort()
    print('BoW =',Bow)
    return(Bow)
for e in file:
    if e!='\n':
        Ch_c+=1
    else:
        L_c+=1
    if 'A'<=e<='z' or '0'<= e<='9':
        Ap_c+=1
        A+=e
    else:
        if A!='':
            word.append(A)
            A=''
print('char count =',Ch_c)
print('alphanumeric count =',Ap_c)
print('line count =',L_c)
if A!='':
    word.append(A)
word_count=len(word)
print('word count =',word_count)
for x in file2:
    if 'A'<=x<='z' or '0'<=x<='9':
        B+=x
    else:
        if B!='':
            word2.append(B)
            B=''
if B!='':
    word2.append(B)


for e in word2:
    for i in range(word.count(e)):
        word.remove(e)

if ft in "yY":
    for i in range(len(word)):
        word[i]=fhash(word[i],M)

PrBow(word) # 6330355221  (2021-03-22 23:03) %diff = 34.41

#---------------------------------------

def fhash(w,M):
    confhash = 0
    for i in range(len(w)):
        confhash += ord(w[i])*(37**i)
    return confhash % M

#---------------------------------------

vala = ''
valb = ''
vocab_one = []
vocab_two = []
linecount = 1
sarawordcount = 0
sicticcount = 0

#---------------------------------------

list_Fileimport = input('File name = ')
thename_char = input('Use feature hashing ? (y,Y,n,N) ')

#---------------------------------------

while thename_char not in 'yYnN':
    print('Try again.')
    thename_char = input('Use feature hashing ? (y,Y,n,N) ')
if thename_char in "yY":
    case_one = int(input("M = "))
print('-------------------')

linefilea = open('stopwords.txt')
linefileaa = linefilea.read()
open_filetwo = linefileaa.lower()

linefile = open(list_Fileimport)
linefiles = linefile.read()
linefiless = linefiles.lower()
open_file = linefiless.strip('\n')

#---------------------------------------

for i_e in open_file:
    if i_e != '\n':
        sicticcount += 1
    else:
        linecount += 1
    if 'a'<= i_e <='z' or '0'<= i_e <='9':
        sarawordcount += 1
        vala += i_e
    elif vala != '':
        vocab_one.append(vala)
        vala = ''
print('char count =', sicticcount)
print('alphanumeric count =', sarawordcount)
print('line count =', linecount)

#---------------------------------------

if vala != '':
    vocab_one.append(vala)
countvocab = len(vocab_one)
print('word count =',countvocab)
for i_j in open_filetwo :
    if '0'<= i_j <='9' or 'a'<= i_j <='z':
        valb+=i_j
    elif valb != '':
        vocab_two.append(valb)
        valb = ''
if valb!='':
    vocab_two.append(valb)
for i_o in vocab_two:
    for i in range(vocab_one.count(i_o)):
        vocab_one.remove(i_o)
if thename_char in "yY":
    for i in range(len(vocab_one)):
        vocab_one[i] = fhash(vocab_one[i],case_one)

#---------------------------------------

vala = []
finalBoW =[]
for i_k in vocab_one:
    if i_k not in vala:
        finalBoW.append([i_k,vocab_one.count(i_k)])
        vala.append(i_k)

#---------------------------------------

finalBoW.sort()
print('BoW =',finalBoW) # 6330574321  (2021-03-22 22:28) %diff = 42.96

file=input("File name = ")
file1="sample.txt"
file2="stopwords.txt"
fea=input("Use feature hashing ? (y,Y,n,N) ")
a=["y","Y","n","N"]
b=["y","Y"]

while fea not in a:
    print("Try again.")
    fea=input("Use feature hashing ? (y,Y,n,N) ")
if fea in b:
    M=int(input("M = "))

f1=open(file1,"r").read().lower()
f2=open(file2,"r").read().lower()

def fhash(j,p):
    c=0
    A=len(j)
    for e in range(A):
        c=c+ord(j[e])*(37**e)
    d=int(c%p)
    return d

k=0
l=1
n=0
w=[]
s=[]
o=[]
r=[]
x=""
y=""
for i in f1:
    if i!="\n":
        k=k+1
    else:
        l=l+1
    if "0"<=i<="9" or "a"<=i<="z":
        n=n+1
        x=x+i
    elif x!="":
        w.append(x)
        x=""

print("-------------------")
K=str(k)
print("char count = "+K)
N=str(n)
print("alphanumeric count = "+N)
L=str(l)
print("line count = "+L)
if x!="":
    word.append(x)
    x=""
W2=str(len(w))
print("word count = "+W2)

for e in f2:
    if "0"<=e<="9" or "a"<=e<="z":
        y=y+e
    elif y!="":
        s.append(y)
        y=""

if y!="":
    s.append(y)

for e in s:
    for i in range(w.count(e)):
        w.remove(e)

w1=len(w)
if fea in b:
    for i in range(w1):
        w[i]=fhash(w[i],M)

for e in w:
    if e not in r:
        o.append([e,w.count(e)])
        r.append(e)
o.sort()
O=str(o)
print("BoW = ",O) # 6330467921  (2021-03-21 23:06) %diff = 49.49

def fhash(w,M):
    s = 0
    for i in range(len(w)):
       s += ord(w[i])*(37**i)
    fh = s%M
    return fh

def count(word, wordslist):
    c = 0
    for w in wordslist:
        if w == word:
            c += 1
    return c

file_name = input('File name = ')
yn = input('Use feature hashing ? (y,Y,n,N) ')
while yn not in 'yYnN':
    print('Try again.')
    yn = input('Use feature hashing ? (y,Y,n,N) ')
if yn == "y" or yn == 'Y':
    M = int(input('M = '))

print('-------------------')

stopwords = []
stopfile = open("stopwords.txt","r")
for line in stopfile:
    line = line.lower()
    if len(line) > 0:
        stopwords += line.split()
stopfile.close()

abnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
cc = 0
abc123 = 0
lc = 0
words = []
file = open(file_name,"r")
for line in file:
    l = ''
    for a in line:
        if a != '\n':
            cc += 1
        if a not in abnum:
            l += ' '
        else:
            l += a
            abc123 +=1
    words += l.split()
    if len(line) > 0:
        lc += 1
file.close()
print('char count =',cc)
print('alphanumeric count =',abc123)
print('line count =',lc)
print('word count =',len(words))

for i in range(len(words)):
    words[i] = words[i].lower()
for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].lower()

cut_words = []
for a in words:
    if a not in stopwords:
        cut_words.append(a)

if yn == 'y' or yn == 'Y':
    for i in range(len(cut_words)):
       cut_words[i] = fhash(cut_words[i],M)
bow = []
for e in cut_words:
    if e not in bow:
        bow.append(e)
for i in range(len(bow)):
    bow[i] = [bow[i], count(bow[i],cut_words)]
bow.sort()

print('BoW =',bow)
ALL: cluster #4 (7)
# 6330241321  (2021-03-22 23:59) %diff = 27.07


def fhash(w, M) :
    a = 0
    for i in range(len(w)) :
        a = a + (ord(w[i]) * ((37)**i))
    a = a % M
    return a




file_name = input("File name = ")
ans = input("Use feature hashing ? (y,Y,n,N) ")
M = 0
while ans != "n" and ans != "N" and ans != "y" and ans != "Y" :
    print("Try again.")
    ans = input("Use feature hashing ? (y,Y,n,N) ")
if ans.lower() == "n":
    ans = False
else :
    M = int(input("M = "))
    ans = True
print("-"*19)






l1 = 0
l2 = 0
lineCount = 0
words = []

file_words = open(file_name, "r")
for line in file_words :
    lineCount = lineCount + 1
    for c in line :
        l1 = l1 + 1
        if c == "\n" :
            l1 = l1 - 1
        if ("0" <= c <= "9") or ("a" <= c <= "z") or ("A" <= c <= "Z") :
            l2 = l2 + 1

    word = ""
    for c in line :
        if ("0" <= c <= "9") or ("a" <= c <= "z") or ("A" <= c <= "Z") :
            word = word + c
        else :
            if len(word) != 0 :
                words.append(word)
            word = ""
file_words.close()


stopwords = []
file_stopwords = open("stopwords.txt", "r")
for line in file_stopwords :
    for w in line.split() :
        w = w.lower()
        if w not in stopwords :
            stopwords.append(w)
file_stopwords.close()





a = []
for c in words :
    c = c.lower()
    if c in stopwords :
        pass
    else :
        have = False
        if ans :
            d = fhash(c, M)
            for i in range(len(a)) :
                if a[i][0] == d :
                    a[i][1] = a[i][1] + 1
                    have = True
                    break
            if not have :
                a.append([d, 1])
        else:
            for i in range(len(a)) :
                if a[i][0] == c :
                    a[i][1] = a[i][1] + 1
                    have = True
                    break
            if not have :
                a.append([c, 1])


print("char count =", l1)
print("alphanumeric count =", l2)
print("line count =", lineCount)
print("word count =", len(words))
print("BoW =", a) # 6330257421  (2021-03-22 19:43) %diff = 27.07
def fhash(w, M):
    G = 37
    r = 0
    for i in range(len(w)):
        r += (ord(w[i]) * (G**i))
    return r % M
tx = input('File name = ')
hashing = input('Use feature hashing ? (y,Y,n,N) ')
if(hashing == ''):
  hashing = 'a'
while hashing not in 'yYnN ':
  hashing = input('Use feature hashing ? (y,Y,n,N) ')
  if(hashing == ''):
    hashing = 'a'
  print('Try again.')
if(hashing in 'yY'):
  m = int(input('M = '))
print('-------------------')
i = 0
j = 0
lc = 0
words = []
file = open(tx, 'r')
for l in file:
  lc=lc+1
  for k in l:
    i=i+1
    if(k == '\n'):
      i=i-1
    if('a'<=k<='z')or('A'<=k<='Z')or('0'<=k<='9'):
        j=j+1
  word = ''
  for k in l:
    if('a'<=k<='z')or('A'<=k<='Z')or('0'<=k<='9'):
      word=word+k
    elif len(word) != 0:
      words.append(word)
      word = ''
file.close()
stopword = []
stop = open('stopwords.txt', 'r')
for line in stop:
    for word in line.strip().split():
        word = word.lower()
        if word not in stopword:
            stopword.append(word)
stop.close()
r = []
for c in words:
    c = c.lower()
    if c not in stopword:
        if hashing in 'yY':
            cEdit = fhash(c, m)
            for x in range(len(r)):
                if r[x][0] == cEdit:
                    r[x][1] += 1
                    break
            else:
                r.append([cEdit, 1])
        else:
            for x in range(len(r)):
                if r[x][0] == c:
                    r[x][1] += 1
                    break
            else:
                r.append([c, 1])
print('char count =', i)
print('alphanumeric count =', j)
print('line count =', lc)
print('word count =', len(words))
print('BoW =', r) # 6330188821  (2021-03-22 23:18) %diff = 33.55


file_name = input("File name = ")



BoW = input("feature hashing ? (y,Y,n,N) ")
M = - 1
while BoW not in "nNyY":
    print ("try again")
    BoW = input("feature hashing ? (y,Y,n,N) ")
if BoW in "Yy":
    M = int(input("M = "))
    BoW = True
else:
    BoW = False
print("-------------------")





a = []
stop = open("stopwords.txt" , "r")
for line in stop:
     for x in line.strip().split():
         x = x.lower()
         if x not in a:
            a.append(x)
stop.close()



len1 = 0
len2 = 0
linecount = 0
words = []

file = open(file_name , "r")
for line in file:
    linecount += 1
    for b in line:
        len1 += 1
        if ("A"<= b <= "Z") or  ("a"<= b <="z") or ("0" <= b <= "9"):
            len2 += 1
        if b == "\n":
            len1 -= 1

        word = ''
    for b in line:
        if ('A' <= b <= 'Z') or ('a' <= b <= 'z') or ('0' <= b <= '9'):
              word += b
        else:
            if len(word) != 0:
                words.append(word)
            word = ""
file.close()

def get(words, stopWords, isBoW, M):
    k = []
    for p in words:
        p = p.lower()
        if p in stopWords:
            pass
        else:
            found = False
            if BoW:
                G = 37
                r = 0
                for i in range(len(x)):
                    p = ord(x[i])
                    p = p * (G**i)
                    r += p
                    Edit = r % M
                else:
                    for i in range(len(k)):
                        if k[i][0]==Edit:
                            k[i][1] += 1
                            found = True
                            break
                    if not found:
                        k.append([Edit, 1])
                    else:
                        for i in range(len(k)):
                            if k[i][0] == p:
                                k[i][1] += 1
                                found = True
                                break
                        if not found :
                            k.append([p, 1])
    return k

print("char count=", len1)
print("alphanumeric count", len2)
print("line count=", linecount)
print("word count =", len(words))
print("BoW =", get(words, a, BoW, M)) # 6330487421  (2021-03-22 23:39) %diff = 33.55

def iinput():
    M=-1

    file_name = input('File name = ')
    wantfhash = input('Use feature hashing ? (y,Y,n,N) ')
    while wantfhash not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        wantfhash = input('Use feature hashing ? (y,Y,n,N) ')
    if wantfhash in ['y', 'Y']:
        M = int(input('M = '))
        wantfhash = True
    else:
        wantfhash = False
    print('-------------------')

    return file_name, wantfhash, M

def sstopwords():
    x = []
    stopWordsFile = open('stopwords.txt', 'r')
    for line in stopWordsFile:
        for word in line.strip().split():
            word = word.lower()
            if word not in x:
                x.append(word)
    stopWordsFile.close()

    return x

def wwords(file_name):
    q = 0
    p = 0
    lines = 0
    words = []

    wordsFile = open(file_name, 'r')
    for line in wordsFile:
        lines += 1
        for y in line:
            q += 1
            if y == '\n':
                q -= 1
            if ('A' <= y <= 'Z') or('a' <= y <= 'z') or ('0' <= y <= '9')  :
                p += 1

        w = ''
        for y in line:
            if ('0' <= y <= '9') or ('A' <= y <= 'Z') or('a' <= y <= 'z')  :
                w += y
            else:
                if len(w) != 0:
                    words.append(w)
                w = ''
    wordsFile.close()

    return q, p , lines, words

def fhash(w, M):
    G = 37
    x = 0
    for i in range(len(w)):
        x += (ord(w[i])*(G**i))

    return x % M

def bbow(words, stopWords, wantfhash, M):
    r = []
    for y in words:
        y = y.lower()
        if y in stopWords:
            pass
        else:
            found = False
            if wantfhash:
                cEdit = fhash(y, M)
                for i in range(len(r)):
                    if r[i][0] == cEdit:
                        r[i][1] += 1
                        found = True
                        break
                if not found:
                    r.append([cEdit, 1])
            else:
                for i in range(len(r)):
                    if r[i][0] == y:
                        r[i][1] += 1
                        found = True
                        break
                if not found:
                    r.append([y, 1])


    return r

#----------------------------------------------------------------------------
file_name, wantfhash, M = iinput()
stopWords = sstopwords()
q, p , lines, words = wwords(file_name)
print('char count =', q)
print('alphanumeric count =', p)
print('line count =', lines)
print('word count =', len(words))
print('BoW =', bbow(words, stopWords, wantfhash, M)) # 6330477121  (2021-03-22 23:55) %diff = 38.01


def Input_data():
    Count = 0
    M = -1

    File_name_input = input('File name = ')

    BoW_num = input('Use feature hashing ? (y,Y,n,N) ')

    while BoW_num not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        BoW_num = input('Use feature hashing ? (y,Y,n,N) ')
        Count += 1

    if BoW_num in ['y', 'Y']:
        M = int(input('M = '))
        BoW_num = True

    elif BoW_num in ['n' , 'N']:
        BoW_num = False

    else:
        pass

    print('-------------------')

    return File_name_input, BoW_num, M

def TikTok(w, M):
    Start = 37
    Second = 0
    for i in range(len(w)):
        Second += ((Start**i) * ord(w[i]))

    Ans = (Second % M)

    return Ans

def Words_Func(File_name_input):
    Lenght1= 0
    Lenght2 = 0
    Num_Line = 0
    words = []
    Count = 0
    word = ''
    wordsFile = open(File_name_input, 'r')

    for line in wordsFile:
        Num_Line += 1
        for c in line:
            Lenght1+= 1
            if c == '\n':
                Lenght1-= 1
            else:
                pass
            #Count += 1

        for c in line:
            if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or ('0' <= c <= '9'):
                Lenght2 += 1
                word += c

            else:
                if len(word) != 0:
                    words.append(word)
                else:
                    False
            #Count += 1

                word = ''

    wordsFile.close()

    return Lenght1 , Lenght2 , Num_Line, words


def StopWords_Func():
    r = []
    File_Of_stopWords = open('stopwords.txt', 'r')
    Count = 0

    for line in str(File_Of_stopWords):
        for i in line.strip().split():
            i = i.lower()

            if i not in r:
                r.append(i)

            else:
                False

            #Count += 1
    File_Of_stopWords.close()

    return r



def BoW_Ans(words, stopWords, BoW_num, M):
    Ans = []
    for j in words:
        j = j.lower()

        if j in stopWords:
            pass

        else:
            Check = False
            if BoW_num:
                Edit = TikTok(j, M)
                for i in range(len(Ans)):

                    if Ans[i][0] == Edit:
                        Ans[i][1] += 1
                        Check = True
                        break

                    else:
                        pass

                if not Check:
                    Ans.append([Edit, 1])
                else:
                    pass

            else:
                for i in range(len(Ans)):
                    if Ans[i][0] == j:
                        Ans[i][1] += 1
                        Check = True
                        break
                    else:
                        False

                if not Check:
                    Ans.append([j, 1])
                else:
                    pass

    return Ans


File_name_input,\
BoW_num, \
M = Input_data()

stopWords = StopWords_Func()

Lenght1,\
Lenght2 ,\
Num_Line, words = Words_Func(File_name_input)


print('char count = ', Lenght1)
print('alphanumeric count = ', Lenght2)
print('line count = ', Num_Line)
print('word count = ', len(words))
print('BoW = ', BoW_Ans(words, stopWords, BoW_num, M)) # 6330255121  (2021-03-22 23:47) %diff = 47.63


file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh != 'y' and fh != 'Y' and fh != 'n' and fh != 'N':
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
if fh == 'y' or fh == 'Y':
    M = int(input('M = '))

print('-------------------')



def tostopwords():
    stopwordlist = []
    stopwords = open('stopwords.txt', 'r')
    for line in stopwords:
        for s in line.split():
            s = s.lower()
            if s not in stopwordlist:
                stopwordlist.append(s)
    stopwords.close()

    return stopwordlist


lenght = 0
wfile = open(file_name, 'r')
for w in wfile:
    for ww in w:
        if ww != '\n':
            lenght += 1
wfile.close()

l = 0
words = []
word = ''
wfile = open(file_name, 'r')
for w in wfile:
    for ww in w:
        if ('a'<=ww<='z') or ('A'<=ww<='Z') or ('0'<=ww<='9'):
            word += ww
        else:
            l += len(word)
            words.append(word)
            word = ''
wfile.close()

words2 = []
for w in words:
    if w != '':
        words2.append(w)


linecount = 0
wfile = open(file_name, 'r')
for line in wfile:
    linecount += 1
wfile.close()



def fhash(word,M):
    G = 37
    v = 0
    for i in range(len(word)):
        v += ord(word[i])*(G**i)

    f = v % M
    return f

#----------------------------------------------------------------
for w in words:
    w = w.lower()




print('char count =', lenght)
print('alphanumeric count =', l)
print('line count =', linecount)
print('word count =', len(words2))
print('BoW =', ) # 6330459921  (2021-03-21 17:25) %diff = 47.89

# --------------------------------------------------
def fhash(w,M) :
    c = []
    for i in range(len(w)) :
        if ("a" <= w[i] <= "z") or ("A" <= w[i] <= "Z") or ("0" <= w[i] <= "9"):
            c.append(w[i])

    a = []
    for i in range(len(c)) :
        x = str(ord(c[i]))
        a.append(x)


    ass = 0
    for i in range(len(a)):
        z =  int(a[i]) * (37**i )
        ass += z
    ass = ass % M
    return ass

# --------------------------------------------------
file_name = input("File name = ")
useBoW = input("Use feature hashing ? (y,Y,n,N)")
# หา "char count" , "alphanumeric count " , "line count" , "words count"
charcount = 0
alphanumericcount = 0
newline = 0
words = []
linecount = 0
file = open(file_name , "r")
for line in file :
    linecount += 1
    for c in line :
        charcount += 1
        if c == "\n" :
            newline += 1
        if "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9" :
            alphanumericcount += 1
    word = ""
    for c in line:
        if "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9" :
            word += c
        else :
            if len(word) != 0 :
                words.append(word)
            word = ""
charcount = charcount - newline
file.close()

#หาstop words
stopwords = []
stopw = open( "stopwords.txt" , "r")
for line in stopw :
    for w in line.strip().split() :
        w = w.lower()
        if w not in stopwords :
            stopwords.append(w)



while useBoW not in ["y" , "Y" , "n" , "N"] :
    print("Try again.")
    useBoW = input("Use feature hashing ? (y,Y,n,N)")
BoW = []
if useBoW in ["y" , "Y"] :
    M = int(input("M = "))
    print("-------------------")
    for c in words :
        c = c.lower()
        if c in stopwords :
            pass
        else :
            found = 0
            newc = fhash(c,M)
            for i in range(len(BoW)) :
                if  BoW[i][0] == newc :
                    BoW[i][1] += 1
                    found = 1
                    break
            if not found :
                BoW.append([newc,1])

if useBoW in ["n" , "N"] :
    print("-------------------")
    for c in words :
        c = c.lower()
        if c in stopwords :
            pass
        else :
            found = 0
            for i in range(len(BoW)) :
                if  BoW[i][0] == c :
                    BoW[i][1] += 1
                    found = 1
                    break
            if not found :
                BoW.append([c,1])


print("char count = " , charcount)
print("alphanumeric count = " , alphanumericcount)
print("line count = " , linecount)
print("word count = " , len(words))
print("BoW = " , BoW)
ALL: cluster #5 (2)
# 6330234021  (2021-03-22 22:03) %diff = 30.54


def char_count(file_name): ###
    fn = open(file_name)
    c = 0
    for line in fn :
        for e in line :
            if e != '\n' :
                c += 1
    fn.close()
    return c

def alphanumeric_count(file_name) : ###
    fn = open(file_name)
    c = ''
    c_1 = "\"\'/\\,.:; "
    for line in fn :
        for e in line :
            if e not in c_1 :
                c += e
    fn.close()
    return len(c)- line_count(file_name) + 1

def line_count(file_name) :  ###
    fn = open(file_name)
    c = 0
    for line in fn :
        c += 1
    fn.close()
    return c

def word_count(file_name) : ###
    f = open(file_name)
    c = ''
    x = 0
    alp = 'abcdefghijklmnopqrstuvwxyz0123456789'
    for line in f :
        for g in line :
            if g.lower() in alp :
                c += g
            else :
                c += ' '
        x += len(c.split())
        c = ''
    f.close()
    return x

def BoW(file_name , stopwords) : ###
    f1 = open(file_name)
    f2 = open(stopwords)
    l = []
    cc = ''
    d2 = []
    alp = 'abcdefghijklmnopqrstuvwxyz0123456789'
    c =''

    for line in f1 :
        for g in line :
            if g.lower() in alp :
                cc += g.lower()
            else :
                cc += ' '
    for line in f2 :
        c += ' '
        if line[-1] == '\n' : line = line[:-1]
        for g in line :
            c += g.lower()
    c1 = c.split()
    c2 = ''
    for r in cc.split() :
        if r not in c1 :
            c2 += r + ' '
    d = c2.split()
    for d1 in d :
        if d1 not in d2 : d2.append(d1)
    for e in d2 :
        c3 = 0
        w = 0
        while c2.find(e,w) != -1 :
            c3 += 1
            w += c2.find(e,w) + 1
        l.append([e,c3])
    l.sort()
    f1.close()
    f2.close()
    return l

def feature_harshing(l,M) : ###
    x = []
    y = ''
    for l1 in l :
        c = 0
        c1 = 0
        for l3 in l1[0] :
            c += ord(l3)*(37**c1)
            c1 += 1
        flash = c % M
        y += str(flash)*l1[1]

    for i in range(M):
        c2 = 0
        c3 = 0
        while y.find(str(i),c3) != -1 :
            c2 += 1
            c3 = y.find(str(i),c3) + 1
        if c2 != 0 :
            x.append([i,c2])
    return x

def display(file_name , stopwords , x) :
    print("-------------------")
    print("char_count =", char_count(file_name))
    print("alphanumeric_count =", alphanumeric_count(file_name))
    print("line_count =", line_count(file_name))
    print("word_count =", word_count(file_name))
    if x == '0' :
        print('BoW =' , BoW(file_name , stopwords) )
    if x == '1' :
        print('BoW =' , feature_harshing(BoW(file_name , stopwords),M))

file_name = input("File name = ")
x = input("use feature hashing ? (y,Y,n,N)")
while x not in ['y','Y','n','N'] :
    print('Try again.')
    x = input("use feature hashing ? (y,Y,n,N)")
if x == 'y' or x == 'Y' :
    M = int(input("M = "))
    display(file_name , 'stopwords.txt' , '1')
else :
    display(file_name , 'stopwords.txt' , '0') # 6330349521  (2021-03-22 23:53) %diff = 30.54
def char_count(file_name):
    f = open(file_name)
    c = 0
    d = 0
    for line in f:
        c += len(line)
        if line[-1::]=='\n':
            d += 1
    f.close()
    return c-d

def alnum_count(file_name):
    f = open(file_name)
    c = 0
    for line in f:
        for g in line:
            if g.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
                c += 1
    f.close()
    return c

def line_count(file_name):
    f = open(file_name)
    c = 0
    for line in f:
        c += 1
    f.close()
    return c

def word_count(file_name):
    f = open(file_name)
    c = ''
    wc = 0
    for line in f:
        for g in line:
            if g.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
                c += g
            else:
                c += ' '
        wc += len(c.split())
        c = ''
    f.close()
    return wc

def BoW(file_name,stopwords):
    f1 = open(file_name)
    f2 = open(stopwords)
    lb = []
    lc = []
    cfn = ''
    d2 = []
    csw = ''
    cb  = '.'
    for line in f1:
        for g in line:
            if g.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
                cfn += g.lower()
            else:
                cfn += ' '
    for line in f2:
        csw += ' '
        if line[-1::1] == '\n':
            line = line[0:-1:1]
        for g in line:
            csw += g.lower()
    for r in cfn.split():
        if r not in csw.split():
            cb += r
            cb += '.'
    for cdc in cb.split('.'):
        if cdc not in lc:
            lc.append(cdc)
    for e in lc:
        if e == '':
            pass
        else:
            cnb = 0
            w = 0
            while cb.find(e,w)!=-1:
                cnb += 1
                w = cb.find(e,w)+1
            lb.append([e, cnb])
    lb.sort()
    f1.close()
    f2.close()
    return lb

def feature_harshing(l,M):
    s = []
    f = ''
    for l1 in l:
        c = 0
        c1 = 0
        for l3 in l1[0]:
            c += ord(l3)*(37**c1)
            c1 += 1
        fhash = c%M
        f += (str(fhash)+'.')*l1[1]
    for i in range(M):
        c2 = 0
        c3 = 0
        while f.find(str(i),c3) != -1:
            c2 += 1
            c3 = f.find(str(i),c3)+1
        if c2 != 0:
            s.append([i, c2])
    return s

def display(file_name,stopwords,x):
    print('-------------------')
    print('char count =', char_count(file_name))
    print('alphanumeric count =', alnum_count(file_name))
    print('line count =', line_count(file_name))
    print('word count =', word_count(file_name))
    if x=='0':
        print('BoW =', BoW(file_name,stopwords))
    if x=='1':
        print('BoW =',feature_harshing(BoW(file_name,stopwords),M))

file_name = input("File name = ")
x = input("Use feature hashing ? (y,Y,n,N) ").strip()
while x not in ['y','Y','n','N']:
    x = input("Use feature hashing ? (y,Y,n,N) ").strip()
if x == 'y' or x == 'Y':
    M = int(input("M = ").strip())
    display(file_name,'stopwords.txt','1')
else:
    display(file_name,'stopwords.txt','0')
ALL: cluster #6 (2)
# 6330340821  (2021-03-21 02:21) %diff = 35.06
file_name=input('File name = ')
op=input('Use feature hashing ? (y,Y,n,N) ')
while op not in ['y','Y','n','N']:
    print('Try again.')
    op=input('Use feature hashing ? (y,Y,n,N) ')
if op in ['y','Y']:
    M=input('M = ')
print('-------------------')
stop=open('stopwords.txt','r')
file=open(file_name,'r')
linecount=0
wordcount=0
xyz=''
words=[]
charcount=0
alphacount=0
for line in file:
    linecount+=1
    charcount+=len(line)
    for e in line:
        if e.isalnum():
            xyz+=e
        else:
            xyz+=" "
word=xyz.split()
wordcount+=len(word)
for i in word:
    words.append(i.lower())
for e in range(len(word)):
    for u in word[e]:
        if u.lower() in'abcdefghijklmnopqrstuvwxyz0123456789':
            alphacount+=1

charcount=charcount-linecount+1

print('char count =',charcount)
print('alphanumeric count =',alphacount)
print('line count =',linecount)
print('word count =',wordcount)

aa=[]
stopword=[]
for line in stop:
    n= line.split()
    for i in n:
        stopword.append(i.lower())

#######################################
def removepunc(x):
    y=[]
    k=''
    for i in x:
        for e in i:
            if e not in '\'\"\(\),\/\\.:;-><+-*=' :
                k+=e
        y.append(k)
        k=''
    return y
#######################################
for i in words:
    if i not in stopword:
        aa.append(i)
ww= removepunc(aa)
#######################################
w=[]#word
n=[]#fre
for i in range (len(ww)):
    if ww[i] not in w:
        w.append(ww[i])
        n.append(1)
    else:
        n[w.index(ww[i])]+=1
wn=[]
for i in range (len(w)):
    wn.append([w[i],n[i]])
wn.sort()
#######################################
def fhash(w,M):
    G=37
    y=0
    for i in range (len(w)):
        y+=ord(w[i])*G**(i)
    z=y%int(M)
    return z
#######################################
if op.lower()=='y':
    ss=[]
    tt=[]
    for i in range (len(ww)):
        if fhash(ww[i],M) not in ss:
            ss.append(fhash(ww[i],M))
            tt.append(1)
        else:
            tt[ss.index(fhash(ww[i],M))]+=1
    fn=[]
    for i in range (len(ss)):
        fn.append([ss[i],tt[i]])
    fn.sort()
    print('BoW =',fn)
else:
    print('BoW =',wn)
#######################################
stop.close()
file.close() # 6330481621  (2021-03-22 19:51) %diff = 35.06
file_name=input("File_name= ")
use=input("Use feature hashing ? (y,Y,n,N) ")
while use not in ['y','Y','n','N']:
    print('Try again.')
    use=input("Use feature hashing ? (y,Y,n,N) ")
if use in ['y','Y']:
    M=input("M = ")
print('-------------------')
stop=open('stopwords.txt','r')
file=open(file_name,'r')
cha=0
alpha=0
stw=[]
linecount=0
wordcount=0
sen=''#ประโยคในfile ที่ cleanแล้ว
for line in stop:
    a=line.split()
    for e in a:
        stw.append(e)
for line in file:
    linecount+=1
    a=line.split()
    cha+=len(line)
    for e in line:
        if e.isalnum():
            sen+=e
        else:
            sen+=' '
word=sen.split() #[]คำสะอาด
wordlow=[] #[]คำสะอาดพิมเล็ก
wordcount=len(word)
cha=cha-linecount+1
for e in word:
    wordlow.append(e.lower())
for i in range(len(word)):
    for e in word[i]:
        if '0'<=e<='9' or 'a'<=e.lower()<='z':
            alpha+=1
perfsen=' '.join(wordlow) #ประโยคสวย
print('char count =',cha)
print('alphanumeric count =',alpha)
print('line count =',linecount)
print('word count =',wordcount)
#-----------------------------------
def fhash(w,M):
    summ=0
    G=37
    for i in range(len(w)):
        summ+=ord(w[i])*G**i
    sol=summ%int(M)
    return sol
#-----------------------------------
def clean(s):
    a=[ '(', ')', '-', '_', '[', ']' ,'"' ,"'" ,';', ':', '>', '<','.' ]
    c=[]
    for i in range(len(s)):
        if s[i] not in a:
            c.append(s[i])
    return c
#-----------------------------------
nsen=[]
for e in wordlow:
    if e not in stw:
        nsen.append(e)
newsen=clean(nsen)
#-----------------------------------
x=[]
y=[]
for i in range(len(newsen)):
    if newsen[i] not in x:
        x.append(newsen[i])
        y.append(1)
    else:
        y[x.index(newsen[i])]+=1
block=[]
for i in range(len(x)):
    block.append([x[i],y[i]])
block.sort()
#-----------------------------------
if use in ['y','Y']:
    o=[]
    p=[]
    for i in range(len(newsen)):
        if fhash(newsen[i],M) not in o:
            o.append(fhash(newsen[i],M))
            p.append(1)
        else:
            p[o.index(fhash(newsen[i],M))]+=1
    q=[]
    for i in range(len(o)):
        q.append([o[i],p[i]])
    q.sort()
    print('BoW =',q)
else:
    print('BoW =',block)
stop.close()
file.close()
ALL: cluster #7 (2)
# 6330248821  (2021-03-22 20:51) %diff = 35.17
alp = 'abcdefghijklmnopqrstuvwxyz'
num = '0123456789'
file = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ').lower()
M = ''
def nFhash(w):
    BoW = []
    bow = []
    count = []
    for word in w:
        if word not in bow:
            bow.append(word)
            count.append(int(1))
        else :
            for i in range(len(bow)):
                if word == bow[i]:
                    count[i] += 1
    for j in range(len(bow)):
        BoW.append([bow[j],count[j]])
    return BoW

def yFhase(w,m):
    fhase = []
    BoW = []
    bow = []
    count = []
    for word in w:
        f = 0
        for i in range(len(word)):
            f += ord(word[i]) * (37 ** i)
        fhase.append(f % int(m))
    for e in fhase :
        if e not in bow:
            bow.append(e)
            count.append(1)
        else:
            for j in range(len(bow)):
                if e == bow[j]:
                    count[j] += 1
    for k in range(len(bow)):
        BoW.append([bow[k],count[k]])
    return BoW

while x not in'ny':
    print('Try again')
    x = input('Use feature hashing ? (y,Y,n,N) ').lower()

if x == 'y':
    M = input('M = ')
sFile = open('stopwords.txt','r')
stop_words = []
for line in sFile:
    stop_words += line.split()
    stop_words = list(map(str.lower,stop_words))
sFile.close()
wFile = open(file,'r')

charCount = 0
alpCount = 0
lineCount = 0
wordCount = 0

words = []
text = ''
for line in wFile:
    lineCount += 1
    words += line.split()
    words = list(map(str.lower,words) )
    for char in line.strip():
        charCount += 1

for word in words:
    for alpnum in word:
        if alpnum in alp or alpnum in num:
            text += alpnum
    text += ' '
clearedWords = text.split()
print(clearedWords)
wordCount += len(clearedWords)
for i in range(len(clearedWords)):
    alpCount += len(clearedWords[i])
print('-------------------')
print('char count =',charCount)
print('alphanumeric count =',alpCount)
print('line count =',lineCount)
print('word count =',wordCount)
deletedWord = []
for w in clearedWords:
    if w not in stop_words:
        deletedWord.append(w)
if x == 'y':
    print('BoW =',sorted(yFhase(deletedWord,M)))
else :
    print('Bow =',sorted(nFhash(deletedWord))) # 6330474221  (2021-03-22 20:52) %diff = 35.17
alpnum = 'abcdefghijklmnopqrstuvwxyz0123456789'

file = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ').lower()
M = ''

def Bag_of_words(words):
    BoW = []
    word_list = []
    count = []

    for word in words:
        if word not in word_list:
            word_list.append(word)
            count.append(int(1))
        else:
            for i in range(len(word_list)):
                if word_list[i] == word:
                    count[i]+=1

    for i in range(len(word_list)):
        BoW.append([word_list[i],count[i]])

    return sorted(BoW)

def fhash_BOW(BoW,M):
    BoW_hash = []
    hash_list = []
    hash_count = []
    for word, count in BoW:
        num_hash = fhash(word,M)
        if num_hash not in hash_list:
            hash_list.append(num_hash)
            hash_count.append(count)
        else:
            for i in range(len(hash_list)):
                if num_hash == hash_list[i]:
                    hash_count[i]+=count


    for i  in range(len(hash_list)):

        BoW_hash.append([hash_list[i],hash_count[i]])

    BoW_hash = sorted(BoW_hash)
    return BoW_hash

def fhash(word, M):
    f = 0

    for i,char in enumerate(word):
        f += ord(char)*(37**i)

    f = f%int(M)

    return f


while x not in ['n','y']:
    print('Try again')
    x = input('Use feature hashing ? (y,Y,n,N) ').lower()

if x == 'y':
    M = input('M =')

sFile = open('stopwords.txt','r')
stop_words = []
for line in sFile:
    stop_words += line.split()
    stop_words = list(map(str.lower,stop_words))
sFile.close()

wFile = open(file,'r')

charCount = 0
alpCount = 0
lineCount = 0
wordCount = 0

words = []

for line in wFile:
    lineCount+=1
    words += line.split()
    words = list(map(str.lower,words))
    charCount+= len(line.strip())

wFile.close()

clean_words = []

for word in words:
    text = ''
    for char in word:
        if char in alpnum:
            text += char
            alpCount+=1
    clean_words.append(text)

wordCount += len(clean_words)

clean_word_stopword = []

for word in clean_words:
    if word not in stop_words:
        clean_word_stopword.append(word)

print(clean_word_stopword)

BoW = Bag_of_words(clean_word_stopword)

if x == 'y':
    BoW_hash = fhash_BOW(BoW,M)

print('-------------------')
print('char count =',charCount)
print('alphanumeric count =',alpCount)
print('line count =',lineCount)
print('word count =',wordCount)

if x =='y':
    print('BoW = ', BoW_hash)
else:
    print('BoW = ', BoW)
ALL: cluster #8 (2)
# 6330323121  (2021-03-22 21:47) %diff = 41.78

#.....................................................................................
#ให้ w คือคำที่ประกอบด้วยอักขระ c0 c1 c2 ... cn –1
#fhash(w,M) = fhash(c0 c1 c2 ... cn –1, M) = ( ord(c0) + ord(c1)G1 + ord(c2)G2 + ... + ord(cn –1)Gn –1) % M



def fhash(w,M) :
    u=0
    G=37
    fh=0
    for i in range(len(w)):
        fh+=ord(w[i])*(G**u)
        u+=1
    return fh%M
def char_count(file_name):
    n = -1
    c = 0
    f = open(file_name)
    for line in f:
        n += 1
        c += len(line)

    f.close()
    c -= n
    return c

    return c
def a_and_num_count(file_name):
    f=open(file_name)
    c=0
    alphabet='abcdefghijklmnopqrstuvwxyz'
    num='0123456789'
    for line in f:
        for i in line:
            if i in alphabet or i in alphabet.upper() or i in num:
                c+=1
    f.close()


    return c
def words_count(file_name):
    f=open(file_name)
    s=''
    alphabet='abcdefghijklmnopqrstuvwxyz'
    num='0123456789'
    for line in f:
        for i in line :
            if i in alphabet or i in alphabet.upper() or i in num :
                s+=i
            else:
                s+=' '
    x=s.split()
    f.close()
    return len(x)



def line_count(file_name):
    c = 0
    f = open(file_name)
    for line in f:
        c += 1
    f.close()
    return c
def BoW_Nn(file_name,stop):
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    num = "1234567890"
    f = open(file_name)
    f2 = ""
    s2 = ""



    for line in f:
        for e in line:
            if e.lower()  in alphabet or e in num:

                f2 += e
            else:
               f2 += " "
    s = open(stop)
    for line in s:
        for e in line:
            s2 += e

    f3 = f2.lower().split()
    s3 = s2.lower().split()
    x = []
    for e in f3:
        if e not in s3:
            x.append(e)
    x.sort()

    b0 = [x[0]]
    b1 = [1]

    for i in range(1,len(x)):
        if x[i] != x[i-1]:
            b0.append(x[i])
            b1.append(1)
        else:

            b1[-1] += 1


    b = []
    for i in range(len(b0)):
        b.append([b0[i],b1[i]])
    f.close()
    s.close()
    return b
def BoW_Yy(file_name,stop,M):
    b=BoW_Nn(file_name,stop)
    by=[]

    for i in range(len(b)):
        by.append(fhash(b[i][0],M))
    by.sort()
    #[1,1,2,3,3,4,5,5,6]
    by0=[by[0]]
    by1=[1]
    for i in range(1,len(by)):
        if by[i-1]!=by[i]:
            by0.append(by[i])
            by1.append(1)
        else:
            by1[-1]+=1
    bowyes=[]
    for i in range(len(by0)):
        bowyes.append([by0[i],by1[i]])
    return bowyes
#..........................................

file_name=input('File name = ')
yn=input('Use feature hashing ? (y,Y,n,N) ')
while yn not in ['Y','y','N','n'] :
    print('Try again.')
    yn=input('Use feature hashing ? (y,Y,n,N) ')
if yn =='N' or yn=='n':
    print('char count = '+str(char_count(file_name)))
    print('alphanumeric count = '+str(a_and_num_count(file_name)))
    print('line count = '+str(line_count(file_name)))
    print('word count = '+str(words_count(file_name)))
    print('BoW = '+str(BoW_Nn(file_name,'stopword.txt')))
elif yn=='Y' or yn=='y':

    M=int(input('M = '))
    print('char count = '+str(char_count(file_name)))
    print('alphanumeric count = '+str(a_and_num_count(file_name)))
    print('line count = '+str(line_count(file_name)))
    print('word count = '+str(words_count(file_name)))
    print('BoW = '+str(BoW_Yy(file_name,'stopword.txt',M))) # 6330352321  (2021-03-22 03:58) %diff = 41.78

def fhash(w,m):
    c = []
    s = 0
    for i in range(len(w)):
        c.append(ord(w[i]))
    for i in range(len(w)):
        s += c[i]*(37**i)
    fh = s%m
    return fh

def char_count(filename):
    c_line = 0
    c_char = 0
    f = open(filename)
    for line in f:
        c_line += 1
        for i in range(len(line)):
            c_char += 1
    f.close()
    c_char -= c_line-1
    return c_char

def alp_count(filename):
    c = 0
    f = open(filename)
    for line in f:
        for e in range(len(line)):
            if ("a" <= line[e].lower() <= "z") or ("0" <= line[e] <= "9"):
                c += 1
    f.close()
    return c

def line_c(filename):
    c = 0
    f = open(filename)
    for line in f:
        c += 1
    f.close()
    return c

def word_count(filename):
    alp = "abcdefghijklmnopqrstuvwxyz"
    num = "1234567890"
    st = ""
    f = open(filename)
    for line in f:
        for e in line:
            if (e.lower() not in alp) and (e not in num):
                st += " "
            else:
                st += e
    f.close()
    c = st.lower().split()
    return len(c)

def bow_n(filename,stopwords):
    alp = "abcdefghijklmnopqrstuvwxyz"
    num = "1234567890"
    s1 = ""
    s2 = ""
    l1 = []
    l2 = []
    l = []
    f1 = open(filename)
    for line in f1:
        for e in line:
            if (e.lower() not in alp) and (e not in num):
                s1 += " "
            else:
                s1 += e
    f2 = open(stopwords)
    for line in f2:
        for e in line:
            s2 += e
    f1.close()
    f2.close()
    l1 = s1.lower().split()
    l2 = s2.lower().split()
    for e in l1:
        if e not in l2:
            l.append(e)
    l.sort()
    bow0 = [l[0]]
    bow1 = [1]
    for i in range(1,len(l)):
        if l[i] == l[i-1]:
            bow1[-1] += 1
        else:
            bow0.append(l[i])
            bow1.append(1)
    bow = []
    for i in range(len(bow0)):
        bow.append([bow0[i],bow1[i]])
    return bow

def bow_y(filename,stopwords,m):
    bow = bow_n(filename,stopwords)
    for i in range(len(bow)):
        bow[i][0] = fhash(bow[i][0],m)
    bow.sort()
    bowy = [bow[0]]
    for i in range(1,len(bow)):
        if bow[i][0] == bow[i-1][0]:
            bowy[-1][1] += bow[i][1]
        else:
            bowy.append(bow[i])
    return bowy

#---------------------------------------------------------------------

yesno = ["y","Y","n","N"]
file_name = input("File name = ")
ufh = input("Use feature hashing ? (y,Y,n,N) ")
while ufh not in yesno:
    print("Try again.")
    ufh = input("Use feature hashing ? (y,Y,n,N) ")
if ufh== "n" or ufh == "N":
    print("-"*19)
    print("char count = " + str(char_count(file_name)))
    print("alphanumeric count = " + str(alp_count(file_name)))
    print("line count = " + str(line_c(file_name)))
    print("word count = " + str(word_count(file_name)))
    print("BoW = " + str(bow_n(file_name,"stopword.txt")))
elif ufh == "y" or ufh == "Y":
    M = int(input("M = "))
    print("-"*19)
    print("char count = " + str(char_count(file_name)))
    print("alphanumeric count = " + str(alp_count(file_name)))
    print("line count = " + str(line_c(file_name)))
    print("word count = " + str(word_count(file_name)))
    print("BoW = " + str(bow_y(file_name,"stopword.txt",M)))
ALL: cluster #9 (2)
# 6330489721  (2021-03-22 21:01) %diff = 41.94
file_name = input('File name = ')
ft = input('Use feature hashing ? (y,Y,n,N) ')
uh = False
while ft not in ['y','Y','n','N']:
    print('Try again.')
    ft = input('Use feature hashing ? (y,Y,n,N) ')
if ft in ['y','Y']:
    M=input('M = ')
    uh = True
print('-------------------')

stopwords_list = []
stopwords_file = open('stopwords.txt', 'r')
line_count=0
char_count=0
alpha_count=0
word_count=0

for line in stopwords_file:

    strip_stopwords_file = line.strip()
    strip_split_stopwords_file = strip_stopwords_file.split()
    stopwords_list += strip_split_stopwords_file
stopwords_file.close()

file = open(file_name, 'r')
for line in file:
    strip_line = line.strip().lower()
    char_count += len(strip_line)
file.close()

file = open(file_name, 'r')
for line in file:
    strip_line = line.strip().lower()
    for i in strip_line:
        isalnum = i.isalnum()
        if isalnum == True:
            alpha_count +=1
file.close()

file = open(file_name, 'r')
for line in file:
    strip_line = line.strip().lower()
    line_count +=1
file.close()

def find_replace(t):
    result = ""
    for c in t:
        if c in "\"\'/\\,.:;":
            result += " "
        else:
            result += c
    return result

file = open(file_name, 'r')
for line in file:
    strip_line = line.strip().lower()
    words = find_replace(strip_line)
    strip_words = words.strip()
    split_strip_words = strip_words.split()
    word_count += len(split_strip_words)
file.close()

print('char count =',char_count)
print('alphanumeric count =',alpha_count)
print('line count =',line_count)
print('word count =',word_count)


all_words_list =[]
file = open(file_name, 'r')
for line in file:
    strip_line = line.strip().lower()
    words = find_replace(strip_line)
    strip_words = words.strip()
    split_strip_words = strip_words.split()
    all_words_list += split_strip_words
file.close()

all_words_withoutstopwords_list = []
for i in all_words_list:
    if not i in stopwords_list:
        all_words_withoutstopwords_list.append(i)

BoW = []
def add(BoW,d):
    c = True
    for i in range(len(BoW)):
        if BoW[i][0] == d:
            c = False
            BoW[i][1] += 1
    if c == True:
        BoW.append([d,1])
    return BoW

def fhash(list_of_word,M):
    wordhash_list = []
    for word in list_of_word:
        char_count = 0
        for i in range(len(word)):
            char_count += ord(word[i])*(37**i)
        wordhash_list.append(char_count%int(M))
    return wordhash_list

if uh == False:
    for i in all_words_withoutstopwords_list:
        BoW = addwordToBoW(BoW,i)
    print('BoW =',sorted(BoW))

if uh == True:
    wordhash_list = fhash(all_words_withoutstopwords_list,M)
    BoWhash = []
    for i in sorted(wordhash_list):
        BoWhash = add(BoWhash,i)
    print('BoW =',BoWhash) # 6330523321  (2021-03-21 19:13) %diff = 41.94
filename = input('File name = ')
feature = input('Use feature hashing ? (y,Y,n,N) ')
usehash = False
while not feature in ['y','Y','n','N']:
    print('Try again.')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
if feature in ['y','Y']:
    M = int(input('M = '))
    usehash = True
print('-------------------')
stopwordslist = []
stopwords_file = open('stopwords.txt', 'r')
for line in stopwords_file:

    strip_stopwords_file = line.strip()
    strip_split_stopwords_file = strip_stopwords_file.split()
    stopwordslist += strip_split_stopwords_file
stopwords_file.close()

def find_replace(t):
    result = ""
    for c in t:
        if c in "\"\'/\\,.:;":
            result += " "
        else:
            result += c
    return result

charcount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    charcount += len(strip_line)
file.close()
print('char count =',charcount)

alphanumericcount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    for i in strip_line:
        isalnum = i.isalnum()
        if isalnum == True:
            alphanumericcount +=1
file.close()
print('alphanumeric count =',alphanumericcount)

linecount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    linecount +=1
file.close()
print('line count =',linecount)

wordcount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    words = find_replace(strip_line)
    strip_words = words.strip()
    split_strip_words = strip_words.split()
    wordcount += len(split_strip_words)
file.close()
print('word count =',wordcount)

all_words_list =[]
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    words = find_replace(strip_line)
    strip_words = words.strip()
    split_strip_words = strip_words.split()
    all_words_list += split_strip_words
file.close()

all_words_withoutstopwords_list = []
a = []
for i in all_words_list:
    if not i in stopwordslist:
        all_words_withoutstopwords_list.append(i)

BoW = []
def addwordToBoW(BoW,newword):
    contain = False
    for i in BoW:
        if i[0] == newword:
            contain = True
            i[1] +=1
            break
    if contain == False:
        BoW.append([newword,1])
    return BoW
if usehash == False:
    for i in all_words_withoutstopwords_list:
        BoW = addwordToBoW(BoW,i)
    print('BoW =',sorted(BoW))

def fhash(word,M):
    G = 37
    numchar = 0
    for charindex in range (len(word)):
        numchar += ord(word[charindex])*(G**charindex)
    return numchar%M

if usehash == True:
    wordhash_list = []
    for word in all_words_withoutstopwords_list:
        wordhash = fhash(word,M)
        wordhash_list.append(wordhash)
    BoWhash = []
    for i in sorted(wordhash_list):
        BoWhash = addwordToBoW(BoWhash,i)
    print('BoW =',BoWhash)
ALL: cluster #10 (2)
# 6330275721  (2021-03-22 21:44) %diff = 42.03
#--------------------------------------------------------
alphabet = 'abcdefghijklmnopqrstuvwxyz'
number = '0123456789'
special_char = '!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
stopword_file = 'stopwords.txt'
#--------------------------------------------------------
def char_count(x):
    file = open(x, 'r')
    c = 0
    for line in file:
        line = line.strip()
        c += len(line)
    file.close()
    return c

def alphanumeric_count(x):
    file = open(x, 'r')
    c = 0
    for line in file:
        line = line.strip()
        text = ''
        for char in line:
            if char.lower() in alphabet or char in number:
                text += char
        c += len(text)
    file.close()
    return c

def line_count(x):
    file = open(x, 'r')
    c = 0
    for line in file:
        c += 1
    file.close()
    return c

def word_count(x):
    file = open(x, 'r')
    c = 0
    for line in file:
        line = line.strip()
        text = ''
        for char in line:
            if char in special_char:
                text += ' '
            else:
                text += char
        c += len(text.split())
    file.close()
    return c

def fhash(w, M):
    sum = 0
    for i in range(len(w)):
        sum += ord(w[i])*(37**i)
    return sum % M

def list_stopwords(x):
    file = open(x, 'r')
    stopwords_list=[]
    for line in file:
        line = line.strip()
        pre_stw = ''
        for char in line:
            if char in special_char:
                pre_stw += ' '
            else:
                pre_stw += char
        pre_stw = pre_stw.strip().split()
        for i in pre_stw:
            stopwords_list.append(i)
    file.close()
    return stopwords_list

def BoW(x, ufh, M):
    file = open(x, 'r')
    words=[]
    for line in file:
        line = line.strip()
        pre_words = ''
        for char in line:
            if char in special_char:
                pre_words += ' '
            else:
                pre_words += char
        pre_words = pre_words.strip().split()
        for e in pre_words:
            k = e.lower()
            if k not in list_stopwords(stopword_file):
                words.append(k)
    words.sort()
    file.close()

    bow = []; bow_n = []; bow_y = []; n_word = []
    if ufh in ['n', 'N']:
        for e in words:
            if e in bow_n:
                n_word[bow_n.index(e)] += 1
            else:
                bow_n.append(e); n_word.append(1)
        for i in range(len(bow_n)):
            bow.append([bow_n[i],n_word[i]])
        return bow
    elif ufh in ['y', 'Y']:
        for e in words:
            p = fhash(e,M)
            if p in bow_y:
                n_word[bow_y.index(p)] += 1
            else:
                bow_y.append(p); n_word.append(1)
        for i in range(len(bow_y)):
            bow.append([bow_y[i],n_word[i]])
        bow.sort()
        return bow

#--------------------------------------------------------
file_name = input('File name = ')
ufh = input('Use feature hashing ? (y,Y,n,N) ')
while ufh not in ['y','Y','n','N']:
    print('Try again.')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
if ufh in ['y', 'Y']:
    M = int(input('M = '))
elif ufh in ['n', 'N']:
    M = 0
else:
    M = 0
print('-'*19)
print('char count =', char_count(file_name))
print('alphanumeric count =', alphanumeric_count(file_name))
print('line count =', line_count(file_name))
print('word count =', word_count(file_name))
print('BoW =', BoW(file_name, ufh, M)) # 6330281421  (2021-03-21 12:19) %diff = 42.03
#--------------------------------------
#ข้อมูลที่แก้ได้
stopword_file='stopwords.txt'
sp_char='!@#$%^&*()_+{}[]:\";\',./<>?\\=-`'
al_and_nume='abcdefghijklmnopqrstuvwxyz0123456789'
#--------------------------------------
#ส่วนฟังก์ชั่น
def c_count(filename):
    with open(filename,'r') as file:
        n=0
        for line in file:
            line=line.strip()
            n+=len(line)
    return n
def alpha_count(filename):
    with open(filename,'r') as file:
        n=0
        for line in file:
            line=line.strip()
            text=''
            for char in line:
                #if not(char in sp_char):
                if char.lower() in al_and_nume :
                    text+=char
            #text=''.join(text.split())
            n+=len(text)
    return n
def line_count(filename):
    with open(filename,'r') as file:
        n=0
        for line in file:
            n+=1
    return n
def word_count(filename):
    with open(filename,'r') as file:
        n=0
        for line in file:
            line=line.strip()
            text=''
            for char in line:
                if char in sp_char:
                    text+=' '
                else:
                    text+=char
            n+=len(text.split())
    return n
def list_of_stopwords(filename):
    with open(filename,'r') as file:
        stopwords_list=[]
        for line in file:
            line=line.strip()
            text=''
            for char in line:
                if char in sp_char:
                    text+=' '
                else:
                    text+=char
            text=text.split()
            for i in text:
                stopwords_list.append(i.lower())
    return stopwords_list
def fhash(word,m):
    sum=0
    for i in range(len(word)):
        sum+=ord(word[i])*(37**i)
    return sum%m
def BoW(filename,condition,m):
    with open(filename,'r') as file:
        words=[]
        for line in file:
            line=line.strip()
            text=''
            for char in line:
                if char in sp_char:
                    text+=' '
                else:
                    text+=char
            text=text.split()
            for i in text:
                if not(i.lower() in list_of_stopwords(stopword_file)):
                    words.append(i.lower())
        words.sort()
    bag_of_word=[]
    repit_word=[]
    bag_of_words=[]
    if condition.lower() =='n':
        for i in words:
            if i in bag_of_word:
                repit_word[bag_of_word.index(i)]+=1
            else:
                bag_of_word.append(i)
                repit_word.append(1)
        for i in range(len(bag_of_word)):
            bag_of_words.append([bag_of_word[i],repit_word[i]])
        return bag_of_words
    else:
        for i in words:
            p=fhash(i,m)
            if p in bag_of_word:
                repit_word[bag_of_word.index(p)]+=1
            else:
                bag_of_word.append(p)
                repit_word.append(1)
        for i in range(len(bag_of_word)):
            bag_of_words.append([bag_of_word[i],repit_word[i]])
        bag_of_words.sort()
        return bag_of_words
#--------------------------------------
#ส่วนทำงาน
file=input('File name = ')
feature=input('Use feature hashing ? (y,Y,n,N) ')
while not(feature in ['Y','n','N','y']):
    print('Try again.')
    feature=input('Use feature hashing ? (y,Y,n,N) ')
if feature.lower() == 'y':
    m=int(input('M = '))
else:
    m=0
print('-------------------')
print('char count =',c_count(file))
print('alphanumeric count =',alpha_count(file))
print('line count =',line_count(file))
print('word count =',word_count(file))
print('BoW =',BoW(file,feature,m))
ALL: cluster #11 (2)
# 6330199721  (2021-03-22 17:35) %diff = 43.85

file_name = input('File name = ')
fn = open(file_name.strip(), 'r')
a = input('Use feature hashing ? (y,Y,n,N) ')
while 1>0:
    if a in 'nN':
        break
    elif a in 'yY':
        M = input('M = ')
        break
    else :
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
print('-------------------')


s = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890'
cc = 1
ac = 0
lc = 0
wc = 0
for i in fn:
    lc+=1
    for b in range(len(i)):
        cc+=1
        if i[b] in s:
            ac+= 1
            if i[b+1] not in s:
                wc+=1
print('char count =',cc-lc)
print('alphanumeric count =',ac)
print('line count =',lc)
print('word count =',wc)



if a in 'nN':
    fn = open(file_name.strip(), 'r')
    st = open("stopwords.txt", 'r')
    s = 'abcdefghijklmnopqrstuvwxyz1234567890'
    w = ''
    for l in fn:
        l = l.strip('\n')
        l = l.lower()
        for i in l:
            if i in s:
                w += i
            else:
                w += ' '
    w = w.split()
    for l in st:
        l = l.split()
        for i in l:
            while i in w:
                w.remove(i)
    bow = []
    for i in w:
        x = w.count(i)
        if [i,x] not in bow:
            bow += [[i,x]]

    print('BoW =',bow)



def fhash(x,M):
    a = 0
    for i in range(len(x)):
        c = ord(x[i])*(pow(37,i))
        a += c
    d =  a%int(M)
    return str(d)



if a in 'Yy':
    fn = open(file_name.strip(), 'r')
    st = open("stopwords.txt", 'r')
    s = 'abcdefghijklmnopqrstuvwxyz1234567890'
    w = ''
    for l in fn:
        l = l.strip('\n')
        l = l.lower()
        for e in l:
            if e in s:
                w += e
            else:
                w += ' '
    w = w.split()
    for l in st:
        l = l.split()
        for i in l:
            while i in w:
                w.remove(i)
    j=[]
    for i in w:
        j += fhash(i,M)
    bow = []
    for i in j:
        x = j.count(i)
        if [int(i),x] not in bow:
            bow += [[int(i),x]]
            bow.sort()
    print('BoW =',bow)

fn.close()
st.close() # 6330200621  (2021-03-22 00:15) %diff = 43.85

file_name = input('File name = ')
fn = open(file_name.strip(), 'r')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while True:
    if fh not in 'yYnN':
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    if fh in 'yY':
        M = input('M = ')
        break
    if fh in 'nN':
        break
#------------------------------------------------------------
def everything(fn):
    ac = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','1','2','3','4','5','6','7','8','9','0']
    anc = 0
    cc = 0
    lc = 0
    tap = ''
    for line in fn:
        line = line.strip('\n')
        line = line.lower()
        lc += 1
        for e in line:
            cc += 1
            if e in ac:
                anc += 1
                tap += e
            else:
                tap += ' '
    tap = tap.split()
    wc = len(tap)
    return anc,cc,lc,wc
#-------------------------------------------------------------

anc,cc,lc,wc = everything(fn)
print('-'*len('Use feature hashing'))
print('char count =',cc)
print('alphanumeric count =',anc)
print('line count =',lc)
print('word count =',wc)
fn.close()

#---------------------------------------------------
def bow1():
    fn = open(file_name.strip(), 'r')
    st = open("stopwords.txt", 'r')
    ac = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','1','2','3','4','5','6','7','8','9','0']
    b = ''
    for line in fn:
        line = line.strip('\n')
        line = line.lower()
        for e in line:
            if e in ac:
                b += e
            else:
                b += ' '
    b = b.split()
    for line in st:
        line = line.split()
        for e in line:
            while e in b:
                b.remove(e)

    bow = []
    for e in b:
        z = b.count(e)
        if [e,z] not in bow:
            bow.append([e,z])
    fn.close()
    st.close()
    return bow
#---------------------------------------------------

if fh == 'n' or fh == 'N':
    bow = bow1()
    print('BoW =',bow)

#---------------------------------------------------
def fhash(a,M):
    summ=0
    for i in range (len(a)):
        summ += ord(a[i])*(37)**i
    c = summ % int(M)
    return str(c)
#----------------------------------------
def bow2():
    fn = open(file_name.strip(), 'r')
    st = open("stopwords.txt", 'r')
    ac = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','1','2','3','4','5','6','7','8','9','0']
    b = ''
    for line in fn:
        line = line.strip('\n')
        line = line.lower()
        for e in line:
            if e in ac:
                b += e
            else:
                b += ' '
    b = b.split()
    for line in st:
        line = line.split()
        for e in line:
            while e in b:
                b.remove(e)
    c=[]
    for f in b:
        c.append(fhash(f,M))

    bow = []
    for e in c:
        z = c.count(e)
        if [int(e),z] not in bow:
            bow.append([int(e),z])
            bow.sort()
    fn.close()
    st.close()
    return bow
#-------------------------------------------
if fh == 'y' or fh == 'Y':
    bbb = bow2()
    print('BoW =',bbb)
ALL: cluster #12 (2)
# 6330192221  (2021-03-22 23:09) %diff = 44.18
c=0
alpha=0
l=0
w=[]
x=[]
b=[]
Bag=[]
logic=['y','Y','n','N']
def char_count(line):
    c=0
    c+=(len(line)-1)
    return c
#-------------------------------
def alphanumeric_count(line):
    c1=0
    for t in line:
        if 'a'<=t<='z' or 'A'<=t<='Z' or '0'<=t<='9':
            c1+=1
    return c1
#-------------------------------
def line_count(line):
    l=0
    if len(line)!=0:
        l+=1
    return l
#-------------------------------
def word_count(line):
    word=[]
    wordn=[]
    s=""
    for t in line:
        if 'a'<=t<='z' or 'A'<=t<='Z' or '0'<=t<='9':
            s+=t
        else:
            if s!="":
                word.append(s.lower())
            s=""
    return word
#-------------------------------
def BoW(lis):
    c=[]
    for i in range(len(lis)):
        if [lis[i],lis.count(lis[i])] not in c:
            c.append([lis[i],lis.count(lis[i])])
    return c
#-----------------------------
def fhash(w,M):
    G=37
    s=0
    for i in range(len(w)):
        s+=ord(w[i])*(G**i)
        s=s%M
    return s
#-----------------------------
def hashedBoW(wordlist,M):
    A=[]
    for word in wordlist:
        A.append(fhash(word,M))
    return BoW(A)
#-----------------------------
file_name=input("File name = ")
fin = open(file_name,"r")
fin2= open("stopwords.txt","r")
a=input("Use feature hashing ? (y,Y,n,N) ")
for line in fin:
    c+=char_count(line)
    alpha+=alphanumeric_count(line)
    l+=line_count(line)
    w+=word_count(line)
else:
    c+=1
lw=len(w)
for line in fin2:
    x+=line.split()
for i in range(len(w)):
    if w[i] not in x:
        b.append(w[i].lower())
if a=='y' or a=='Y':
    M=int(input("M = "))
    print("-------------------")
    print("char count =",c)
    print("alphanumeric count =",alpha)
    print("line count =",l)
    print("word count =",lw)
    print("BoW =",hashedBoW(b,M))
elif a=='n' or a=='N':
    print("-------------------")
    print("char count =",c)
    print("alphanumeric count =",alpha)
    print("line count =",l)
    print("word count =",lw)
    print("BoW =",BoW(b))
else:
   while a not in logic:
        print("Try again.")
        a=input("Use feature hashing ? (y,Y,n,N) ")
        if a=='y' or a=='Y':
            M=int(input("M = "))
            print("-------------------")
            print("char count =",c)
            print("alphanumeric count =",alpha)
            print("line count =",l)
            print("word count =",lw)
            print("BoW =",hashedBoW(b,M))
        elif a=='n' or a=='N':
            print("-------------------")
            print("char count =",c)
            print("alphanumeric count =",alpha)
            print("line count =",l)
            print("word count =",lw)
            print("BoW =",BoW(b)) # 6330356921  (2021-03-21 22:58) %diff = 44.18
#---------------------------------------------------------------------
def fhash(w,M) :
    a = []
    b = 0
    c = 0
    for i in range(len(w)) :
        a.append(str(ord(w[i])))
    for i in range(len(a)) :
        b += int(a[i])*(37**c)
        c += 1
    b = int(b)%int(M)
    return b
#---------------------------------------------------------------------
def cutpunc(N) :
    result = ""
    for c in N:
        if c in "\"\'/\\,.:;" :
            result += ""
        elif c in "\n" :
            result += " "
        else :
            result +=c
    return result
#---------------------------------------------------------------------
def cutword(N) :
    N = cutpunc(N)
    N = N.lower()
    N = N.split()
    x = ""
    a = open("stopwords.txt", "r")
    for lines in a :
        x += lines
    b = cutpunc(x)
    b = b.split()
    result = ""
    for c in N  :
        if c in b :
            result += ""
        else :
            result += c+" "
    return result
#---------------------------------------------------------------------
def BOW1(N) :
    N = cutword(N)
    N = N.split()
    N.sort()
    N.append("")
    a = []
    c = 1
    for i in range(len(N)-1) :
        if N[i]==N[i+1] :
            c +=1

        else :
            a.append([N[i],c])
            c = 1

    return a

#---------------------------------------------------------------------
def BOW2(N) :
    N = cutword(N)
    N = N.split()
    N.sort()
    a = []
    c = 1
    x = []
    for i in range(len(N)) :
        x.append(fhash(N[i],M))
    x.sort()
    x.append("")
    for i in range(len(x)-1) :
        if x[i]==x[i+1] :
            c +=1

        else :
            a.append([x[i],c])
            c = 1

    return a


#---------------------------------------------------------------------
x = ""
character_count = 0
line_count = 0
word_count = 0
alphanumeric_count = 0
d = input("File name = ")
a = open(d, "r")
for lines in a :
    x += lines
    character_count += len(lines)
    line_count +=1
print(x)
y = cutpunc(x)
z = "".join(y)
character_count -=line_count-1
y = cutpunc(x)
y = y.split()
for i in range(len(y)):
    if y[i]==y[i] :
        word_count += 1
h = 0
for i in range(len(z)) :
    if " "==z[i] :
        h +=1
alphanumeric_count += len(z)-h
a.close()
#---------------------------------------------------------------------

b = input("Use feature hashing ? (y,Y,n,N) ")
if b=="y" or b=="Y" :
    M = int(input("M = "))
    print("-------------------")
    print("char count =",character_count)
    print("alphanumeric count =",alphanumeric_count)
    print("line count =",line_count)
    print("word count =",word_count)
    print("BoW =",BOW2(x))
if b=="n" or b=="N" :
    print("-------------------")
    print("char count =",character_count)
    print("alphanumeric count =",alphanumeric_count)
    print("line count =",line_count)
    print("word count =",word_count)
    print("BoW =",BOW1(x))

while b!="n" and b!="N"and b!="y" and b!="Y" :
    print("Try again.")
    b = input("Use feature hashing ? (y,Y,n,N) ")
    if b=="y" or b=="Y" :
        M = int(input("M = "))
        print("-------------------")
        print("char count =",character_count)
        print("alphanumeric count =",alphanumeric_count)
        print("line count =",line_count)
        print("word count =",word_count)
        print("BoW =",BOW2(x))
    if b=="n" or b=="N" :
        print("-------------------")
        print("char count =",character_count)
        print("alphanumeric count =",alphanumeric_count)
        print("line count =",line_count)
        print("word count =",word_count)
        print("BoW =",BOW1(x))
        break
ALL: cluster #13 (2)
# 6330180721  (2021-03-18 17:39) %diff = 44.88
#Prog-08: Bag-of-words
#6330180721 Nichakul Pichitwutikorn
def fhash(w,m):
    a = 0
    for e in range(len(w)):
        a+= ord(w[e])*(37**e)
    return a%m
def num(lis,word):
    c = 0
    for t in lis:
        if t == word:c+=1
    return c
def cut_repeat(listt):
    qr = []
    for e in listt:
        if not e in qr:
            qr.append(e)
    return qr

file_name = input('File name = ')
h = input('Use feature hashing ? (y,Y,n,N) ')
while h!='y' and h!='Y' and h!='n' and h!='N':
    print('Try again.')
    h = input('Use feature hashing ? (y,Y,n,N) ')
if h in 'yY':
    m= input('M = ')

book = open(file_name,'r')
stop = open('stopwords.txt','r')
char_al = 0; al = 0;l = 0
sen = ''; st = ''
for line in book:
    for i in line:
        if 'a'<=i<='z' or 'A'<=i<='Z' or '0'<=i<='9':
            al+=1
            sen+=i
        else:
            char_al+=1
            sen+=' '
    l+=1
sen = sen.lower().split()
for t in stop:
    for s in t:
        if s==' ':st+=' '
        else:
            st+=s
st = st.lower().split()
bow = []; ans = []; f = []
for p in sen:
    if not p in st:bow.append(p)

print('-------------------')
print('char count =',al+char_al-l+1)
print('alphanumeric count =',al)
print('line count =',l)
print('word count =',len(sen))
if h in 'yY':
    for j in bow:
        ans.append(fhash(j,int(m)))
    for q in ans:
        f.append([q,num(ans,q)])
    rrr = cut_repeat(f)
    rrr.sort()
    print('BoW =',rrr)
else:
    for j in bow:
        ans.append([j,num(bow,j)])
    rrr = cut_repeat(ans)
    rrr.sort()
    print('BoW =',rrr)
book.close()
stop.close() # 6330572021  (2021-03-22 15:09) %diff = 44.88

def fhash(w,m):
    p=0
    for i in range(len(w)):
        p+=ord(w[i])*(37**i)
    fh=p%m
    return fh

def bow(sen):
    b=[]
    bow=[]
    for e in sen :
        if not e in b:
            b.append(e)
    c=[0]*len(b)
    for i in range(len(sen)):
        for j in range(len(b)):
            if sen[i]==b[j]:
                c[j]+=1
    for k in range(len(b)):
        bow.append([b[k],c[k]])
    return bow

file_name=input('File name=')
f=input('Use feature hashing ? (y,Y,n,N)')
while f!='y' and f!='Y' and  f!='n' and f!='N':
    print('Try again.')
    f=input('Use feature hashing ? (y,Y,n,N)')
if f=='y' or f=='Y':
    m=input('M=')

file=open(file_name,'r')
stopw=open('stopwords.txt','r')
lines=stopw.readlines()
stopw.close()
lines=[line.strip() for line in lines]
stw=''
for i in range(len(lines)):
    stw+=str(lines[i].lower())+' '
stop=stw.split()
char=0
al=0
l=0
sen=''
for line in file:
    for c in line:
        if c.isalnum()==True:
            char+=1
            al+=1
            sen+=c
        else:
             char+=1
             sen+=' '
    l+=1
s=sen.lower().split()
sent=[]
for p in s:
    if p not in stop:
        sent+=[p]

print('-------------------')
print('char count = ',char-l+1)
print('alphanumeric count = ',al)
print('line count = ',l)
print('word count = ',len(s))
if f=='y' or f=='Y':
    bb=[]
    for q in range(len(sent)):
        bb+=[fhash(sent[q],int(m))]
    BoW=bow(bb)
    BoW.sort()
    print('BoW = ',BoW)
else:
    BoW=bow(sent)
    BoW.sort()
    print('BoW = ',BoW)
file.close()
ALL: cluster #14 (7)
# 6330280821  (2021-03-22 22:05) %diff = 45.24

def char_count(fn):
    file_name = open(fn)
    c = 0
    for e in file_name:
        for a in e:
            if a != "\n":
                c += 1
    file_name.close
    return c

def count_line(fn):
    file_name = open(fn)
    c = 0
    for line in file_name:
        c += 1
    file_name.close()
    return c

def alphanumeric(fn):
    a = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    file_name = open(fn)
    c = 0
    for e in file_name :
        for d in e:
            if d in a:
                c += 1
    file_name.close()
    return c

def word_count(fn):
    file_name = open(fn)
    c = ""
    for e in file_name:
        for a in e:
            if a not in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789":
                c += " "
            else :
                c += a
    file_name.close
    return c.split()

s = open("stopwords.txt")
stop = ""
for e in s:
    for d in e:
        stop += d
stop2 = stop.lower().split()
s.close()

def bow_no_hashing(fn):
    p = []
    q = word_count(fn)
    for e in q:
        if e.lower() not in stop2:
            p.append(e)
    p.sort()
    p += "!!"
    d = p[0]
    last = []
    num = 1
    for j in range(1,len(p)) :
        if d != p[j]:
            last.append([d,num])
            num = 1
            d = p[j]
        else :
            num += 1
    return last

def fhash(w,M):
    c = 0
    a = 0
    G = 37
    for i in w :
        c += ord(i)*(G**a)
        a += 1
    b = c%M
    return b

def bow_hashing(fn,M):
    p = []
    q = word_count(fn)
    for e in q:
        if e.lower() not in stop2:
            p.append(e)
    s = []
    for i in p :
        v = fhash(i,int(M))
        s.append(v)
    s.sort()
    s += "!!"
    d = s[0]
    last = []
    num = 1
    for j in range(1,len(s)) :
        if d != s[j]:
            last.append([d,num])
            num = 1
            d = s[j]
        else :
            num += 1
    return last
print(bow_hashing("sample.txt",10))


x = input("File name = ")
b = input("Use feature hashing ? (y,Y,n,N) ")
while b not in "yYnN":
    print("Try again.")
    b = input("Use feature hashing ? (y,Y,n,N) ")
if b in "yY":
    M = input("M = ")
    print("-------------------")
    print("char count =",char_count(x))
    print("alphanumeric count =",alphanumeric(x))
    print("line count =",count_line(x))
    print("word count =",len(word_count(x)))
    print("BoW = ",bow_hashing(x,M))
else :
    print("-------------------")
    print("char count =",char_count(x))
    print("alphanumeric count =",alphanumeric(x))
    print("line count =",count_line(x))
    print("word count =",len(word_count(x)))
    print("BoW = ",bow_no_hashing(x)) # 6330485121  (2021-03-21 01:01) %diff = 45.24
def char_count(file_name):
    words = ''
    c = 0
    fn = open(file_name)
    for line in fn:
        words += line
    for e in words:
        if e != '\n':
            c += 1
    fn.close()
    return c

def alphanumeric_count(file_name):
    words = ''
    c = 0
    fn = open(file_name)
    for line in fn:
        words += line
    for e in words:
        if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
            c += 1
    fn.close()
    return c

def line_count(file_name):
    c = 0
    fn = open(file_name)
    for line in fn:
        c += 1
    fn.close()
    return c

def list_of_words(file_name):
    words = ''
    listwords = ''
    fn = open(file_name)
    for line in fn:
        words += line
    for e in words:
        if e.lower() not in 'abcdefghijklmnopqrstuvwxyz0123456789' or e.lower() == '\n':
            listwords += ' '
        else: listwords += e.lower()
    listwords = listwords.split()
    fn.close()
    return listwords      # ['it', 'was', 'the', 'best', 'of', ...]

def bag_of_words(file_name):
    listwords = list_of_words(file_name)
    sw = list_of_words('stopwords.txt')
    new = []
    for e in listwords:
        if e not in sw:
            new.append(e)
    word = []; fr = []; bow = []
    for e in new:
        if e.lower() not in word:
            word.append(e.lower())
            fr.append(int(1))
        elif e.lower() in word:
            fr[word.index(e.lower())] += 1
    for i in range(len(word)):
        bow.append([word[i], fr[i]])
    bow.sort()
    return bow

def fhashing(w,m):
    fhash = 0
    g = 37
    for i in range(len(w)):
        fhash += ord(w[i])*(g**i)
    return fhash%m

def feature_hashing(file_name):
    listwords = list_of_words(file_name)
    sw = list_of_words('stopwords.txt')
    new = []
    for e in listwords:
        if e.lower() not in sw:
            new.append(e.lower())
    fhash = []; ordd = []; fr = []; bow = []
    for e in new:
        fhash.append(fhashing(e,m))
    for i in range(len(fhash)):
        if fhash[i] not in ordd:
            ordd.append(fhash[i])
            fr.append(int(1))
        elif fhash[i] in ordd:
            fr[ordd.index(fhash[i])] += 1
    for i in range(len(ordd)):
        bow.append([ordd[i], fr[i]])
    bow.sort()
    return bow
x = ['y', 'Y', 'n', 'N' ]
file_name = input('File name = ')
hashing = input('Use feature hashing ? (y,Y,n,N) ')
while hashing not in x:
    print('Try again.')
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
if hashing in 'yY':
    m = int(input('M = '))
    print('-------------------')
    print('char count = '+str(char_count(file_name)))
    print('alphanumeric count = '+str(alphanumeric_count(file_name)))
    print('line count = '+str(line_count(file_name)))
    print('word count = '+str(len(list_of_words(file_name))))
    print('BoW =',feature_hashing(file_name))
elif hashing in 'nN':
    print('-------------------')
    print('char count = '+str(char_count(file_name)))
    print('alphanumeric count = '+str(alphanumeric_count(file_name)))
    print('line count = '+str(line_count(file_name)))
    print('word count = '+str(len(list_of_words(file_name))))
    print('Bow =',bag_of_words(file_name)) # 6330311621  (2021-03-18 22:11) %diff = 46.64
def num_all(fn):
    c = 0
    fn = open(fn,'r').read()
    for i in fn:
        if i != '\n':
            c += 1
    return c

def num_char(fn):
    out = ''
    fn = open(fn,'r').read()
    for i in fn:
        if 'a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9':
            out += i
    return len(out)

def num_line(fn):
    c = 0
    fn = open(fn,'r')
    for line in fn:
        c += 1
    return c

def num_word(fn):
    out = ''
    fn = open(fn,'r').read()
    for i in fn:
        if not('a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9'):
            i = ' '
        out += i
    word = out.split()
    return len(word)

def listword(fn):
    out = ''
    fn = open(fn,'r').read()
    for i in fn:
        if not('a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9'):
            i = ' '
        out += i
    out = out.lower().split()
    sw = open('stopwords.txt','r').read()
    t = sw.split()
    list_word = []
    for i in out:
        if not i in t:
            list_word.append(i)
    return list_word

def BoW(fn):
    word = listword(fn)
    t_word = []
    for i in word:
        if not i in t_word:
            t_word.append(i)
    slot = [0]*len(t_word)
    for i in word:
        if i in t_word:
            slot[t_word.index(i)] += 1
    f_word = []
    for i in range(len(slot)):
        f_word.append([t_word[i],slot[i]])
    return f_word

def fhash(w,M):
    num = 0
    for i in range(len(w)):
        num += ord(w[i])*(37**i)
    return num%int(M)

def BoW_fhash(fn,M):
    word1st = listword(fn)
    word = []
    for i in word1st:
        word.append(fhash(i,M))
    t_word = []
    for i in word:
        if not i in t_word:
            t_word.append(i)
    slot = [0]*len(t_word)
    for i in word:
        if i in t_word:
            slot[t_word.index(i)] += 1
    f_word = []
    for i in range(len(slot)):
        f_word.append([t_word[i],slot[i]])
    return f_word

file_name = input('File name = ')
choose = input('Use feature hashing ? (y,Y,n,N) ')
while not choose in 'nNyY':
    print('Try again.')
    choose = input('Use feature hashing ? (y,Y,n,N) ')
if choose in 'yY':
    M = input('M = ')
    print('-------------------')
    print('char count =',num_all(file_name))
    print('alphanumeric count =',num_char(file_name))
    print('line count =',num_line(file_name))
    print('word count =',num_word(file_name))
    print('BoW =',BoW_fhash(file_name,M))
else:
    print('-------------------')
    print('char count =',num_all(file_name))
    print('alphanumeric count =',num_char(file_name))
    print('line count =',num_line(file_name))
    print('word count =',num_word(file_name))
    print('BoW =',BoW(file_name)) # 6330564021  (2021-03-22 01:34) %diff = 46.64

#----------------------------------------------------------
def char_count(file_name):
    i = 0
    fn = open(file_name, 'r')
    for line in fn:
        if line[-1] == '\n':
            i += len(line[:-1])
        else:
            i += len(line)
    fn.close()
    return i

def alp_count(file_name):
    i = 0
    fn = open(file_name, 'r')
    for line in fn:
        for c in line.lower():
            if 'a' <= c <= 'z' or '0' <= c <= '9':
                i += 1
    fn.close()
    return i

def line_count(file_name):
    i = 0
    fn = open(file_name, 'r')
    for line in fn:
        i += 1
    fn.close()
    return i

def stop_words(stop_name):
    k = []
    fn = open(stop_name, 'r')
    for line in fn:
        k += line.lower().strip().split()
    fn.close()
    return k

def words(file_name):
    k = []
    fn = open(file_name, 'r')
    for line in fn:
        d = ''
        for c in line.lower():
            if 'a' <= c <= 'z' or '0' <= c <= '9':
                d += c
            else:
                d += ' '
        k += d.strip().split()
    fn.close()
    return k

def BoW(file_name, stop_name):
    a = words(file_name)
    b = stop_words(stop_name)
    k = []
    for c in a:
        if c in b:
            k.append(c)
    p = []
    for d in a:
        if d not in k:
            p.append(d)
    word_c = []
    word = []
    for i in range(len(p)):
        if p[i] in word_c:
            j = word_c.index(p[i])
            word[j] += 1
        else:
            word_c.append(p[i])
            word.append(1)
    r = []
    for i in range(len(word)):
        r.append([word_c[i],word[i]])
    return r

def  f_hashing(file_name, stop_name,M):
    a = words(file_name)
    b = stop_words(stop_name)
    k = []
    for c in a:
        if c in b:
            k.append(c)
    p = []
    for d in a:
        if d not in k:
            p.append(d)
    word_or1 = []
    for e in p:
        n = 0
        for i in range(len(e)):
            x = ord(e[i])
            n += x*((37)**i)
        word_or1.append(n%M)
    word_or1 = sorted(word_or1)
    word_or2 = []
    word_num = []
    for i in range(len(word_or1)):
        if word_or1[i] in word_or2:
            j = word_or2.index(word_or1[i])
            word_num[j] += 1
        else:
            word_or2.append(word_or1[i])
            word_num.append(1)
    z = []
    for i in range(len(word_num)):
        z.append([word_or2[i],word_num[i]])
    return z

#----------------------------------------------------------
stop_name = 'stopwords.txt'
file_name = input('File name = ')
s = 1
while s == 1:
    t = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if t == 'y' or t == 'n':
        s = 0
    else:
        print('Try again.')
        s = 1
if t == 'y':
    M = int(input('M = '))
    print('-------------------')
    print('char count =',char_count(file_name))
    print('alphanumeric count =',alp_count(file_name))
    print('line count =',line_count(file_name))
    print('word count =',len(words(file_name)))
    print('BoW =',f_hashing(file_name, stop_name,M))
else:
    print('-------------------')
    print('char count =',char_count(file_name))
    print('alphanumeric count =',alp_count(file_name))
    print('line count =',line_count(file_name))
    print('word count =',len(words(file_name)))
    print('BoW =',BoW(file_name, stop_name)) # 6330468521  (2021-03-22 16:42) %diff = 49.15
def char_count(file_name) :
    fin = open(file_name, "r")
    char_count = 0
    for line in fin :
        a = line.strip()
        char_count += len(a)
    fin.close()
    return char_count
def alphanumeric_count(file_name) :
    fin = open(file_name, "r")
    alphanumeric_count = 0
    for line in fin :
        for e in line.strip() :
            if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9" :
                alphanumeric_count += 1
            else :
                alphanumeric_count += 0
    fin.close()
    return alphanumeric_count
def line_count(file_name) :
    fin = open(file_name, "r")
    line_count = 0
    for line in fin :
        if len(line) > 0 :
            line_count += 1
    fin.close()
    return line_count
def word_count(file_name) :
    fin = open(file_name, "r")
    words = ""
    for line in fin :
        for e in line.strip() :
            if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9" :
                words += e
            else :
                words += " "
    words_list = words.split()
    word_count = len(words_list)
    fin.close()
    return word_count
def BOW_list(file_name) :
    fin = open(file_name, "r")
    words_of_BOW2 = []
    words_of_BOW = []
    for line in fin :
        words_of_BOW1 = ""
        for e in line.strip() :
            if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9" :
                words_of_BOW1 += e
            else :
                words_of_BOW1 += " "
        words_of_BOW2.append(words_of_BOW1.lower().split())
    for i in range(len(words_of_BOW2)) :
        for e in words_of_BOW2[i] :
            words_of_BOW.append(e)
    fin.close()
    fin1 = open("stopwords.txt", "r")
    stopwords_list1 = []
    stopwords_list = []
    for line in fin1 :
        stopwords_list1.append(line.strip().split())
    for i in range(len(stopwords_list1)) :
        for e in stopwords_list1[i] :
            stopwords_list.append(e)
    fin1.close()
    BOW_list = []
    BOW_list[:] = words_of_BOW
    for e in stopwords_list :
        k = 0
        while k < len(BOW_list) :
            if BOW_list[k] == e :
                BOW_list.pop(k)
            else :
                k += 1
    return BOW_list
def fhash(w, M) :
    G = 37
    s = 0
    for i in range(len(w)) :
        s += ord(w[i])*(G**i)
    s1 = s % int(M)
    return s1
def Bow_yY(p) :
    B = []
    for e in p :
        y = fhash(e, int(M))
        B.append(y)
    B1 = []
    B2 = []
    for e in B :
        x = 0
        for i in range(len(B)) :
            if e == B[i] :
                x += 1
        B1.append([e,x])
    B1.sort()
    for e in B1 :
        if e not in B2 :
            B2.append(e)
    return B2
def BOW_nN(v) :
    b = []
    b[:] = v
    list1 = []  #before
    for e in v :
        c = []
        n = 0
        for i in range(len(v)) :
            if b[i] == e :
                n += 1
        c.append(e)
        c.append(n)
        list1.append(c)
    BOW_nN = []
    for e in list1 :
        if e not in BOW_nN :
            BOW_nN.append(e)
    return BOW_nN
file_name = input("File name = ")
u = input("Use feature hashing ? (y,Y,n,N) ")
while u not in ["y","Y","n","N"] :
    print("Try again.")
    u = input("Use feature hashing ? (y,Y,n,N) ")
if u == "y" or u == "Y" :
    M = input("M = ")
    print("-------------------")
    print("char count = "+str(char_count(file_name)))
    print("alphanumeric count = "+str(alphanumeric_count(file_name)))
    print("line count = "+str(line_count(file_name)))
    print("word count = "+str(word_count(file_name)))
    p = BOW_list(file_name)
    print("BoW = "+str(Bow_yY(p)))
elif u == "n" or u == "N" :
    print("-------------------")
    print("char count = "+str(char_count(file_name)))
    print("alphanumeric count = "+str(alphanumeric_count(file_name)))
    print("line count = "+str(line_count(file_name)))
    print("word count = "+str(word_count(file_name)))
    v = BOW_list(file_name)
    print("BoW = "+str(BOW_nN(v))) # 6330245921  (2021-03-22 20:17) %diff = 49.75
#Prog-08: Bag-of-words
#6330245921 Teetat Karuhawanit

def somchai(c):
    v = open(c,'r')
    x = ''
    b = []
    for j in v.readlines():
        b+= [j.strip()]
    splitted = ''
    for i in b:
        splitted += i.lower()+' '
        x = splitted.split()
    v.close()
    return ' '.join(x)

def paisan(file_name):
    u = open(file_name)
    x = u.readlines()
    alphacount = 0
    for b in range(len(x)):
        x[b] = x[b].strip('\n').lower()
        for n in x[b]:
            if n in 'abcdefghijklmnopqrstuvwxyz0123456789':
                alphacount += 1
    u.close()
    return alphacount

def chate(file_name):
    z = 0
    c = somchai(file_name)
    for i in range(len(c)):
        z+=1
    return((z-thanarat(file_name))+1)

def thanarat(file_name):
    f = open(file_name,'r')
    v = f.readlines()
    f.close()
    return len(v)

def pannarai(file_name):
    d = somchai(file_name)
    c = len(d.split())
    return c

def sukree(file_name):
    a = somchai(file_name)
    b = somchai('stopwords.txt')
    x = ''
    for i in a:
        if i not in 'abcdefghijklmnopqrstuvwxyz0123456789':
            x += ' '
        else:
            x += i
    x = x.split()
    l = []
    for i in x:
        if i in b:
            l += []
        else:
            l += [i]
    return l

def fhash(W,M):
    x = 0
    s = 0
    G = 37
    for i in W:
        x += ord(i)*(G**s)
        s += 1
    d = x % M
    return d

def kirati():
    v = sukree(file_name)
    a = []
    b = []
    c = []
    d = 0
    for i in v:
        if i not in a:
            a.append(i)
            b.append(1)
        else:
            b[a.index(i)] += 1
    for i in a:
        c += [[i,b[d]]]
        d += 1
    return c

def parngod(M):
    a = []
    b = []
    c = []
    d = 0
    for i in sukree(file_name):
        if fhash(i,M) not in a:
            a += [fhash(i,M)]
            b.append(1)
        else:
            b[a.index(fhash(i,M))] += 1
    for i in a:
        c += [[i,b[d]]]
        d += 1
    return c

file_name = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ')
while x not in 'yYnN':
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
if x in 'Yy':
    M = int(input('M = '))
    print('-------------------')
    print('char count =',chate(file_name))
    print('alphanumeric count =',paisan(file_name))
    print('line count =',thanarat(file_name))
    print('word count =',pannarai(file_name))
    print('BoW =',parngod(M))
else:
    print('-------------------')
    print('char count =',chate(file_name))
    print('alphanumeric count =',paisan(file_name))
    print('line count =',thanarat(file_name))
    print('word count =',pannarai(file_name))
    print('BoW =',kirati()) # 6330412321  (2021-03-22 21:13) %diff = 49.8
file_name = input('File name = ')

def char_count(a):
    a = open(file_name,'r')
    cc = 0
    for line in a:
        cc += int(len(line.strip()))
    a.close()
    return cc

def alphanumeric_count(a):
    a = open(file_name,'r')
    alp = 0
    for line in a:
        for e in line:
            if '0' <= e <= '9' or 'A' <= e <= 'Z' or 'a' <= e <= 'z':
                alp += 1
    a.close()
    return alp
def line_count(a):
    a = open(file_name,'r')
    lc = 0
    for line in a:
        lc += 1
    a.close()
    return lc

def word_count(a):
    a = open(file_name,'r')
    k = ''
    wc = 0
    for line in a:
        for e in line:
            if (e not in 'abcdefghijklmnopqrstuvwxyz') and (e not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') and(e not in '0123456789'):
                k += ' '
            else:
                k += e
        words = k.split()
    wc += len(words)

    a.close()
    return wc
def BoW(a):
    a = open(file_name, "r")
    stop_words = open("stopwords.txt","r")
    new = ''
    for line in a:
        line = line.lower()
        for e in line:
            if (e in 'abcdefghijklmnopqrstuvwxyz') or (e in '0123456789'):
                new += e
            else:
                new += ' '
    new1 = new.split(' ')
    sw = ''
    for line in stop_words:
        line = line.lower()
        for e in line:
            if (e in 'abcdefghijklmnopqrstuvwxyz') or (e in '0123456789'):
                sw += e
            else:
                sw += ' '
    sw1 = sw.split(' ')

    new2 = []
    for e in new1:
        if e in sw1:
            new2.append('')
        else:
            new2.append(e)
    new3 = []
    for e in new2:
        if e != '':
            new3.append(e)
    u = []
    v= []
    for e in new3:
        if e not in u:
            u.append(e)
            v.append([e,1])
        else:
            t = u.index(e)
            v[t] = [e,v[t][1]+1]
    a.close()
    stop_words.close()
    return v
def fhash(w,M):
    s = 0
    for i in range(len(w)):
        s += int(ord(w[i])*((37)**i))
    fhash = s%M
    return fhash
def new_bow(a):
    s = BoW(a)
    u = []
    v = []

    for e in s:
        i = fhash(e[0],M)
        j = e[1]
        if i not in u:
            u.append(i)
            v.append([i,j])
        else:
            k = u.index(i)
            v[k] = [i,v[k][1]+j]
    v.sort()
    return v



fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh != 'n' and fh != 'N' and fh != 'y' and fh != 'Y':
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
if fh == 'n' or fh == 'N':
    print('-------------------')
    print('char count = '+str(char_count(file_name)))
    print('alphanumeric count = '+str(alphanumeric_count(file_name)))
    print('line count = '+str(line_count(file_name)))
    print('word count = '+str(word_count(file_name)))
    print('BoW = '+str(BoW(file_name)))
elif fh == 'y' or fh == 'Y':
    M = int(input('M = '))
    print('-------------------')
    print('char count = '+str(char_count(file_name)))
    print('alphanumeric count = '+str(alphanumeric_count(file_name)))
    print('line count = '+str(line_count(file_name)))
    print('word count = '+str(word_count(file_name)))
    print('BoW = '+str(new_bow(file_name)))
ALL: cluster #15 (2)
# 6330319721  (2021-03-22 17:04) %diff = 46.04
def fhash(w,M):
    G = 37
    frac = 0
    letters = list(w)
    for i in letters:
        frac += ord(i)*(G**(len(letters)-1))
    ans = frac % M
    return ans
#=============================================================
def stopwords():
    b = []
    stop = open('stopwords.txt')
    for line in stop:
        if line != "\n":
            line1 = line.strip('\n')
            line2 = line1.split(' ')
            for j in range(len(line2)):
                b.append(line2[j])
    stop.close()
    return b
#=============================================================
def text(file):
    file = open(file)
    a = ''
    for line in file:
        if line != "\n":
            line = line.lower()
            l = line.strip('\n')
            a += ''.join(l)+' '
    file.close()
    return a
#=============================================================
def char(file):
    file = open(file)
    char = ''
    for line in file:
        linex = line.strip()
        if linex != "\n":
            line = line.lower()
            l1 = line.strip('\n')
            char += ''.join(l1)
    file.close()
    ans = len(char)
    return ans
#=============================================================
def alphanum(cn):
    ans = ''
    for i in cn:
        if i == ' ':
            ans += ' '
        elif 48<=ord(i)<=57 or 97<=ord(i)<=122 or 65<=ord(i)<=90:
            ans += i
        else:
            ans += ' '
    return ans
#=============================================================
def line(file_name):
    file = open(file_name)
    ans = 0
    r = file.read()
    r1 = r.strip('\n')
    r2 = r1.split('\n')
    for i in r2:
        ans += 1
    file.close()
    return ans
#=============================================================
def BoW(file_name):
    a1 = file_name.split()
    ans = []
    num = 0
    for i in a1:
        for k in range(len(a1)):
            if i == a1[k]:
                num += 1
        a2 = [i,num]
        if a2 in ans:
            num = 0
        else:
            ans.append([i,num])
            num = 0
    return ans
#=============================================================
def BoWfhash(w,m):
    a1 = w.split()
    ans = []
    list1 = []
    for i in a1:
        feh = fhash(i,m)
        list1.append(feh)
    num = 0
    for j in list1:
        for k in range(len(list1)):
            if j == list1[k]:
                num+=1
        a2 = [j,num]
        if a2 in ans:
            num = 0
        else:
            ans.append(a2)
            num = 0
    return ans
#=============================================================
file_name = input('File name = ')
yn = input('use feature hashing ? (y,Y,n,N) ')
do = 0
b = stopwords()
a = text(file_name)
cn1 = alphanum(a)
cn2 = ''.join(cn1.split())
cut = ' '.join([i for i in cn1.split() if i not in b])
while yn != 'y' or yn != 'Y':
    if yn == 'n' or yn == 'N':
        break
    elif yn == 'y' or yn == 'Y':
        do = 1
        m = input('M = ')
        break
    else:
        print('Try again.')
        yn = input('Use feature hashing ? (y,Y,n,N) ')
if do == 1:
    print('-------------------')
    print('char count =', char(file_name))
    print('alphanumeric count =', len(cn2))
    print('line count =', line(file_name))
    print('word count =', len(a.split()))
    print('BoW =', BoWfhash(cut,int(m)))
else:
    print('-------------------')
    print('char count =', char(file_name))
    print('alphanumeric count =', len(cn2))
    print('line count =', line(file_name))
    print('word count =', len(cn1.split()))
    print('BoW =', BoW(cut)) # 6330354621  (2021-03-22 23:28) %diff = 46.04

file_name=input('File name = ')
#------------------------------------------------------------------------------
def stopwordtolist():
    b=[]
    z=open('stopword.txt')
    for line in z:
        if line != "\n":
            line1=line.strip('\n')
            line2=line1.split(' ')
            for j in range(len(line2)):
                b.append(line2[j]) #word chec use for b.o.w
    z.close()
    return b
#------------------------------------------------------------------------------
def alphanum(word):
    text=''
    for i in word: #alphanumeric
        if i==' ':
            text+=' '
        elif i in 'abcdefghijklmnopqrstuvwxyz0123456789':
            text+=i
        else:
            text+=' '
    return text
#------------------------------------------------------------------------------
def linecount(k):
    file=open(k)
    line_count = 0
    x=file.read()
    x1=x.strip('\n')
    x2=x1.split('\n')
    for i in x2:
        line_count+=1
    file.close()
    #for line in file:
       # if line != "\n":
            #line_count+=1
    return line_count
#------------------------------------------------------------------------------
def texttosent(file):
    file=open(file)
    a=''
    for line in file:
        if line != "\n":
            line=line.lower()
            l1=line.strip('\n')
            a+=''.join(l1)+' '
    file.close()
    return a
#------------------------------------------------------------------------------
def charcount(file):
    file=open(file)
    charcount =''
    for line in file:
        linex=line.strip()
        if linex != "\n":
            line=line.lower()
            l1=line.strip('\n')
            charcount+=''.join(l1)
    ans=len(charcount)
    file.close()
    return ans
#------------------------------------------------------------------------------
def allChar(l1):
    word_stick=''.join(l1.split())#find char count
    ans=alphanum(word_stick)

    return ans
#------------------------------------------------------------------------------
def BoW(word):#word= alpha
    a1=word.split()
    ans=[]
    num=0
    for i in a1:
        for k in range(len(a1)):
            if i == a1[k]:
                num+=1
        a2=[i,num]
        if a2 in ans:
            num=0
        else:
            ans.append([i,num])
            num=0

    return ans

#feature hashing---------------------------------------------------------------
def BoWfe(w,m):#cut_word='best times worst times age wisdom 555'
    a1=w.split()
    listall=[]
    list1=[]
    for i in a1:
        feh=fe(i,m)
        list1.append(feh)
    num=0
    for j in list1:
        for k in range(len(list1)):
            if j == list1[k]:
                num+=1
        a2=[j,num]
        if a2 in listall:
            num=0
        else:
            listall.append(a2)
            num=0
    return listall

#------------------------------------------------------------------------------
def fe(w,m):
    sum1=0
    k=0
    for i in w:
        sum1+=(ord(i)*(37**(w.find(i,0+k))))
        k+=1
    ans=sum1 % m
    return ans

#------------------------------------------------------------------------------

chose=0
choice=input('use feature hashing ? (y,Y,n,N) ')
while choice!='n' or choice!='N':
    if choice=='y' or choice=='Y':
        chose=1
        break
    if choice=='n' or choice=='N':
        chose=0
        break
    else:
        print('Try again.')
        choice=input('use feature hashing ? (y,Y,n,N) ')
if chose == 1:
    m=input('M = ')
    print('-------------------')
    a=texttosent(file_name)
    n=linecount(file_name)
    b=stopwordtolist()
    alpha=alphanum(a)
    alpha2=''.join(alpha.split())
    cut_word =' '.join([i for i in alpha.split() if i not in b])
    print('char count =', charcount(file_name))
    print('alphanumeric count =',len(alpha2))
    print('line count =', n)
    print('word count =', len(alpha.split()))
    print('BoW =',BoWfe(cut_word,int(m)))
elif chose == 0:
    print('-------------------')
    a=texttosent(file_name)
    n=linecount(file_name)
    b=stopwordtolist()
    alpha=alphanum(a)
    alpha2=''.join(alpha.split())
    cut_word =' '.join([i for i in alpha.split() if i not in b]) #word that already cut stopwords usr for b.o.w
    print('char count =', charcount(file_name))
    print('alphanumeric count =',len(alpha2))
    print('line count =', n)
    print('word count =', len(alpha.split()))
    print('BoW =',BoW(cut_word))
ALL: cluster #16 (3)
# 6330250021  (2021-03-20 11:58) %diff = 48.31

def fhash(w,M):
    n=0
    for i in range(len(w)):
        n+=ord(w[i])*37**i
    return n%M

file_name=input('File name = ')
fh=input('Use feature hashing ? (y,Y,n,N) ').lower()
while fh not in 'yn':
    print('Try again.')
    fh=input('Use feature hashing ? (y,Y,n,N) ').lower()
if fh=='y':
    M=int(input('M = '))
stop=open('stopwords.txt','r')
sw=[]
for line in stop:
    if len(line.strip())!=0:
        sw+=line.split()
stop.close()
ch=0
al=0
li=0
word=''
f=open(file_name,'r')
for line in f:
    li+=1
    ch+=len(line.strip())
    for a in line.lower():
        if '0'<=a<='9' or 'a'<=a<='z':
            word+=a
            al+=1
        else:
            word+=' '
wordlist=word.split()
wd=len(wordlist)
fhlist=[]
bow=[]
if fh=='n':
    for w in wordlist:
        if [w,wordlist.count(w)] not in bow and w not in sw:
            bow.append([w,wordlist.count(w)])
elif fh=='y':
    for w in wordlist:
        if w not in sw:
            fhlist.append(fhash(w,M))
    for x in fhlist:
        if [x,fhlist.count(x)] not in bow:
            bow.append([x,fhlist.count(x)])
bow.sort()
print('-------------------')
print('char count =',ch)
print('alphanumeric count =',al)
print('line count =',li)
print('word count =',wd)
print('BoW =',bow) # 6330507321  (2021-03-18 21:59) %diff = 48.31
def fhash(w,M):
    c=0
    for i in range(len(w)):
        c += ord(w[i])*(37**(i))
    c=c%int(M)
    return c
x=input('File name = ',)
y=input('Use feature hashing ? (y,Y,n,N) ',)
while y not in ['y','Y','n','N']:
    print('Try again.')
    y=input('Use feature hashing ? (y,Y,n,N) ',)
if y.lower()=='y':
    M=input('M = ',)
    k=[]

File=open(x,'r')
a=''
lc=0
chc=0
for line in File:
    for e in line.strip():
        if (('a'<=e.lower() and e.lower()<='z') or ('0'<= e<='9')):
            a+=e
        else:
            a+=' '
        chc+=1
    lc+=1
a=a.lower().split()
wc=len(a)
File.close()
stop=open('stopwords.txt','r')
b=''
for line in stop:
    b += line+' '
b=b.split()
alm=0
for e in a:
    l=len(e)
    for i in range(len(e)):
        if not (('a'<=e[i].lower() and e[i].lower()<='z') or ('0'<= e[i]<='9')):
            l-=1
    alm+=l
stop.close()
File=open(x,'r')
B=[]
for e in a:
    if not e in b:
        B.append(e)
B.sort()
B.append(' ')
h=1
j=[]
if y in['N','n']:
    for i in range(len(B)-1):
        if B[i]==B[i+1]:
            h+=1
        else:
            j.append([B[i],h])
            h=1

else:
    k=[]
    B.remove(' ')
    for e in B:
        k.append(fhash(e,M))
        k.sort()
        k.append(111)
    for i in range(len(k)-1):
        if k[i]==k[i+1]:
            h+=1
        else:
            j.append([k[i],h])
            h=1
File.close()
print('-------------------')
print('char count =',chc)
print('alphanumeric count =',alm)
print('line count =',lc)
print('word count =',wc)
print('BoW =',j) # 6330434121  (2021-03-22 17:39) %diff = 48.36

def flash(w,m):
    c = 0
    for i in range(len(w)):
        c = c + (ord(w[i])*(37**i))
    number = c%int(m)
    return number

file_name = input("File name = ")
feature = input("Use feature hashing ? (y,Y,n,N) ")
m = 0
if not feature == "y" and feature == "Y" and feature == "n" and feature == "N":
    a = 0
    while a == 0:
        print("Try again")
        feature = input("Use feature hashing ? (y,Y,n,N) ")
        if feature == "y" or feature == "Y" or feature == "n" or feature == "N":
            if feature == "y" or feature == "Y":
                m = input("M = ")
            break
else:
    if feature == "y" or feature == "Y":
        m = input("M = ")

stopword = []
stopwords = open("stopwords.txt", "r")
for line in stopwords:
    words = line.strip().split()
    for i in range(len(words)):
        stopword.append(words[i])
stopwords.close()

file = []
newword = ""
line_count = 0
word_count = 0
char_count = 0
alphanumeric_count = 0

files = open(file_name, "r",encoding="utf-8")
for line in files:
    line_count += 1
    char_count += len(line.strip("\n"))+1
    check_word = line.strip().lower()


    for i in range(len(check_word)):

        if check_word[i] not in "abcdefghijklmnopqrstuvwxyz0123456789":
            newword = newword+" "
        else:
            newword = newword + check_word[i]
            alphanumeric_count += 1
    file = file + newword.strip().split()
    newword = ""


files.close()
char_count = char_count - line_count
word_count = len(file)
new_file = []
for i in range(len(file)):
    if not file[i] in stopword:
        new_file.append(file[i])
new_file.sort()
bow = []
bow1 = []
c = 1
for i in range(len(new_file)-1):
    if new_file[i]==new_file[i+1]:
        c +=1
    else:
        bow.append([new_file[i],c])
        c = 1
if not m == 0:
    e = ""
    f = []
    for i in range(len(new_file)):
        e = flash(new_file[i],m)
        f.append(e)
    f.sort()
    for i in range(len(f)-1):
        if f[i]==f[i+1]:
            c +=1
        else:
            bow1.append([f[i],c])
            c = 1
    bow1.append([f[-1],c])
print("-------------------")
print("char count = ",char_count)
print("alphanumeric count = ",alphanumeric_count)
print("line count = ",line_count)
print("word count = ",word_count)
if m == 0:
    print("BoW = ",bow)
else:
    print("BoW = ",bow1)
ALL: cluster #17 (2)
# 6330402021  (2021-03-21 23:25) %diff = 48.69

def fhash(w,m):
    c = 0
    for i in range(len(w)) :
        c += ord(w[i])*(37**i)
    return c % m
#-------------------------------------------------------------------------------------
file_name = input("File name = ")

hashing = input("Use feature hashing ? (y,Y,n,N) ")
while hashing not in ["Y","y","n","N"]:
    print("Try again.")
    hashing = input("Use feature hashing ? (y,Y,n,N) ")
if hashing in ["Y","y"]:
    m = int(input("M = "))
print("-------------------")

stopword = open("stopword.txt","r")
sw = []
for i in stopword:
    word_char1 = ""
    for a in i:
        if a.isalpha() == True :
            word_char1 += a.lower()
        elif word_char1 != "" :
            sw.append(word_char1)
            word_char1 = ""
        else:
            word_char1 = ""
if word_char1.isalpha() == True :
    sw.append(word_char1)
stopword.close()

line = 0
char = 0
alnum = 0

file = open(file_name,"r")

for i in file:
    line += 1
    char += len(i)-1
    for a in i:
        if a.isalnum() == True :
            alnum += 1
        h = a
if h != "\n" :
    char += 1
file.close()
print("char count = ",char)
print("alphanumeric count = ",alnum)
print("line count = ",line)
file = open(file_name,"r")

word_char = []
for i in file :
    word_char1 = ""
    for a in i:
        if a.isalnum() == True :
            word_char1 += a.lower()
        elif word_char1 != "" :
            word_char.append(word_char1)
            word_char1 = ""
        else:
            word_char1 = ""
if word_char1.isalnum() == True :
    word_char.append(word_char1)

file.close()

word = len(word_char)
print("word count = ",word)

word_clear = []
for i in word_char:
    if i not in sw :
        word_clear.append(i)
BoW = []
for i in word_clear:
    bow_count = 0
    for a in range(len(word_clear)):
        if i == word_clear[a]:
            bow_count += 1
    if [i,bow_count] not in BoW :
        BoW.append([i,bow_count])

if hashing in "Nn":
    print("BoW = ",BoW)
elif hashing in "Yy":
    BoW_fhash = []
    for i in BoW :
        BoW_fhash.append([fhash(i[0],m),i[1]])
    BoW_fhash_clear = []
    for i in BoW_fhash:
        bow_fhash_count = 0
        for a in range(len(BoW_fhash)):
            if i[0] == BoW_fhash[a][0]:
                bow_fhash_count += BoW_fhash[a][1]
        if [i[0],bow_fhash_count] not in BoW_fhash_clear :
            BoW_fhash_clear.append([i[0],bow_fhash_count])

    print("BoW = ",BoW_fhash_clear) # 6330577221  (2021-03-22 02:52) %diff = 48.69
#Prog-08: Bag-of-Words
#6330577221 Name Akrachai Kovittayanun
def fhash (w,M):
    allord=0
    for i in range(len(w)):
        o=ord(w[i])*37**i
        allord+=o
    return allord%M

filename=input('File name = ')
bow=input('Use feature hashing ? (y,Y,n,N)) ')
while bow not in ('y','Y','n','N'):
    print('Try again.')
    bow=input('Use feature hashing ? (y,Y,n,N)) ')
if bow == 'y' or bow == 'Y':
    m=int(input('M = '))
print('-------------------')

stopwords=[]
s_file=open('stopwords.txt','r')
for line in s_file:
    for w in line.strip().split():
        stopwords.append(w)
s_file.close()

text=''
textalnum=''
wordstext=''
line_count=0
file = open(filename,'r')
for line in file:
    for e in line.strip():
        text+=e
    text+=' '
    line_count+=1
char=len(text)-line_count
print('char count =',char)

for e in text:
    if e.isalnum()!=True:
        textalnum+=''
    else:
        textalnum+=e
alnum=len(textalnum)
print('alphanumeric count =',alnum)

print('line count =',line_count)

for e in text:
    if e.isalnum()==True:
        wordstext+=e.lower()
    else:
        wordstext+=' '
wordslist=wordstext.strip().split()
wordcount=len(wordslist)
print('word count =',wordcount)

uniquelist=[]
for e in wordslist:
    if e not in stopwords:
        uniquelist.append(e)

fhashlist=[]
if bow=='y' or bow=='Y':
    for e in uniquelist:
        fhashwords=fhash(e,m)
        fhashlist.append(fhashwords)
    uniquelist=fhashlist

output=[]
point=0
for i in range(len(uniquelist)):
    if uniquelist[i] not in uniquelist[i+1:] and uniquelist[i] not in uniquelist[:i]:
        output.append([uniquelist[i],1])
    if uniquelist[i] in uniquelist[i+1:] and uniquelist[i] not in uniquelist[:i]:
        point+=1
        j=i+1
        while j in range(len(uniquelist)) and uniquelist[i] in uniquelist[j:]:
            point+=1
            j=uniquelist[j:].index(uniquelist[i])+j+1
        output.append([uniquelist[i],point])
    point=0
print('BoW =',output)
file.close()
ALL: cluster #18 (2)
# 6330266021  (2021-03-22 23:55) %diff = 49.41

def fhash(w,M):
    total = 0
    index = 0
    for i in w:
        if index == 0:
            total += ord(i)
        else:
            total += ord(i)*(37**(index-1))
        index += 1
    total = total%M
    return total

file_name = input("File name = ")
file_name = open(file_name, "r")

while True:
    feature = input("Use feature hashing ? (y,Y,n,N) ")
    if feature == 'Y' or feature == 'y' or feature == 'N' or feature == 'n':
        break;
    else:
        print("Try again.")

if feature == 'Y' or feature == 'y':
    M = int(input("M = "))

print("-----------------------")

with open('stopwords.txt', 'r') as stopwords:
    ban = stopwords.read().replace('\n', ' ')
ban = ban.split();

charCount = 0
alphanumbericCount = 0
lineCount = 0
wordCount = 0

temp = []
BoW = []
res = []

for text in file_name:
    charCount += len(text)

    for t in text:
        if t.isalnum():
            alphanumbericCount += 1

    text = text.lower()
    textArray = text.split()
    wordCount += len(textArray)

    resultwords  = [word for word in textArray if word not in ban]
    result = ' '.join(resultwords)
    result =  ''.join([i for i in result if i.isalnum() or ' ' in i])

    for word in result.split():
        temp.append(word)

    lineCount += 1

print("char count = ", charCount)
print("alphanumberic count = ",alphanumbericCount)
print("line count = " , lineCount)
print("word count = ",wordCount)
if feature == 'Y' or feature == 'y':
    fhashArray = []
    for x in temp:
        fhashArray.append(fhash(x,M))

    BoW = [fhashArray.count(w) for w in fhashArray]

    for i in zip(fhashArray, BoW):
        if i not in res:
            res.append(i)
    print("BoW = ", res)

else:
    BoW = [temp.count(w) for w in temp]

    for i in zip(temp, BoW):
        if i not in res:
            res.append(i)
    print("BoW = ", res) # 6330269021  (2021-03-21 21:55) %diff = 49.41


def fhash(w, m):
    s = 0
    for i in range(len(w)):
        s += ord(w[i]) * 37**i
    return s % m


fileName = input("File name = ").strip()
while True:
    fhashMode = input("Use feature hashing ? (y,Y,n,N) ").strip().lower()
    if fhashMode == 'y' or fhashMode == 'n':
        break
    else:
        print("Try again.")
if fhashMode == 'y':
    m = int(input("M = "))
print("-------------------")


stopWordsFile = open("stopwords.txt")
stopWords = []
for line in stopWordsFile:
    if line != "":
        for e in line.split():
            stopWords.append(e)
stopWordsFile.close()


inputFile = open(fileName)
chCount = 0
alnumCount = 0
lineCount = 0
wordCount = 0
BoW = []
wordTemp = ""
words = []
for line in inputFile:
    lineCount += 1
    for ch in line:
        chCount += 1
        if ch.isalnum():
            alnumCount += 1
            wordTemp += ch
        elif wordTemp != "":
            words.append(wordTemp)
            wordTemp = ""
    words.append(wordTemp)
inputFile.close()
wordCount = len(words)
print("char count =", chCount)
print("alphanumeric count =", alnumCount)
print("line count =", lineCount)
print("word count =", wordCount)


wordsLower = [e.lower() for e in words if e.lower() not in stopWords and e != ""]
wordsLowerNoDuplicate = []
for e in wordsLower:
    if e not in wordsLowerNoDuplicate:
        wordsLowerNoDuplicate.append(e)
for e in wordsLowerNoDuplicate:
    if fhashMode == 'y':
        BoW.append([fhash(e, m), wordsLower.count(e)])
    else:
        BoW.append([e, wordsLower.count(e)])
print("BoW =", BoW)