ALL (408)
# 6030048821 (0.00) 1 (2021-03-21 15:46)
import string
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` reduced modulo ``M``.

    The character at position ``i`` contributes ``ord(char) * 37**i``; the
    contributions are summed and the total is taken modulo ``M``.
    """
    BASE = 37
    total = sum(ord(ch) * BASE ** pos for pos, ch in enumerate(w))
    return total % M
def fileProcess(filename):
    """Read ``filename`` and gather simple text statistics.

    Every line is stripped and lower-cased before it is measured.  Each
    character of ``string.punctuation`` is then turned into a space so that
    it separates words.

    Returns a two-element list: the tuple
    ``(char_count, alnum_count, line_count, word_count)`` followed by the
    list of words in reading order.
    """
    # A single translation table maps every punctuation mark to a space.
    to_space = str.maketrans({mark: ' ' for mark in string.punctuation})

    with open(filename) as handle:
        raw_lines = handle.readlines()

    words = []
    n_chars = 0
    n_alnum = 0
    for raw in raw_lines:
        text = raw.strip().lower()
        n_chars += len(text)
        text = text.translate(to_space)
        # Anything that is not a space after the substitution is counted.
        n_alnum += len(text) - text.count(' ')
        words.extend(text.split())

    stats = (n_chars, n_alnum, len(raw_lines), len(words))
    return [stats, words]
def listProcess(words):
    """Return the items of ``words`` that are not stop words.

    The stop-word list is the word list that ``fileProcess`` extracts from
    ``stopwords.txt``.  The relative order of the kept words is unchanged.
    """
    stopwords = fileProcess('stopwords.txt')[1]
    return [word for word in words if word not in stopwords]
def bow(words, isFeature = False, M = 0):
    """Build a sorted bag-of-words from ``words``.

    Parameters:
        words: iterable of word strings.
        isFeature: when true, words are bucketed by ``fhash(word, M)``
            instead of by the word itself.
        M: modulus handed to ``fhash``; only used when ``isFeature`` is true.

    Returns:
        A list of ``[key, count]`` pairs sorted by key, where ``key`` is the
        word (or its hash bucket) and ``count`` its number of occurrences.
    """
    counts = {}
    for word in words:
        # Hash each word exactly once and tally in a dict; this avoids
        # rescanning the whole result list for every repeated word.
        key = fhash(word, M) if isFeature else word
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, count] for key, count in counts.items())
def main():
    """Prompt for a file and a hashing mode, then print the statistics.

    Asks for the file name, repeats the yes/no question until the stripped,
    lower-cased answer is 'y' or 'n', reads M when hashing is requested, and
    finally prints the four counters followed by the bag-of-words.
    """
    filename = input('File name = ').strip()

    while True:
        prompt = input('Use feature hashing ? (y,Y,n,N) ').strip().lower()
        if prompt in ['y','n']:
            break
        print('Try again.')

    use_hash = prompt == 'y'
    if use_hash:
        M = int(input('M = '))

    fp = fileProcess(filename)
    kept = listProcess(fp[1])
    Bow = bow(kept,True,M) if use_hash else bow(kept)

    counts = fp[0]
    print('-------------------')
    print('char count = '+str(counts[0]))
    print('alphanumeric count = '+str(counts[1]))
    print('line count = '+str(counts[2]))
    print('word count = '+str(counts[3]))
    print('BoW = '+str(Bow))

# Start the interactive program as soon as this script is executed.
main()


# 6030380021 (30.00) 2 (2021-03-22 22:36)

def fhash(w, M):
    """Hash ``w`` as ``sum(ord(w[i]) * 37**i)`` and reduce it modulo ``M``."""
    val = 0
    weight = 1  # 37**i for the current position
    for alpha in w:
        val += ord(alpha) * weight
        weight *= 37
    return val % M

# --- Interactive setup: file name and hashing mode --------------------------
file_name = input('File name = ').strip()
h = input('Use feature hashing ? (y,Y,n,N) ').strip()
# Re-prompt until the answer is one of n, N, y, Y.
while h != "n" and h != "N" and h != "y" and h != "Y":
    print("Try again.")
    h = input('Use feature hashing ? (y,Y,n,N) ').strip()
if h == "n" or h == "N":
    hh = False
elif h == "y" or h == "Y":
    hh = True
    try:
        M = int(input('M = '))
    except:
        # NOTE(review): bare except - any failure while reading M ends the
        # program without a message.
        exit()

# --- Stop words: every distinct whitespace-separated token of the file ------
stopword = []
with open("stopwords.txt") as f0:
    for i in f0.readlines():
        if i == "\n":
            continue
        datas = i.split()
        for data in datas:
            if data not in stopword:
                stopword.append(data)

# Characters treated as alphanumeric: a-z, A-Z and 0-9.
allwords = "abcdefghijklmnopqrstuvwxyz"
allwords += allwords.upper()
allwords += "0123456789"

charcount = 0
alphanumcount = 0
wordcount = 0
linecount = 0
bow = []      # [word, count] pairs, parallel to inbow
inbow = []    # words already present in bow (used for index lookup)
ffhash = []   # [hash, count] pairs, parallel to infhash
infhash = []  # hashes already present in ffhash

with open(file_name) as f1:
    for line in f1.readlines():
        # newline: the line with non-alphanumerics turned into spaces and
        # everything lower-cased.
        newline = ""
        for alpha in line:
            if alpha == "\n":
                # Newline characters are not counted as characters.
                continue
            if alpha not in allwords:
                newline += " "
            else:
                newline += alpha.lower()
                alphanumcount += 1
            charcount += 1
        newline = newline.split()
        for word in newline:
            if word not in stopword:
                if word not in inbow:
                    inbow.append(word)
                    bow.append([word, 1])
                else:
                    idx = inbow.index(word)
                    bow[idx][1] += 1
                if hh:
                    # The hashed bag is maintained alongside the plain one.
                    outcome = fhash(word, M)
                    if outcome not in infhash:
                        infhash.append(outcome)
                        ffhash.append([outcome, 1])
                    else:
                        idx = infhash.index(outcome)
                        ffhash[idx][1] += 1
            # Every word counts, whether or not it is a stop word.
            wordcount += 1
        linecount += 1
print('-------------------')
print('char count =', charcount)
print('alphanumeric count =', alphanumcount)
print('line count =', linecount)
print('word count =', wordcount)
if hh:
    print('BoW =', sorted(ffhash))
else:
    print('BoW =', sorted(bow))

# 6030924521 (20.15) 3 (2021-03-21 12:43)

def read_file(filename):
  """Read ``filename`` and return its basic text statistics.

  Returns the tuple ``(line_count, char_count, alnum_count, word_count, bow)``:
    * line_count  - number of lines in the file
    * char_count  - number of characters, newline characters excluded
    * alnum_count - number of characters for which ``str.isalnum`` is true
    * word_count  - number of words
    * bow         - the lower-cased words in reading order

  A word is a maximal run of alphanumeric characters.  Every other character
  acts as a separator, so "a,b" yields two words, and blank lines or runs of
  separators never produce empty-string words.
  """
  with open(filename) as f:
    lines = [raw.replace('\n', '') for raw in f]

  line_count = len(lines)
  char_count = sum(len(line) for line in lines)

  alnum_count = 0
  bow = []
  for line in lines:
    # Replace (rather than delete) non-alphanumerics so they split words.
    cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in line.lower())
    alnum_count += len(cleaned) - cleaned.count(' ')
    # split() without an argument drops the empty strings that split(' ')
    # would produce for consecutive separators and empty lines.
    bow.extend(cleaned.split())

  return line_count, char_count, alnum_count, len(bow), bow
def remove_stop_words(stop_file, bagofwords):
  """Return the words of ``bagofwords`` that do not appear in ``stop_file``.

  The stop words are the whitespace-separated tokens of ``stop_file``; they
  are compared exactly as written.  The order of the kept words is unchanged.
  """
  with open(stop_file) as f:
    stop_list = f.read().split()
  return [word for word in bagofwords if word not in stop_list]
def select_hashing():
  """Ask until the user answers y, Y, n or N and return that answer as typed."""
  while True:
    answer = input('Use feature hashing ? (y,Y,n,N) ')
    if answer in ('y', 'Y', 'n', 'N'):
      return answer
    print('Try again')
def fhashing(fhash_select, bow):
  """Return the sorted ``[key, count]`` list for ``bow``.

  When ``fhash_select`` is 'y' or 'Y' the modulus M is read from the user and
  each word is replaced by ``sum(ord(word[i]) * 37**i) % M`` before counting;
  otherwise the words themselves are counted.
  """
  if fhash_select not in ('y', 'Y'):
    return sorted(Counter_(bow))

  m = int(input('M = '))
  buckets = []
  for word in bow:
    score = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(word))
    buckets.append(score % m)
  return sorted(Counter_(buckets))

def Counter_(listt):
  """Count the items of ``listt``.

  Returns ``[item, count]`` pairs in order of first appearance.
  """
  seen = []
  tallies = []
  for item in listt:
    try:
      pos = seen.index(item)
    except ValueError:
      # First time this item shows up.
      seen.append(item)
      tallies.append(1)
    else:
      tallies[pos] += 1
  return [[item, total] for item, total in zip(seen, tallies)]
def test1():
  """Run the whole interactive pipeline and print the report.

  Reads the file name, computes the statistics with ``read_file``, removes
  the stop words listed in 'stopwords.txt', asks for the hashing mode and
  prints the four counters followed by the bag-of-words.
  """
  filename = input('File name = ')
  line_count, char_count, alnum_count, word_count, bow = read_file(filename)
  final_bow = remove_stop_words('stopwords.txt', bow)
  # The hashing question is asked only after the file has been processed.
  fhash_select = select_hashing()
  result = fhashing(fhash_select, final_bow)
  print('-------------------')
  print('char count =', char_count)
  print('alphanumeric count =', alnum_count)
  print('line count =', line_count)
  print('word count =', word_count)
  print('BoW =',result)

#========== Run Test ================================================


# Execute the interactive pipeline when the script is run.
test1()



#========== End Test ================================================
# 6130097621 (30.00) 4 (2021-03-22 22:09)

def fhash(w, M) :
    """Return ``sum(ord(w[i]) * 37**i)`` modulo ``M``."""
    total = 0
    for pos, ch in enumerate(w) :
        total += ord(ch) * 37 ** pos
    return total % M
def charcount(file_name) :
    """Return the total length of all lines minus the line count, plus one.

    ``file_name`` is an iterable of lines (for example an open file).
    """
    total_chars = 0
    total_lines = 0
    for line in file_name :
        total_lines += 1
        total_chars += len(line)
    return total_chars - total_lines + 1
def alphanumericcount(file_name) :
    """Count the characters A-Z, a-z and 0-9 over all lines of ``file_name``."""
    allowed = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    return sum(1 for line in file_name for ch in line if ch in allowed)
def linecount(file_name) :
    """Return how many lines the iterable ``file_name`` yields."""
    return sum(1 for _ in file_name)
def wordcount(file_name) :
    """Count the words in the lines of ``file_name``.

    Every character other than A-Z, a-z and 0-9 is treated as a separator.
    """
    allowed = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    pieces = []
    for line in file_name :
        pieces.append("".join(ch if ch in allowed else " " for ch in line))
    return len("".join(pieces).split())
def split1(stop1) :
    """Return the lower-cased words found in the lines of ``stop1``.

    Every character other than A-Z, a-z and 0-9 is treated as a separator.
    """
    allowed = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    pieces = []
    for line in stop1 :
        pieces.append("".join(ch if ch in allowed else " " for ch in line))
    return "".join(pieces).lower().split()
            
def bown(file_name) :
    """Return the sorted ``[word, count]`` list for the lines of ``file_name``.

    Words come from ``split1``; words that ``split1`` finds in the global
    ``stop`` are left out.
    """
    tokens = split1(file_name)
    blocked = split1(stop)
    kept = [tok for tok in tokens if tok not in blocked]

    # Distinct words in order of first appearance.
    distinct = []
    for tok in kept :
        if tok not in distinct :
            distinct.append(tok)

    table = [[tok, kept.count(tok)] for tok in distinct]
    table.sort()
    return table
def bowy(file_name) :
    """Return the sorted ``[hash, count]`` list for the lines of ``file_name``.

    Words come from ``split1``; words that ``split1`` finds in the global
    ``stop`` are left out, and each remaining word is replaced by
    ``fhash(word, M)`` using the global ``M``.
    """
    tokens = split1(file_name)
    blocked = split1(stop)
    hashed = [fhash(tok, M) for tok in tokens if tok not in blocked]

    # Distinct hash values in order of first appearance.
    distinct = []
    for value in hashed :
        if value not in distinct :
            distinct.append(value)

    table = [[value, hashed.count(value)] for value in distinct]
    table.sort()
    return table


# --- Interactive driver ------------------------------------------------------
file_name = str(input("File name = "))
# Global stop-word file handle read by bown()/bowy().
# NOTE(review): this handle, like the ones opened below, is never closed.
stop = open("stopwords.txt")

# Ask until a valid answer arrives; both valid branches print the report and
# terminate the program with exit().
while True :
    fh = input("Use feature hashing ? (y,Y,n,N) ")

    if fh == "Y" or fh == "y" :
        M = int(input("M = "))
        print("-------------------")
        # Each statistic consumes its own freshly opened handle of the file.
        print("char count =",charcount(open(file_name)))
        print("alphanumeric count =",alphanumericcount(open(file_name)))
        print("line count =",linecount(open(file_name)))
        print("word count =",wordcount(open(file_name)))
        print("BoW =",bowy(open(file_name)))
        exit()
    if fh == "N" or fh == "n" :
        print("-------------------")
        print("char count =",charcount(open(file_name)))
        print("alphanumeric count =",alphanumericcount(open(file_name)))
        print("line count =",linecount(open(file_name)))
        print("word count =",wordcount(open(file_name)))
        print("BoW =",bown(open(file_name)))
        exit()
    # NOTE(review): this chain of `!=` joined by `or` is true for every
    # string; it is only reached when neither branch above called exit().
    elif fh != "Y" or fh != "y" or fh != "N" or fh != "n" :
        print("Try again.")

# 6130917221 (22.99) 5 (2021-03-22 23:53)

# --- Interactive setup -------------------------------------------------------
# NOTE(review): despite its name, file_name holds the open file object.
file_name = open(input('File name = '), 'r')
fh = input('Use feature hashing ? (y,Y,n,N) ')
# `a` encodes the mode: 2 = hashing, 0 = no hashing, 1 = invalid answer.
if fh == 'y' or fh == 'Y':
    M = int(input('M = '))
    a = 2
elif fh == 'n' or fh == 'N':
    a = 0
else:
    a = 1
    # Re-prompt until the answer is valid.
    while a == 1:
        print('Try again')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
        if fh == 'y' or fh == 'Y':
            M = int(input('M = '))
            a = 2
        elif fh == 'n' or fh == 'N':
            a = 0
        else:
            a = 1
print('-------------------')
#---------------------------------------------------------------
# Stop words: lower-cased whitespace-separated tokens of stopwords.txt.
stop_words = open('stopwords.txt', 'r')
sw = []
for line in stop_words:
    sw += line.lower().strip().split()
#---------------------------------------------------------------
atoz = 'abcdefghijklmnopqrstuvwxyz'
no = '0123456789'
# cc = characters, al = alphanumerics, lc = lines, wc = words.
cc = al = lc = wc = 0
wiwc1 = ''   # word currently being assembled
wiwc2 = []   # finished pieces, may contain empty strings
wiwc3 = []   # wiwc2 without the empty strings
for line in file_name:
    # NOTE(review): '\\n' is a backslash followed by the letter n, not a
    # newline character, so this test only matches lines that literally end
    # with those two characters.
    if line[-2:] == '\\n':
        i = line[:-2].lower()
    else:
        i = line.lower()
    cc += len(i)
    lc += 1
    for e in i:
        if e in atoz or e in no:
            al += 1
            wiwc1 += e
        else:
            # Any other character (including the newline) ends the word.
            wiwc2.append(wiwc1)
            wiwc1 = ''
# NOTE(review): a word still held in wiwc1 when the file ends is never
# appended to wiwc2.
for i in wiwc2:
    if i != '':
        wiwc3.append(i)
wc = len(wiwc3)
# Subtract one character per line except one - presumably the newline that
# was included in len(i) above; verify for files ending with a newline.
cc = cc - (lc - 1)
print('char count =', cc)
print('alphanumeric count =', al)
print('line count =', lc)
print('word count =', wc)
#---------------------------------------------------------------
BoW = []
x = []   # every key (word or hash), duplicates included
y = []   # distinct keys in order of first appearance
z = []   # words that are not stop words
c = 0
for i in range(len(wiwc3)):
    if wiwc3[i] not in sw:
        z.append(wiwc3[i])
#---------------------------------------------------------------
def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    G = 37
    terms = [ord(ch) * (G ** pos) for pos, ch in enumerate(w)]
    return sum(terms) % M
#---------------------------------------------------------------
# Build the bag-of-words from z; a == 2 means hashing, a == 0 means plain words.
if a == 2:
    for i in z:
        b = fhash(i, M)
        x.append(b)
        if b not in y:
            y.append(b)
    for i in y:
        BoW.append([i, x.count(i)])
elif a == 0:
    for i in z:
        x.append(i)
        if i not in y:
            y.append(i)
    for i in y:
        BoW.append([i, x.count(i)])
# The pairs are printed in order of first appearance; no sort is applied.
print('BoW =', BoW)
# 6130924621 (14.05) 6 (2021-03-22 23:51)
def fhash(w,M):
  """Return ``sum(ord(w[i]) * 37**i)`` reduced modulo ``M``."""
  G = 37
  count = 0
  power = 1  # G**i for the current character
  for ch in w:
    count += ord(ch) * power
    power *= G
  return count % M

# --- Interactive setup -------------------------------------------------------
filename = input('File name = ')
yesorno = False
while not yesorno:
  isHashing = input('Use feature hashing ? (y,Y,n,N) ')
  # NOTE(review): `in` on a string is a substring test, so '' and 'Yy' are
  # accepted as well as the four single letters.
  if isHashing in 'YyNn':
    yesorno = True
  else:
    print('Try again.')

if isHashing in 'yY':
  m = int(input('M = '))

print('-------------------')
# NOTE(review): the file is never closed.
textFile = open(filename)
uselessChar = '(") ?.!/;:\"\'/\\,'
charCount = 0
textAndNumCount = 0
wordCount = 0
lineCount = 0
BoW = []
for line in textFile:
  lineCount += 1
  # Words are the pieces between single spaces of the stripped line.
  words = line.strip().split(' ')
  if words != ['']:
    wordCount += len(words)
    for word in words:
      # Only the characters inside words are counted; the separating spaces
      # are not.
      charCount += len(word)
      # NOTE(review): str.replace looks for the whole uselessChar string as
      # one substring, so individual punctuation marks are not removed here.
      cleanWord = word.replace(uselessChar,'').lower()
      for charecter in cleanWord:
        if 'a' <= charecter <= 'z' or '0' <= charecter <= '9':
          textAndNumCount += 1
      isFound = False
      if isHashing in 'yY':
        cleanWord = fhash(cleanWord,m)
      # Linear search for an existing [key, count] entry.
      for item in BoW:
        if item[0] == cleanWord:
          item[1] += 1
          isFound = True
      if not isFound:
        BoW.append([cleanWord, 1])
# No stop-word filtering and no sorting is applied in this script.
print('char count = ', charCount)
print('alphanumeric count = ', textAndNumCount)
print('line count = ', lineCount)
print('word count = ', wordCount)
print('BoW = ' ,BoW)

          
        
        



# 6230041021 (23.90) 7 (2021-03-21 16:09)

# --- Interactive setup -------------------------------------------------------
File_name = input('File name = ')
hashimg = input('Use feature hashing ? (y,Y,n,N) ')

# Both files are opened before the answer has been validated.
File = open(File_name, 'r')
stop = open('stopwords.txt', 'r')
# Loop until the answer is y/Y (then M is read into m) or n/N.
while True :
    if hashimg == 'y' or hashimg == 'Y':
        m = int(input('M = '))
        break
    elif hashimg == 'n' or hashimg == 'N':
        break
    else:
        print('Try again.')
        hashimg = input('Use feature hashing ? (y,Y,n,N) ')

#------------------------------------------------
def Cr_sent (text):
    """Split the lines of ``text`` into space-separated pieces.

    Returns ``(pieces, line_count, total_length)``.  Lines equal to a single
    newline contribute no pieces but are still counted.  Consecutive spaces
    produce empty-string pieces.  ``total_length`` is the sum of ``len(line)``
    over all lines, newline characters included.
    """
    pieces = []
    n_lines = 0
    n_chars = 0
    for line in text:
        if line != '\n':
            # Spaces and newlines both end a piece; an empty remainder after
            # the last separator is not kept.
            parts = line.replace('\n', ' ').split(' ')
            if parts[-1] == '':
                parts.pop()
            pieces.extend(parts)
        n_chars += len(line)
        n_lines += 1
    return(pieces, n_lines, n_chars)
def alpha (text):
    """Return the ASCII letters and digits of all items of ``text``, lower-cased and concatenated."""
    def _is_alnum(ch):
        return 'a' <= ch <= 'z' or '0' <= ch <= '9' or 'A' <= ch <= 'Z'

    # Leading and trailing punctuation needs no separate stripping: it is
    # dropped by the character filter anyway.
    return ''.join(ch for token in text for ch in token.lower() if _is_alnum(ch))
def findco (wordf,txt):
    """Count the occurrences of ``wordf`` in ``txt``, overlapping ones included.

    Returns 0 when ``wordf`` does not occur.  The counter is initialised up
    front, so an absent needle no longer raises ``UnboundLocalError``.
    """
    c = 0
    x = txt.find(wordf)
    while x != -1:
        c += 1
        # Advance by one character only, so overlapping matches are counted.
        x = txt.find(wordf, x + 1)
    return(c)
def bow (Bow_find):
   """Return ``[word, count]`` pairs for ``Bow_find`` in order of first appearance.

   Words are compared as whole tokens, so "the" is not counted inside "there"
   and a word that is a substring of an earlier word is no longer skipped.
   Empty-string tokens are ignored.
   """
   counts = {}
   for word in Bow_find:
       if word == '':
           # Empty tokens (from consecutive spaces) are not words.
           continue
       counts[word] = counts.get(word, 0) + 1
   # dicts keep insertion order, which gives first-appearance ordering.
   return [[word, total] for word, total in counts.items()]
def lowewr(File_sen_low):
    """Lower-case every item and strip surrounding punctuation from it.

    The characters ``( ) " ' \\ , .`` are stripped one after another in that
    order, exactly as a chain of ``str.strip`` calls would.
    """
    strip_order = ('(', ')', '"', "'", '\\', ',', '.')
    cleaned = []
    for item in File_sen_low:
        token = item.lower()
        for mark in strip_order:
            token = token.strip(mark)
        cleaned.append(token)
    return cleaned
def fash(File_c):
    """Return ``[bucket, count]`` pairs for the hashed words of ``File_c``.

    Each word is hashed as ``sum(ord(word[i]) * 37**i)`` modulo the global
    ``m``.  Only buckets that received at least one word are returned, in
    increasing bucket order.
    """
    G = 37
    hits = [0] * m
    for word in File_c:
        total = sum(ord(ch) * (G ** pos) for pos, ch in enumerate(word))
        hits[total % m] += 1
    return [[bucket, count] for bucket, count in enumerate(hits) if count != 0]
    

#-----------------------------------------------------

# Split both files into pieces and collect their line and character totals.
File_sen, line_File, Total_File = Cr_sent(File)
stop_sen, line_stop, Total_stop = Cr_sent(stop)

# Raw pieces that are not stop words.
# NOTE(review): File_cantstop is not used again in the visible code.
File_cantstop = []
for i in  File_sen:
    if i not in stop_sen:
        File_cantstop.append(i)

# Cleaned (lower-cased, punctuation-stripped) pieces that are not stop words.
# lowewr(stop_sen) is recomputed on every iteration.
flie_cantstop02 = []
for i in  lowewr(File_sen):
    if i not in lowewr(stop_sen):
        flie_cantstop02.append(i)

if hashimg == 'n' or hashimg == 'N':
    qwe = bow(flie_cantstop02)
else:
    qwe = fash(flie_cantstop02)


#---------------------------------------------

# Total_File includes one character per line break; subtracting the line
# count and adding one presumably assumes the last line has no trailing
# newline - verify against the input files.
print('char count =', (Total_File - line_File)+1)
print('alphanumeric count =', len(alpha(File_sen)))
print('line count =', line_File)
print('word count =', len(File_sen))
print('BoW =', qwe)

# 6230092021 (24.00) 8 (2021-03-21 22:31)
def fhash(w,M):
    """Return ``sum(ord(w[i]) * 37**i)`` modulo ``int(M)``.

    Each item of ``w`` is passed through ``str`` before ``ord`` and ``M`` is
    converted with ``int``.
    """
    total = 0
    for pos, item in enumerate(w):
        total += ord(str(item)) * 37 ** pos
    return total % int(M)

# --- Interactive driver: statistics and bag-of-words for one text file ------
file_name = input('File name = ')

# Keep asking until the answer is exactly one of y, Y, n, N.  Asking only
# once more and testing with substring matching would let invalid answers
# through and could leave M undefined when hashing is chosen on a retry.
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = fh in ('y', 'Y')
if use_hash:
    M = int(input('M = '))

char_count = 0
alph_count = 0
line_count = 0
words = []
with open(file_name,'r') as infile:
    for line in infile:
        # Counting is done on the lower-cased line without surrounding
        # whitespace.
        line = line.lower().strip()
        char_count += len(line)
        line_count += 1

        # Everything except a-z and 0-9 becomes a word separator.
        cleaned = ''
        for ch in line:
            if 'a' <= ch <= 'z' or '0' <= ch <= '9':
                alph_count += 1
                cleaned += ch
            else:
                cleaned += ' '
        words += cleaned.split()
word_count = len(words)

# Stop words: the whitespace-separated tokens of stopwords.txt.
with open('stopwords.txt','r') as st:
    stopw = st.read().split()

# Tally every non-stop word under its key: the word itself, or its hash
# bucket when feature hashing is on.  Both modes share this single pass.
counts = {}
for word in words:
    if word in stopw:
        continue
    key = fhash(word,M) if use_hash else word
    counts[key] = counts.get(key, 0) + 1
BoW = sorted([key, count] for key, count in counts.items())

print('char count = ' + str(char_count))
print('alphanumeric count = ' + str(alph_count))
print('line count = ' + str(line_count))
print('word count = ' + str(word_count))
print('BoW = ' +str(BoW))
# 6230131921 (20.55) 9 (2021-03-22 22:13)

def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w)) % M

# --- Interactive setup -------------------------------------------------------
file_name = input("File name = ")
# The same file is opened five times, one handle per pass over its lines.
# NOTE(review): none of these handles is closed in the visible code.
text1 = open(file_name,"r")
text2 = open(file_name,"r")
text3 = open(file_name,"r")
text4 = open(file_name,"r")
text5 = open(file_name,"r")
stp   = open("stopwords.txt","r")

x = 0   # character count
y = 0   # alphanumeric count
z = 0   # line count


# Ask until the answer is y/Y (then M is read into mInput) or n/N.
while True:
    fhinput = input("Use feature hashing ? (y,Y,n,N) ")

    if fhinput == "y" or fhinput == "Y":
        mInput= int(input("M = "))
        break


    elif fhinput == "n" or fhinput == "N":
        break

    else:
        print("Try again.")

# Pass 1: characters per line after stripping surrounding whitespace.
for line in text1:
    ls = line.strip()
    long = len(ls)
    x+= long


# Pass 2: ASCII letters and digits.
for line in text2:
    ls2 = line.strip()
    for e in ls2:
        if "0" <= e <= "9" or "a" <= e <= "z" or "A" <= e <= "Z":
            y+=1


# Pass 3: number of lines.
for line in text3:
    z += 1
 
 
 
def countword(text4):
    """Return the words found in the lines of ``text4``.

    A word is a maximal run of ASCII letters and digits; every other
    character ends the current word.  A word still open when the input ends
    is returned as well, so a file without a trailing newline does not lose
    its last word.
    """
    b = ""
    c = []
    for line in text4:
        for e1 in line:
            if "0" <= e1 <= "9" or "a" <= e1 <= "z" or "A" <= e1 <= "Z":
                b += e1
            elif b != "":
                c.append(b)
                b = ""
    # Flush the word that was not followed by any separator.
    if b != "":
        c.append(b)
    return(c)



# Report the counters; the word count consumes the text4 handle.
print("-------------------")
print("char count =",x)
print("alphanumeric count =",y)
print("line count =",z)
print("word count =",len(countword(text4)))

def BoW(text5):
    """Return the bag-of-words for the lines of ``text5``.

    Words come from ``countword`` and are lower-cased; words present in
    ``countword(stp)`` are dropped.  With the global ``fhinput`` set to y/Y
    the words are replaced by ``fhash(word, mInput)``.  The result is a list
    of ``[key, count]`` pairs in ascending key order.
    """
    stplist = countword(stp)

    lowered = [word.lower() for word in countword(text5)]
    g = sorted(word for word in lowered if word not in stplist)

    def _tally(ordered):
        # [item, occurrences] for each distinct item, keeping sorted order.
        pairs = []
        seen = []
        for item in ordered:
            if item not in seen:
                seen.append(item)
                pairs.append([item, ordered.count(item)])
        return pairs

    if fhinput == "y" or fhinput == "Y":
        flash = sorted(fhash(word, mInput) for word in g)
        final = _tally(flash)
    elif fhinput == "n" or fhinput == "N":
        final = _tally(g)

    return (final)
    
# Print the bag-of-words; this consumes the text5 handle.
print("BoW =",BoW(text5))
        
# 6230133121 (30.00) 10 (2021-03-20 03:19)
# --- Interactive setup -------------------------------------------------------
file_name = input('File name = ')
yn = input('Use feature hashing ? (y,Y,n,N) ')
# The answer is lower-cased, so only 'y' and 'n' need to be checked.
yn = yn.lower()
while yn not in ['y','n']:
    print('Try Again.')
    yn = input('Use feature hashing ? (y,Y,n,N) ')
    yn = yn.lower()
#--------------------------------
def char_count(file):
    """Return the number of characters in the file at path ``file``.

    Each line is stripped of surrounding whitespace (the newline included)
    before its length is added.  The file is closed when the function
    returns, even if reading fails.
    """
    with open(file,'r') as F:
        return sum(len(line.strip()) for line in F)
def stop_words(stop_file):
    """Return the lower-cased whitespace-separated tokens of ``stop_file``."""
    STOP = []
    with open(stop_file,'r') as ST:
        for line in ST:
            STOP.extend(line.strip().lower().split())
    return STOP
def BoW(word):
    """Return the sorted list of ``[item, count]`` pairs for the list ``word``."""
    distinct = []
    for item in word:
        if item not in distinct:
            distinct.append(item)
    bow = [[item, word.count(item)] for item in distinct]
    bow.sort()
    return bow
def fhash(a,M):
    """Return ``sum(ord(a[i]) * 37**i)`` reduced modulo ``M``."""
    f = 0
    for i, s in enumerate(a):
        f += ord(s) * 37 ** i
    return f % M
#---------------------------------
# --- Single pass over the input file -----------------------------------------
stop = stop_words('stopwords.txt')
fn = open( file_name ,'r')
word = []          # non-stop words, lower-cased, in reading order
alphanumeric = 0
linc = 0
wordc = 0
for line in fn:
    linc += 1
    # S: the line with every non-alphanumeric character replaced by a space.
    S = ''
    for s in line:
        if 'a' <= s <= 'z' or 'A' <= s <= 'Z' or '0' <= s <= '9':
            S += s
        else:
            S += ' '
    WORD = S.strip().lower().split()
    wordc += len(WORD)
    for e in WORD:
        # Words consist only of alphanumerics, so their lengths add up to
        # the alphanumeric count.
        alphanumeric += len(e)
        if e not in stop:
            word.append(e)
fn.close()
if yn == 'n':
    print('-------------------')
    print('char count =',char_count(file_name))
    print('alphanumeric count =',alphanumeric)
    print('line count =',linc)
    print('word count =',wordc)
    print('BoW =',BoW(word))
else:
    # Hashing mode: M is requested only now, after the file has been read.
    M = int(input('M ='))
    numword = []
    for e in word:
        numword.append(fhash(e,M))
    print('-------------------')
    print('char count =',char_count(file_name))
    print('alphanumeric count =',alphanumeric)
    print('line count =',linc)
    print('word count =',wordc)
    print('BoW =',BoW(numword))
# 6230153721 (0.00) 11 (2021-03-22 21:21)
# Characters that the code below treats as word separators.
o='.,\"\'[]:<>/#%!^{}$  +=*_-|&'
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    weighted = [ord(ch)*(37**pos) for pos, ch in enumerate(w)]
    return sum(weighted)%M
# --- Stop words ----------------------------------------------------------------
# Split stopwords.txt into lower-cased words, using the characters of `o` and
# the newline as separators.
# NOTE(review): the next line is read only when a '\n' is met, so a last
# line without a trailing newline would keep this loop running forever.
fin=open('stopwords.txt')
s=[]
line=fin.readline()
while line!='':
    k=''
    for i in line:
        if i not in o and i!='\n':
            k+=i
        elif i in o and i!='\n':
            s.append(k.lower())
            k=''
        elif i=='\n':
            if k!='':
                s.append(k.lower())
            line=fin.readline()

# --- Interactive setup -------------------------------------------------------
file_name=input('File name = ')
a=input('Use feature hashing ? (y,Y,n,N) ')
while a!='y' and a!='Y' and a!='n' and a!='N':
    print('Try again.')
    a=input('Use feature hashing ? (y,Y,n,N) ')
if a=='y' or a=='Y':
    # Hashing mode.  The separator line is printed before M is requested.
    print('-------------------')
    M=int(input('M = '))
    fine=open(file_name)
    character=0
    alpha=0
    lines=0
    line=fine.readline()
    # First pass: characters, alphanumerics and lines.  The same
    # advance-on-newline pattern as above is used here.
    while line!='':
        for i in range(len(line)):
            if line[i] !='\n':
                character+=1
            if 'A'<=line[i]<='Z' or 'a'<=line[i]<='z' or line[i] in '0123456789':
                alpha+=1
            if line[i]=='\n':
                lines+=1
                line=fine.readline()
    print('char count =',character)
    print('alphanumeric count =',alpha)
    # NOTE(review): one is subtracted from the number of newlines seen;
    # confirm the intended line count for the expected input files.
    print('line count =',lines-1)
    g=''      # word currently being assembled (lower-cased)
    bow=[]    # hash values already present in BoW
    BoW=[]    # [hash, count] pairs in order of first appearance
    word=0
    # Second pass: words and the hashed bag-of-words.
    fine=open(file_name)
    line=fine.readline()
    while line!='':
        for i in line:
            if i not in o and i!='\n':
                g+=i.lower()
            elif i in o and g not in s and fhash(g,M) not in bow and i!='\n' and g!='':
                # Separator after a new non-stop word: create its bucket.
                bow.append(fhash(g,M))
                BoW.append([fhash(g,M),1])
                word+=1
                g=''
            elif i in o and g not in s and fhash(g,M) in bow and i!='\n' and g!='':
                # Separator after a non-stop word whose bucket already exists.
                BoW[bow.index(fhash(g,M))][1]+=1
                word+=1
                g=''
            elif i in o and g in s and g!='':
                # Stop words count as words but do not enter the bag.
                word+=1
                g=''
            elif i=='\n':
                # End of line: flush the pending word, then read the next line.
                if g!='':
                     word+=1
                     if g not in s and fhash(g,M) not in bow:
                          bow.append(fhash(g,M))
                          BoW.append([fhash(g,M),1])
                          g=''
                     elif g not in s and fhash(g,M) in bow:
                           BoW[bow.index(fhash(g,M))][1]+=1
                           g=''
                     elif g in s:
                           g=''
                line=fine.readline()
    print('word count =',word)
    print('BoW =',BoW)
elif a=='n' or a=='N':
    # Plain mode: same two passes, with the words themselves as keys.
    print('-------------------')
    fine=open(file_name)
    character=0
    alpha=0
    lines=0
    line=fine.readline()
    while line!='':
        for i in range(len(line)):
            if line[i] !='\n':
                character+=1
            if 'A'<=line[i]<='Z' or 'a'<=line[i]<='z' or line[i] in '0123456789':
                alpha+=1
            if line[i]=='\n':
                lines+=1
                line=fine.readline()
    print('char count =',character)
    print('alphanumeric count =',alpha)
    print('line count =',lines-1)
    g=''      # word currently being assembled (lower-cased)
    bow=[]    # words already present in BoW
    BoW=[]    # [word, count] pairs in order of first appearance
    word=0
    fine=open(file_name)
    line=fine.readline()
    while line!='':
        for i in line:
            if i not in o and i!='\n':
                g+=i.lower()
            elif i in o and g not in s and g not in bow and i!='\n' and g!='':
                bow.append(g)
                BoW.append([g,1])
                word+=1
                g=''
            elif i in o and g not in s and g in bow and i!='\n' and g!='':
                BoW[bow.index(g)][1]+=1
                word+=1
                g=''
            elif i in o and g in s and g!='':
                word+=1
                g=''
            elif i=='\n':
                if g!='':
                    word+=1
                    if g not in s and g not in bow:
                        bow.append(g)
                        BoW.append([g,1])
                        g=''
                    # NOTE(review): this branch calls fhash(g,M), but M is
                    # not assigned in the 'n' path of this script.
                    elif g not in s and fhash(g,M) in bow:
                        BoW[bow.index(fhash(g,M))][1]+=1
                        g=''
                    elif g in s:
                        g=''
                line=fine.readline()
    print('word count =',word)
    print('BoW =',BoW)

    
    
                    

            
                
    
    
    



# 6230154321 (18.10) 12 (2021-03-22 20:23)
# --- Setup: file name, counters and working lists ----------------------------
x=input('File name = ')
ccount = 0   # characters
acount = 0   # alphanumerics
lcount = 0   # lines
wcount = 0   # words
ch='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
num='0123456789'
s=[]     # every character of the stripped lines
e=[]     # word pieces, may contain empty strings
f=''     # word currently being assembled
k=[]     # non-empty words
BOW=[]   # [key, count] pairs
BO=[]    # keys already present in BOW
p=[]     # words that are not stop words
infile=open(x,"r")
def fhash(w,M):
    """Return ``sum(ord(w[i]) * 37**i)`` reduced modulo ``M``."""
    return sum(ord(c)*(37**i) for i, c in enumerate(w))%M
# --- Pass over the input file ------------------------------------------------
for line in infile:
    lcount+=1
    # Characters are counted on the stripped line.
    for i in line.strip():
        s.append(i)
        ccount+=1
    # Alphanumerics are counted on the unstripped line.
    for  a in line:
        if 'A'<=a<='Z' or 'a'<=a<='z' or '0'<=a<='9':
            acount+=1
    for n in line.strip():

        if n in ch or n in ch.lower() or n in num  :
            f+=n
        elif n not in ch and n not in ch.lower() and n not in num:
            # A non-alphanumeric character ends the current word.
            e.append(f)
            f=''
    # NOTE(review): f is neither flushed nor reset at the end of a line, so
    # a word ending a line continues into the next line's first word.
for g in e:
    if g!='':
        k.append(g)

    wcount=len(k)
infile.close()
# Stop words: whitespace-separated tokens of stopwords.txt.
file=open('stopwords.txt','r')
d=[]
for line in file:
    d+=line.split()
file.close()
j=[]

# Keep the words whose lower-cased form is not a stop word; the original
# casing is preserved in p.
for u in k:
    if u.lower() not in d:
        p.append(u)
a=input('Use feature hashing ? (y,Y,n,N) ')
while a!='Y' and a!='y' and a!='n' and a!='N':
    print('Try again.')
    a=input('Use feature hashing ? (y,Y,n,N) ')
if a=='Y' or a=='y':
    m=int(input('M = '))
    for i in p:
        y = fhash(i,m)
        if y not in BO:
            BOW.append([y,1])
            BO.append(y)
        elif y in BO:
            # The inner loop reuses the name i for the BOW entries.
            for i in BOW:
                if i[0]==y:
                    i[1]+=1
elif a=='n' or a=='N':
    for i in p:
        if i not in BO:
            BOW.append([i,1])
            BO.append(i)
        elif i in BO:
            for v in BOW:
                if v[0]==i:
                    v[1]+=1
# The bag is printed in order of first appearance; no sort is applied.
print('-------------------')
print('char count = ',ccount)
print('alphanumeric count = ',acount)
print('line count = ',lcount)
print('word count = ',wcount)
print('BoW = ',BOW)


# 6230444321 (20.00) 13 (2021-03-22 23:15)
# Characters treated as alphanumeric by the functions below: 0-9, a-z and A-Z.
arabic = ["1","2","3","4","5","6","7","8","9","0","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"]
def Bow(n):
    """Return [[word, count], ...] for the text n.

    The words come from onlyword(n), which returns them sorted, so equal
    words are adjacent and are collapsed into a single pair.
    Returns an empty list when no word survives the filtering (the previous
    version raised IndexError in that case).
    """
    pairs = []
    for word in onlyword(n):
        if pairs and pairs[-1][0] == word:
            pairs[-1][1] += 1
        else:
            pairs.append([word, 1])
    return pairs
def onlyword(n):
    """Return the sorted, lower-cased words of n that pass the filter.

    Every whitespace separated token is reduced to its characters found in
    `arabic`; words shorter than three characters and the words of the fixed
    exclusion list are dropped.
    """
    cleaned = []
    for token in n.split():
        cleaned.append("".join(symbol for symbol in token if symbol in arabic))
    ordered = sorted(" ".join(cleaned).lower().split())
    excluded = ["was","she","them","they","this","the","there","are"]
    return [word for word in ordered if len(word) >= 3 and word not in excluded]
def fhash(w,M):
    """Return [[bucket, count], ...] sorted by bucket for the text w.

    Each word returned by onlyword(w) that is longer than two characters is
    hashed as sum(ord(c) * 37**i) modulo int(M); equal buckets are counted.
    Returns an empty list when there is nothing to hash (the previous
    version raised IndexError in that case).
    """
    modulus = int(M)
    buckets = sorted(
        sum(ord(c) * (37 ** pos) for pos, c in enumerate(word)) % modulus
        for word in onlyword(w)
        if len(word) > 2
    )
    pairs = []
    for bucket in buckets:
        # buckets is sorted, so equal values are adjacent
        if pairs and pairs[-1][0] == bucket:
            pairs[-1][1] += 1
        else:
            pairs.append([bucket, 1])
    return pairs
def charcount(n):
    """Return the number of characters in the text n, newlines excluded.

    The previous version subtracted the line count of the global `k` instead
    of the argument, so it only worked when called with that global.
    """
    return len(n) - n.count("\n")
def alphacount(n):
    """Return how many characters of n are listed in `arabic`."""
    return sum(1 for symbol in n if symbol in arabic)
def wordcount(n):
    """Return the number of maximal runs of `arabic` characters in n."""
    # Every other character acts as a separator.
    spaced = "".join(symbol if symbol in arabic else " " for symbol in n)
    return len(spaced.split())
def linecount(n):
    """Return one more than the number of "\\n" items found while iterating n."""
    newlines = sum(1 for item in n if item == "\n")
    return newlines + 1
# Interactive driver: ask for the file and the hashing option, then report.
n=input("File name = ")
n1 = input("Use feature hashing ? ")
while n1 not in ["y","Y","n","N"]:
    print("Try again.")
    n1 = input("Use feature hashing ? ")
use_hash = n1 in ["y","Y"]
if use_hash:
    # kept as text; fhash converts it with int()
    m = input("M = ")
# Read the whole file at once and close it right away.
with open(n,"r") as a:
    k = a.read()
print("char count = " + str(charcount(k)))
print("alphanumeric count = " + str(alphacount(k)))
print("line count = " + str(linecount(k)))
print("word count = " + str(wordcount(k)))
if use_hash:
    print("BoW = " + str(fhash(k,m)))
else:
    print("BoW = " + str(Bow(k)))
# 6230585121 (29.00) 14 (2021-03-22 22:06)

def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    weighted = (ord(letter) * (37 ** place) for place, letter in enumerate(w))
    return sum(weighted) % int(M)
def alphanum_count(a):
    """Return how many characters of a are ASCII letters or digits."""
    allowed = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    return sum(1 for symbol in a if symbol in allowed)
def word_list(a):
    """Split a into words, treating every non-alphanumeric character as a space."""
    allowed = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    spaced = ''.join(symbol if symbol in allowed else ' ' for symbol in a)
    return spaced.strip().split()

# Interactive driver: prompt, read stop words and the input file, report.
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against whole answers: a substring test such as `fh in 'yYnN'`
# would also accept '' and 'yY'.
while fh not in ('y', 'Y', 'n', 'N'):
    print("Try again.")
    fh = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = fh in ('y', 'Y')
if use_hash:
    # kept as text; fhash converts it with int()
    M = input('M = ')
print('-------------------')

# Stop words: whitespace separated tokens of stopwords.txt.
stop_w = []
with open("stopwords.txt", "r") as file:
    for line in file:
        stop_w += line.strip().split()

char_c = 0
alpha_c = 0
line_c = 0
word_c = 0
list_w = []
with open(file_name, "r") as infile:
    for line in infile:
        x = line.strip()
        char_c += len(x)
        alpha_c += alphanum_count(x)
        line_c += 1
        b = word_list(x)
        word_c += len(b)
        list_w.extend(word.lower() for word in b)
print('char count = ',char_c)
print('alphanumeric count = ',alpha_c)
print('line count = ',line_c)
print('word count = ',word_c)

# Count each key once with a dict instead of repeated list.count scans.
new_list = [word for word in list_w if word not in stop_w]
BoW = {}
for word in new_list:
    key = fhash(word, M) if use_hash else word
    BoW[key] = BoW.get(key, 0) + 1
print("BoW = ", [[key, BoW[key]] for key in sorted(BoW)])
    
    
    
# 6231004021 (24.00) 15 (2021-03-21 22:24)
def make_words(s):
    """Yield the pieces of s found between non-alphanumeric characters.

    Empty pieces are yielded too (for adjacent separators and at the ends).
    """
    start = 0
    for index in range(len(s)):
        if s[index].isalnum():
            continue
        yield s[start:index]
        start = index + 1
    yield s[start:]
 
 
def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo m, reducing after every term."""
    total = 0
    weight = 1
    for letter in w:
        total = (total + ord(letter) * weight) % m
        weight *= 37
    return total
 
def count_bow(bow, word):
    """Add one occurrence of word to bow (a list of [word, count]) and return bow."""
    for entry in bow:
        if entry[0] == word:
            entry[1] += 1
            break
    else:
        # word not seen before
        bow.append([word, 1])
    return bow
 
 
# Interactive driver: read the files, prompt for the hashing option, report.
file_name = input('File name = ')

# Read both files up front and close them immediately.
with open(file_name, 'r') as source:
    txt = source.read()
with open('stopwords.txt', 'r') as source:
    stop_words = source.read().split()

while True:
    answer = input('Use feature hashing ? (y,Y,n,N) ')
    if answer in ['y', 'Y', 'n', 'N']:
        break
    print('Try again.')

use_hash = answer in ['y', 'Y']
if use_hash:
    m = int(input('M = '))
print('-------------------')

# Join the lines with spaces so that words never run across a line break.
new_txt = ' '.join(txt.splitlines())
words = [word.lower() for word in make_words(new_txt) if len(word) > 0]

bow = []
for word in words:
    if word not in stop_words:
        bow = count_bow(bow, word)

if use_hash:
    # Merge the counts of words that fall into the same bucket. A dict also
    # copes with an empty bag, which used to raise IndexError.
    merged = {}
    for word, occurrences in bow:
        bucket = fhash(word, m)
        merged[bucket] = merged.get(bucket, 0) + occurrences
    bow = [[bucket, merged[bucket]] for bucket in merged]

bow = sorted(bow, key=lambda x: x[0])

print('char count =', sum([1 for c in txt if c != '\n']))
print('alphanumeric count =', sum([1 for c in txt if c.isalnum()]))
print('line count =', len(txt.splitlines()))
print('word count =', len(words))
print('BoW =', bow)
 
# print(stop_words)
 
 
# 6231008621 (30.00) 16 (2021-03-22 09:49)

def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M without building large numbers."""
    base = 37 % M
    result = 0
    # Horner evaluation from the last character down to the first.
    for letter in reversed(w):
        result = (result * base + ord(letter)) % M
    return result
def splitWords(line):
    """Return the lower-cased maximal runs of alphanumeric characters in line."""
    found = []
    current = []
    for symbol in line.lower():
        if symbol.isalnum():
            current.append(symbol)
        elif current:
            found.append("".join(current))
            current = []
    if current:
        found.append("".join(current))
    return found
def getStopWords():
    """Return the distinct words of stopwords.txt in order of first appearance."""
    unique = []
    with open("stopwords.txt", "r") as handle:
        for row in handle:
            for candidate in splitWords(row):
                if candidate in unique:
                    continue
                unique.append(candidate)
    return unique
def readFile(fp, stopWords):
    """Print the four counts for the lines of fp and return the word bag.

    The bag is a list of [word, count] in order of first appearance; words
    found in stopWords are counted as words but left out of the bag.
    """
    chars = alnums = lines = total_words = 0
    bag = []
    position = {}  # word -> index of its entry in bag
    for row in fp:
        # a trailing newline is not counted as a character
        chars += len(row) - (1 if row[-1:] == "\n" else 0)
        lines += 1
        for token in splitWords(row):
            alnums += len(token)
            total_words += 1
            if token in stopWords:
                continue
            if token in position:
                bag[position[token]][1] += 1
            else:
                position[token] = len(bag)
                bag.append([token, 1])
    print("char count =", chars)
    print("alphanumeric count =", alnums)
    print("line count =", lines)
    print("word count =", total_words)
    return bag
def readFileHash(fp, stopWords, M):
    """Print the four counts for the lines of fp and return the hashed bag.

    The result lists [bucket, count] for every non-empty bucket in
    increasing bucket order; words found in stopWords are not hashed.
    """
    chars = alnums = lines = total_words = 0
    buckets = [0] * M
    for row in fp:
        # a trailing newline is not counted as a character
        chars += len(row) - (1 if row[-1:] == "\n" else 0)
        lines += 1
        for token in splitWords(row):
            alnums += len(token)
            total_words += 1
            if token in stopWords:
                continue
            buckets[fhash(token, M)] += 1
    print("char count =", chars)
    print("alphanumeric count =", alnums)
    print("line count =", lines)
    print("word count =", total_words)
    return [[index, hits] for index, hits in enumerate(buckets) if hits != 0]



# Interactive driver. The input file is opened before the second prompt, as
# before, but is now closed once the report has been produced.
filename = input("File name = ")
with open(filename, "r") as fp:
    temp = input("Use feature hashing ? (y,Y,n,N) ")
    while temp not in ("y", "Y", "n", "N"):
        print("Try again.")
        temp = input("Use feature hashing ? (y,Y,n,N) ")

    if temp == "y" or temp == "Y":
        M = int(input("M = "))
        stopWords = getStopWords()
        bow = readFileHash(fp, stopWords, M)
    else:
        stopWords = getStopWords()
        bow = readFile(fp, stopWords)

print("BoW =", bow)
# 6231012021 (30.00) 17 (2021-03-22 22:51)

def BoW_N(x):
    """Return [[word, count], ...] for the words in x, in first-seen order.

    Each word is first reduced to its ASCII letters and digits.
    """
    cleaned = [
        ''.join(c for c in raw if ('a' <= c.lower() <= 'z') or ('0' <= c <= '9'))
        for raw in x
    ]
    slot = {}  # word -> index of its pair in bag
    bag = []
    for word in cleaned:
        if word in slot:
            bag[slot[word]][1] += 1
        else:
            slot[word] = len(bag)
            bag.append([word, 1])
    return bag
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(letter)*(37**place) for place, letter in enumerate(w)) % M

    
def all_fhash(lst,M):
    """Return [[bucket, count], ...] sorted by bucket for the words in lst.

    Each word is hashed with fhash(word, M). Counting is done in one pass
    with a dict and the result is sorted once, instead of re-scanning the
    hash list for every bucket and re-sorting after every insertion.
    """
    tally = {}
    for word in lst:
        bucket = fhash(word, M)
        tally[bucket] = tally.get(bucket, 0) + 1
    return [[bucket, tally[bucket]] for bucket in sorted(tally)]

#------------------------------------
# Interactive driver: prompt, read the input and the stop words, report.
file_name=input('File name = ')
# The first prompt and the retry message now match the retry prompt.
BoW=input('Use feature hashing ? (y,Y,n,N) ')
while BoW not in ['y','Y','n','N']:
    print('Try again.')
    BoW=input("Use feature hashing ? (y,Y,n,N) ")

char_count=0
line_count=0
string=''
with open(file_name) as file:
    for row in file:
        string+=str(row)
        char_count+=len(row.strip())
        line_count+=1

# Words are maximal runs of ASCII letters and digits of the lower-cased text.
word_chars='abcdefghijklmnopqrstuvwxyz1234567890'
word=''.join(c if c in word_chars else ' ' for c in string.lower()).split()
word_count=len(word)

# a counts letters, d counts digits; their sum is the alphanumeric count.
a=d=0
for c in string:
    if c.isalpha():
        a+=1
    elif c.isdigit():
        d+=1

# stopwords.txt is read once and closed.
stopword=[]
with open('stopwords.txt') as stopfile:
    for row in stopfile:
        stopword.extend(row.strip().split())

# Sort once after filtering instead of after every insertion.
no_stopword=sorted(w for w in word if w not in stopword)

use_hash = BoW in ['y','Y']
if use_hash:
    M=int(input('M = '))
print('-------------------')
print('char count =',char_count)
print('alphanumeric count =',a+d)
print('line count =',line_count)
print('word count =',word_count)
if use_hash:
    print('BoW =',all_fhash(no_stopword,M))
else:
    print('BoW =',BoW_N(no_stopword))

# 6231019521 (19.00) 18 (2021-03-22 15:34)
#-------------------------------------------------------------
def count(infile):
    """Return (char count, alphanumeric count, line count, word count, words).

    infile is iterated once, line by line. Characters are counted on the
    stripped line; alphanumerics are the lower-cased characters found in
    Alpha or Number; words are the whitespace separated tokens of the text.
    """
    rows = [row for row in infile]
    charcount = sum(len(row.strip()) for row in rows)
    alphanum = 0
    for row in rows:
        for symbol in row.lower():
            if symbol in Alpha or symbol in Number:
                alphanum += 1
    tokens = ''.join(str(row) for row in rows).split()
    return charcount, alphanum, len(rows), len(tokens), list(tokens)

#------------------------------------------------------------
def BoWN(worduse):
    """Return (bag, cleaned words) for the tokens in worduse.

    Tokens whose lower-cased form is in stopwo are dropped; the rest are
    reduced to characters found in Alpha (case-insensitively) or Number.
    The bag lists [word, count] in order of first appearance.
    """
    kept = [token for token in worduse if token.lower() not in stopwo]
    use = [
        ''.join(s for s in token if (s.lower() in Alpha) or (s in Number))
        for token in kept
    ]
    where = {}  # word -> index of its pair in bown
    bown = []
    for word in use:
        if word in where:
            bown[where[word]][1] += 1
        else:
            where[word] = len(bown)
            bown.append([word, 1])
    return bown,use

#----------------------------------------------------------------
def flash(w,M):
    """Return sum(ord(lower-cased w[i]) * 37**i) modulo int(M)."""
    total = sum(ord(letter.lower())*(37**place) for place, letter in enumerate(w))
    return total % int(M)

#----------------------------------------------------------------
def BowY(e):
    """Return [[value, count], ...] for the items of e in first-seen order."""
    where = {}  # value -> index of its pair in bag
    bag = []
    for value in e:
        if value in where:
            bag[where[value]][1] += 1
        else:
            where[value] = len(bag)
            bag.append([value, 1])
    return bag
#----------------------------------------------------------------
def forbowy(use,M):
    """Return the flash() hash of every word in use, in the same order."""
    return [flash(word, M) for word in use]

#----------------------------------------------------------------
# Lower-case ASCII letters recognised as word characters.
Alpha=['a','b','c','d','e','f','g','h','i','j','k','l',\
       'm','n','o','p','q','r','s','t','u','v','w','x','y','z']
# Decimal digits recognised as word characters. '0' was missing, so zeros
# were neither counted as alphanumeric nor kept inside words.
Number=['0','1','2','3','4','5','6','7','8','9']
#----------------------------------------------------------------
# Interactive driver: open the files, prompt for the option, print the report.
file_name=input('File name = ')
infile=open(file_name)
stop=open('stopwords.txt')
stopwo=[]
for row in stop:
    stopwo.extend(row.strip().split())

feature= input('Use feature hashing ? (y,Y,n,N) ')
while feature.upper() not in ('Y','N'):
    print('Try again.')
    feature= input('Use feature hashing ? (y,Y,n,N) ')

hashing = feature.upper()=='Y'
if hashing:
    M=input('M = ')
print('-------------------')
stats=count(infile)
if hashing:
    # the hashed bag is built before the counts are printed
    bag=BowY(forbowy(BoWN(stats[4])[1],M))
print('char count =',stats[0])
print('alphanumeric count =',stats[1])
print('line count =',stats[2])
print('word count =',stats[3])
if not hashing:
    bag=BoWN(stats[4])[0]
print('BoW =',bag)


infile.close()
stop.close()
    




# 6231205921 (24.85) 19 (2021-03-22 11:42)

# Interactive driver: prompt, read the stop words and the input file once,
# then print the four counts and the bag of words.
file_name = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
while b not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = b in ('y', 'Y')
if use_hash:
    m = int(input('M = '))

# Stop words as a list of whole tokens. They used to be kept in one string,
# so `w in stop` was a substring test and removed words such as "he"
# merely because "the" is a stop word.
with open("stopwords.txt", 'r') as fin:
    stop = fin.read().split()

# Read the input once (it used to be reopened for every statistic and never
# closed); each line loses its trailing newline.
with open(file_name, 'r') as file:
    lines = [line[:-1] if line[-1:] == '\n' else line for line in file]

print('-'*19)
print('char count =', sum(len(line) for line in lines))

alpha = 0
for line in lines:
    for c in line:
        if 'a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9':
            alpha += 1
print('alphanumeric count =',alpha)
print('line count =',len(lines))

# The listed punctuation is deleted (not replaced by a space); lines are
# joined with a space before splitting into lower-cased words.
cleaned = [''.join(c for c in line if c not in "\'\".,:;()") for line in lines]
words2 = ' '.join(cleaned).lower().split()
words = [w for w in words2 if w not in stop]
print('word count =',len(words2))

# Count with a dict instead of sentinel values (100000 / "###"), which could
# collide with a real bucket or word.
counts = {}
for w in words:
    if use_hash:
        key = sum(ord(ch)*(37**i) for i, ch in enumerate(w)) % m
    else:
        key = w
    counts[key] = counts.get(key, 0) + 1
print('BoW =', [[key, counts[key]] for key in sorted(counts)])


    


# 6231207121 (30.00) 20 (2021-03-22 03:12)

# Name of the file to analyse, asked once at start-up; FILRead opens it by this name.
file_name = str(input("File name = "))
def FHashOrNot() :
    """Ask until the user answers y/Y (returns True) or n/N (returns False)."""
    while True :
        answer = input("Use feature hashing ? (y,Y,n,N) ")
        if answer in ("y", "Y") :
            return True
        if answer in ("n", "N") :
            return False
        print("Try again.")
def STWRead() :
    """Return the lower-cased alphanumeric words found in stopwords.txt."""
    pieces = []
    fin = open("stopwords.txt", "r")
    for row in fin :
        for symbol in row :
            keep = "a" <= symbol <= "z" or "A" <= symbol <= "Z" or "0" <= symbol <= "9"
            # anything that is not a letter or digit separates words
            pieces.append(symbol.lower() if keep else " ")
    fin.close()
    return "".join(pieces).split()
def FILRead(p) :
    """Read the file named by file_name and return the view selected by p.

    "PURE" -> the text exactly as read,
    "LST"  -> the list of lower-cased alphanumeric words,
    "STR"  -> those words concatenated without separators.
    Any other value of p yields None.
    """
    raw = []
    spaced = []
    fin = open(file_name, "r")
    for row in fin :
        for symbol in row :
            raw.append(symbol)
            if "a" <= symbol <= "z" or "A" <= symbol <= "Z" or "0" <= symbol <= "9" :
                spaced.append(symbol.lower())
            else :
                spaced.append(" ")
    fin.close()
    word_list = "".join(spaced).split()
    if p == "PURE" :
        return "".join(raw)
    if p == "LST" :
        return word_list
    if p == "STR" :
        return "".join(word_list)
def CharCount() :
    """Print the number of characters in the file, newlines excluded."""
    n_char = sum(1 for symbol in FILRead("PURE") if symbol != "\n")
    print("char count =",n_char)
def AlphCount() :
    """Print the number of ASCII letters and digits in the file."""
    n_alph = sum(
        1 for symbol in FILRead("STR")
        if "A" <= symbol <= "Z" or "a" <= symbol <= "z" or "0" <= symbol <= "9"
    )
    print("alphanumeric count =",n_alph)
def LineCount() :
    """Print the number of lines in the file.

    Every newline ends a line; a non-empty final line without a newline is
    counted as well. An empty file reports 0 (the previous version raised
    NameError because its loop variable was never bound).
    """
    text = FILRead("PURE")
    n_line = text.count("\n")
    if text and not text.endswith("\n") :
        n_line += 1
    print("line count =",n_line)
def WordCount() :
    """Print the number of words returned by FILRead("LST")."""
    print("word count =",len(FILRead("LST")))
    
def BOW() :
    """Print the bag of words as a sorted list of [key, count].

    Stop words are removed first; when FH is true every remaining word is
    replaced by fhash(word, M_) before counting.
    """
    stop_words = STWRead()
    keys = [word for word in FILRead("LST") if word not in stop_words]
    if FH == True :
        keys = [fhash(word, M_) for word in keys]
    keys.sort()
    bag = []
    for key in keys :
        # keys is sorted, so equal keys are adjacent
        if bag and bag[-1][0] == key :
            bag[-1][1] += 1
        else :
            bag.append([key, 1])
    print("BoW =",bag)
def fhash(w,M) :
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(letter)*37**place for place, letter in enumerate(w)) % M

# ===== Run =====
# Ask for the options, then print every report in the fixed order.
FH = FHashOrNot()
if FH :
    M_ = int(input("M = "))
print("-------------------")
for report in (CharCount, AlphCount, LineCount, WordCount, BOW) :
    report()
# 6231214521 (24.95) 21 (2021-03-22 18:30)
# Interactive driver, part 1: prompt, read the file, print the counts and
# prepare `words` / `out_stword` for the bag-of-words section further down.
file_name = input("File name = ")
feature = input("Use feature hashing ? (y,Y,n,N) ").upper()
while feature not in ("Y", "N"):
    print("Try again.")
    feature = input("Use feature hashing ? (y,Y,n,N) ").upper()
if feature == "Y":
    M = int(input("M = "))
print("-"*19)

# Lower-cased lines without their newline characters; the file is closed
# as soon as it has been read.
with open(file_name,"r") as fn:
    lines = ["".join(c for c in line.lower() if c != "\n") for line in fn]

char_count = sum(len(row) for row in lines)
print("char count =",char_count) 

alphanum = 0
for row in lines:
    for e in row:
        if "a" <= e <= "z" or "0" <= e <= "9":
            alphanum +=1
print("alphanumeric count =",alphanum)
print("line count =",len(lines))

# The listed punctuation separates words; so does the end of every line.
words = " ".join(
    "".join(" " if e in "\"\'/\\,.:;()[]{}" else e for e in row)
    for row in lines
).split()

print("word count =",len(words))

# Lower-cased, whitespace separated stop words.
with open("stopwords.txt","r") as stop_file:
    stword = stop_file.read().lower().split()

out_stword = [w for w in words if w not in stword]
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(letter)*(37**place) for place, letter in enumerate(w)) % M


                
# Interactive driver, part 2: build and print the bag in first-seen order.
if feature in ("n", "N"):
    keys = out_stword
    # occurrences of a plain word are counted over the full word list
    pool = words
else:
    keys = [fhash(e, M) for e in out_stword]
    pool = keys

distinct = []
for key in keys:
    if key not in distinct:
        distinct.append(key)
bow = [[key, pool.count(key)] for key in distinct]
print("BoW =",bow)

              
    
    
        
        
        
            
   
           
    
        

    

    
            
            




    
            
               
    
    
    
    
    
    

    
    


    
    
    
    
            
    





# 6231220221 (26.85) 22 (2021-03-21 22:35)

def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    total = sum(ord(letter)*(37**place) for place, letter in enumerate(w))
    return total % int(M)
def char_count(file):
    """Return the total length of the stripped lines of the file at path `file`."""
    with open(file) as handle:
        return sum(len(row.strip()) for row in handle)
def alphanumeric_count(file):
    """Return how many characters of the file are neither whitespace nor listed punctuation."""
    separators = "\\\"\'-()[].,><?:;#@!$%^&*_+="
    pieces = []
    with open(file) as handle:
        for row in handle:
            for symbol in row:
                pieces.append(' ' if symbol in separators else symbol)
    # splitting discards all whitespace; what is left is what gets counted
    return len(''.join(''.join(pieces).strip().split()))
def line_count(file):
    """Return the number of lines in the file at path `file`."""
    with open(file) as handle:
        return sum(1 for _ in handle)
def word_count(file):
    """Return the number of words in the file at path `file`.

    The listed punctuation characters and whitespace separate words. The
    file is now closed when the function returns (it used to be left open).
    """
    separators = "\\\"\'-()[].,><?:;#@!$%^&*_+="
    cleaned = []
    with open(file) as handle:
        for row in handle:
            cleaned.append(
                ''.join(' ' if symbol in separators else symbol for symbol in row.strip())
            )
    # joining with a space keeps words on different lines apart
    return len(' '.join(cleaned).split())
    
def BoW(file,YN):
    """Return the bag of words of the file at path `file`.

    With YN == 'N' the result is a sorted list of [word, count]; otherwise
    it is a list of [fhash(word, M), count] in increasing hash order, where
    M is read from module scope. Tokens of stopwords.txt are excluded.
    """
    separators = "\\\"\'-()[].,><?:;#@!$%^&*_+="
    with open(file) as text_file, open('stopwords.txt') as stop_file:
        stop_tokens = [token for row in stop_file for token in row.strip().split()]
        tokens = []
        for row in text_file:
            lowered = row.strip().lower()
            spaced = ''.join(' ' if symbol in separators else symbol for symbol in lowered)
            tokens.extend(spaced.split())

    if YN == 'N':
        distinct = []
        for token in tokens:
            if token not in stop_tokens and token not in distinct:
                distinct.append(token.lower())
        return sorted([word, tokens.count(word)] for word in distinct)

    hashes = [fhash(token.lower(), M) for token in tokens if token not in stop_tokens]
    return [[value, hashes.count(value)] for value in sorted(set(hashes))]
                

# Interactive driver: prompt until a valid answer is given, then report.
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
while fh not in ('Y', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
if fh == 'Y':
    M = int(input('M = '))
print('-------------------')

for label, report in (
    ('char count =', char_count),
    ('alphanumeric count =', alphanumeric_count),
    ('line count =', line_count),
    ('word count =', word_count),
):
    print(label, report(file_name))
print('BoW =',BoW(file_name,fh))
# 6231222521 (20.10) 23 (2021-03-22 16:35)

def fhash(s,M) :
    """Return sum(ord(s[i]) * 37**i) modulo M, reducing after every term."""
    total = 0
    for place, letter in enumerate(s) :
        total = (total + ord(letter)*(37**place)) % M
    return total

# Interactive driver: prompt, read the stop words and the input, report.
# Characters counted as alphanumeric.
an = "abcdefghijklmnopqrstuvwxyz"
an += an.upper()
an += "0123456789"

file_name = input("File name = ").strip()
# Characters removed from every token before it is counted.
symbol = '''!@#$%^&*()_+':./'",'''
while True :
    yn = input("Use feature hashing ? (y,Y,n,N) ").strip()
    if yn.lower() in ("y", "n") :
        break
    print("Try again.")

with open("stopwords.txt","r") as f2 :
    stopwords = [token.lower() for row in f2 for token in row.strip().split()]

if yn.lower() == "y" :
    M = int(input("M = "))

char_count = 0
alphanum_count = 0
line_count = 0
word_count = 0
used = []   # distinct tokens in first-seen order
count = []  # count[i] is the number of occurrences of used[i]

with open(file_name,"r") as file :
    for line in file :
        line = line.strip()
        line_count += 1
        char_count += len(line)
        alphanum_count += sum(1 for c in line if c in an)
        tokens = line.split()
        # Count the words of every line. This used to run once after the
        # loop, and after the counts had already been printed.
        word_count += len(tokens)
        for token in tokens :
            for s in symbol :
                token = token.replace(s,"")
            if token in used :
                count[used.index(token)] += 1
            else :
                used.append(token)
                count.append(1)
print("char count =",char_count)
print("alphanumeric count =",alphanum_count)
print("line count =",line_count)
print("word count =",word_count)

bow = [[used[i],count[i]] for i in range(len(used)) if used[i].lower() not in stopwords]
bow.sort()

if yn.lower() == "y" :
    # Merge the counts per bucket. A dict replaces the fixed list of 37
    # slots, which raised IndexError for any M greater than 37.
    merged = {}
    for word, occurrences in bow :
        bucket = fhash(word,M)
        merged[bucket] = merged.get(bucket, 0) + occurrences
    bow = [[bucket, merged[bucket]] for bucket in sorted(merged)]

print("BoW =",bow)
# 6231223121 (30.00) 24 (2021-03-22 12:01)
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
    return sum(ord(letter)*(37**place) for place, letter in enumerate(w)) % int(M)
# Interactive driver: prompt, read the stop words and the input, report.
file_name=input('File name = ')
bow=input('Use feature hashing ? (y,Y,n,N) ')
# Compare against whole answers; the old substring test against 'y,Y,n,N'
# also accepted '', ',' and 'y,Y'.
while bow not in ('y','Y','n','N'):
    print('Try again.')
    bow=input('Use feature hashing ? (y,Y,n,N) ')
use_hash = bow in ('y','Y')
if use_hash:
    M=int(input('M = '))
print('-'*19)

# Lower-cased stop words.
stop=[]
with open('stopwords.txt') as f:
    for line in f:
        stop+=line.lower().split()

# nsam1: lower-cased raw lines; sam: words made of a-z and 0-9.
sam=[]
nsam1=[]
with open(file_name) as f1:
    for line in f1:
        line=line.lower()
        nsam1.append(line)
        spaced=''.join(x if ('a'<=x<='z' or '0'<=x<='9') else ' ' for x in line)
        sam+=spaced.split()

# The trailing newline of a line is not counted as a character.
char_count=sum(len(tt) for tt in nsam1)-sum(1 for tt in nsam1 if tt.endswith('\n'))
print('char count = '+str(char_count))
alpha=sum(len(t) for t in sam)
print('alphanumeric count = '+str(alpha))
# One entry per line read; this also covers an empty file, which used to
# raise IndexError.
line_count=len(nsam1)
print('line count = '+str(line_count))
word_count=len(sam)
print('word count = '+str(word_count))

out=[word for word in sam if word not in stop]
BoW=[]
if use_hash:
    # Keep the hashes as integers. They used to be appended as strings with
    # `+=`, which split every multi-digit hash into single digits.
    tally={}
    for word in out:
        bucket=fhash(word,M)
        tally[bucket]=tally.get(bucket,0)+1
    BoW=[[bucket,tally[bucket]] for bucket in sorted(tally)]
else:
    wd=[]
    for word in out:
        if word not in wd:
            wd.append(word)
    BoW=[[word,out.count(word)] for word in wd]
print('BoW = '+str(BoW))

      
            
        

        
        

# 6231510221 (21.40) 25 (2021-03-22 11:23)
#Prog-08: Bag-of-words
#6231510221 (21.40) Pleumpiti Pholphakwaen

# Characters that may appear in a word (the text is lower-cased before use).
alpha = 'abcdefghijklmnopqrstuvwxyz0123456789'
# Lower-cased lines of stopwords.txt; the file is closed once it is read.
with open('stopwords.txt') as _stop_file:
    stop = _stop_file.read().lower().splitlines()
# Flat list of stop words.
stop_word = []
for line in stop:
    stop_word += line.split()
def fhash(w,m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    total = 0
    for place in range(len(w)):
        total += ord(w[place])*37**place
    return total%m
def convert(words,m,hash):
    """Return words without stop words, each replaced by fhash(word, m) when hash is true."""
    kept = [word for word in words if word not in stop_word]
    if hash:
        return [fhash(word,m) for word in kept]
    return kept
def get_unique(words):
    """Return [[item, count], ...] for words in order of first appearance."""
    where = {}  # item -> index of its pair in pairs
    pairs = []
    for item in words:
        if item in where:
            pairs[where[item]][1] += 1
        else:
            where[item] = len(pairs)
            pairs.append([item,1])
    return pairs

# Interactive driver: read the input, prompt for the option, report.
m=0
file_name = input('File name = ')
# Lower-cased lines without line terminators; the file is closed at once.
with open(file_name) as source:
    file = source.read().lower().splitlines()
use_hash = False
while (True):
    command = input('Use feature hashing ? (y,Y,n,N) ')
    if command.lower() == 'y':
        use_hash = True
        m = int(input('M = '))
        break
    elif command.lower() == 'n':
        break
    else:
        print('Try again.')
print('-'*19)
count_char  = 0
count_alpha = 0
words = []
for line in file:
    word = ''
    for c in line:
        if c in alpha:
            count_alpha+=1
            word+= c
        else:
            if word != '':
                words.append(word)
            word = ''
        count_char+=1
    # A line break ends the current word. Without this flush the last word
    # of a line was glued to the first word of the following line.
    if word != '':
        words.append(word)
print('char count =',count_char)
print('alphanumeric count =',count_alpha)
print('line count =',len(file))
print('word count =',len(words))
words = get_unique (convert(words,m,use_hash))
print('BoW =',words)
# 6231511921 (26.55) 26 (2021-03-22 19:24)

def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M).

    *M* may be an int or a numeric string; it is converted with int().
    """
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w)) % int(M)
def to_word(w):
    """Split *w* into lower-cased runs of ASCII letters and digits.

    Every character outside a-z / 0-9 (after lower-casing) acts as a
    separator.  Returns the runs in order, duplicates included.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    output  = []
    word = ""
    for i in w:
        p = i.lower()
        if p in allowed:
            word += p
        elif word != "":
            output.append(word)
            word = ""
    # Flush the trailing run unconditionally.  The previous version skipped
    # it when the same word was already in the output, so "a-a" gave ['a'].
    if word != "":
        output.append(word)
    return output

file_name = input("File name = ")
while True:
    x = input("Use feature hashing ? (y,Y,n,N) ")
    # Compare against the four exact answers.  A substring test such as
    # `x in "yYnN"` would also accept "" and "yY".
    if x in ("y", "Y", "n", "N"):
        break
    print("Try again.")
use_hashing = x in ("y", "Y")
if use_hashing:
    M = input("M = ")  # kept as text; fhash() converts it with int()

# Load the stop words (lower-cased, whitespace separated).
stop_word = []
with open("stopwords.txt","r") as f1:
    for line in f1:
        stop_word += line.lower().strip().split()

char_c = 0
alpha_c = 0
line_c = 0
word_c = 0
BoW = []   # [word, count] pairs
chw = []   # words already present in BoW, in the same order
with open(file_name,"r") as f2:
    for line in f2:
        char_c += len(line.strip())
        for i in line.lower().strip().split():
            for j in to_word(i):
                # Stop words still count as words and characters; they are
                # only excluded from the bag of words.
                if j not in stop_word:
                    if j not in chw:
                        chw += [j]
                        BoW += [[j,1]]
                    else:
                        BoW[chw.index(j)][1] += 1
                alpha_c += len(j)
                word_c += 1
        line_c += 1
print("-------------------")
print("char count =",char_c)
print("alphanumeric count =",alpha_c)
print("line count =",line_c)
print("word count =",word_c)
if use_hashing:
    # Merge the counts of words that fall into the same hash bucket.
    BoW_n = []
    cBoW_n = []
    for i in BoW:
        q = fhash(i[0],M)
        if q not in cBoW_n:
            BoW_n.append([q,i[1]])
            cBoW_n.append(q)
        else:
            BoW_n[cBoW_n.index(q)][1] += i[1]
    print("BoW =",sorted(BoW_n))
else:
    print("BoW =",sorted(BoW))
# 6231707621 (24.45) 27 (2021-03-20 16:55)
def save(x):
    """Return the lines of *x* with selected punctuation replaced by spaces.

    *x* is any iterable of strings (for example an open text file).  Each
    character found in the punctuation set below becomes a single space;
    all other characters, newlines included, are kept.
    """
    punctuation = '\'\"();:.,?/\\<>=[]{}'
    return [
        ''.join(' ' if ch in punctuation else ch for ch in line)
        for line in x
    ]
def char(x):
    """Return the number of characters in the lines *x*, newlines excluded.

    The previous formula (1 + sum(len(line) - 1)) was only right when every
    line except the last ended in a newline; it over-counted by one for a
    file with a trailing newline and returned 1 for empty input.
    """
    return sum(len(line.rstrip('\n')) for line in x)
def alphanumeric(x):
    """Return how many characters in the strings of *x* are ASCII letters or digits."""
    return sum(
        1
        for line in x
        for ch in line
        if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'
    )
def word(x):
    """Return every whitespace-separated token of the strings in *x*, sorted."""
    tokens = [token for line in x for token in line.split()]
    tokens.sort()
    return tokens
def count(z):
    """Run-length encode *z* into ``[item, count]`` pairs.

    Consecutive equal items are grouped, so *z* should already be sorted
    when one pair per distinct item is wanted.  *z* itself is left
    untouched (the previous version appended a sentinel to the caller's list).
    """
    bow=[]
    for item in z:
        if bow and bow[-1][0]==item:
            bow[-1][1]+=1
        else:
            bow.append([item,1])
    return bow
    
def bow(x,m):
    """Build the bag of words for the sorted word list *x*.

    Stop words are read from 'stopwords.txt' (cleaned with save() and
    tokenised with word()); the comparison is case-insensitive.  When *m* is
    None the result counts the words themselves; otherwise each word is
    replaced by sum(ord(c) * 37**i) % int(m) and the hash values are counted.
    Returns the list produced by count().
    """
    f=open('stopwords.txt')
    data2=save(f)
    f.close()
    stopwords=[s.lower() for s in word(data2)]
    z=[w for w in (e.lower() for e in x) if w not in stopwords]
    if m is None:
        return count(z)
    # Hash every remaining word, then sort so that count() sees equal
    # hashes next to each other.
    bow0=sorted(
        sum(ord(ch)*(37**i) for i,ch in enumerate(w))%int(m)
        for w in z
    )
    return count(bow0)
        
a=input('File name = ')
b=input('Use feature hashing ? (y,Y,n,N) ')
while b not in ['y','Y','n','N']:
    print('Try again.')
    b=input('Use feature hashing ? (y,Y,n,N) ')
# m stays None when hashing is off; otherwise it is the raw text of M,
# which bow() converts with int().
m=None if b in ['n','N'] else input('M = ')
print('-------------------')
with open(a,'r') as File_name:
    data=save(File_name)
print('char count =',char(data))
print('alphanumeric count =',alphanumeric(data))
# print() already inserts a space after each label, so the labels must not
# end with one (the old 'line count = ' printed two spaces before the value).
print('line count =',len(data))
words=word(data)
print('word count =',len(words))
print('BoW =',bow(words,m))






# 6231709921 (30.00) 28 (2021-03-21 19:38)

def fhash(w,m) :
    """Return sum(ord(w[i]) * 37**i) modulo *m*."""
    acc = 0
    # Horner evaluation from the last character back to the first gives the
    # same polynomial value as summing ord(w[i]) * 37**i.
    for ch in reversed(w) :
        acc = acc * 37 + ord(ch)
    return acc % m
def unique(list1):
    """Return the distinct items of *list1* in order of first appearance."""
    distinct = []
    for item in list1:
        if item in distinct :
            continue
        distinct.append(item)
    return distinct
        
# Characters that make up a word; anything else is turned into a space.
alp = 'abcdefghijklmnopqrstuvwxyz0123456789'
file_name = input('File name = ')
feat = input('Use feature hashing ? (y,Y,n,N) ')
# Repeat the question until one of the four accepted answers is given.
while feat not in ['y','Y','n','N'] :
    print('Try again.')
    feat = input('Use feature hashing ? (y,Y,n,N) ')
if feat in ['y','Y'] :
    m = int(input('M = '))
print('-------------------')

infile = open(file_name,'r')
instop = open('stopwords.txt','r')

sent = ''      # current line with non-alphanumerics replaced by spaces
words = []     # every word of the input file, lower-cased
lc = 0         # line count
cc = 0         # character count (of the stripped lines)
ac = 0         # alphanumeric character count
wc = 0         # word count
for line in infile :
    # strip() removes the newline and any leading/trailing whitespace,
    # so those characters are not included in cc.
    line = line.strip()
    lc += 1
    for e in line :
        cc += 1
        e = e.lower()
        if e in alp :
            sent += e
        else :
            sent += ' '
    spi = sent.split()
    for e in spi :
        ac += len(e)
    words += spi
    sent = ''
    wc = len(words)
infile.close()

print('char count = '+str(cc))
print('alphanumeric count = '+str(ac))
print('line count = '+str(lc))
print('word count = '+str(wc))

# Tokenise the stop-word file exactly like the input file.
s = ''
stw = []
for line in instop :
    line = line.strip()
    for e in line :
        e = e.lower()
        if e in alp :
            s += e
        else :
            s += ' '
    sss = s.split()
    stw += sss
    s = ''
instop.close()

# Keep only the words that are not stop words, sorted for the output.
cutw = []
for e in words :
    if e not in stw :
        cutw.append(e)
cutw.sort()

if feat in ['n','N'] :
    # Bag of words over the words themselves.
    bow = []
    aws = unique(cutw)
    for e in aws :
        bow.append([e,cutw.count(e)])
    print('BoW = '+str(bow))
    
elif feat in ['y','Y'] :
    # Bag of words over the hash values, sorted numerically.
    fha = []
    bow = []
    for e in cutw :
        fha.append(fhash(e,m))
    fha.sort()
    aws = unique(fha)
    for e in aws :
        bow.append([e,fha.count(e)])
    print('BoW = '+str(bow))
# 6231718521 (30.00) 29 (2021-03-22 21:21)
def Bagofword(w):
    """Return ``[item, count]`` pairs for *w* in order of first appearance."""
    B = []
    for pos, item in enumerate(w):
        # Skip items that already appeared earlier in the sequence.
        if item in w[:pos]:
            continue
        B.append([item, sum(1 for other in w if item == other)])
    return B
def fhash(z,M):
    """Return sum(ord(z[i]) * 37**i) modulo *M*."""
    codes = [ord(s) for s in z]
    total = sum(code * 37 ** i for i, code in enumerate(codes))
    return total % M
#--------------------------------
file_name = input('File name = ')
t = input('Use feature hashing ? (y,Y,n,N) ')
t = t.lower()
# After lower-casing only 'y' and 'n' are valid answers.
while t not in ['y','n']:
    print('Try Again.')
    t = input('Use feature hashing ? (y,Y,n,N) ')
    t = t.lower()
# Load the stop words, lower-cased, into S.
stop = open('stopwords.txt','r')
S = []
for line in stop:
    stopword = line.strip().lower().split()
    for e in stopword:
        S.append(e)
stop.close()
file = open( file_name ,'r')
W = []    # words of the input file that are not stop words
al = 0    # alphanumeric character count
ch = 0    # character count (lines are stripped before counting)
lc = 0    # line count
wc = 0    # word count, stop words included
for line in file:
    ch += len(line.strip())
    lc += 1
    A = ''
    line = line.lower()
    # Replace everything except a-z and 0-9 with a space, then split.
    for s in line:
        if 'a' <= s <= 'z' or '0' <= s <= '9':
            A += s
        else:
            A += ' '
    w = A.strip().split()
    wc += len(w)
    for e in w:
        al += len(e)      
        if e not in S:
            W.append(e)   
file.close()
if t == 'n':
    print('-------------------')
    print('char count = '+str(ch))
    print('alphanumeric count = '+str(al))
    print('line count = '+str(lc))
    print('word count = '+str(wc))
    print('BoW = '+str(Bagofword(W)))
else:
    # M is requested only after the file has been processed.
    M = int(input('M ='))
    # Hash every remaining word, then count the hash values.
    N = []
    for e in W:
        N.append(fhash(e,M))
    print('-------------------')
    print('char count = '+str(ch))
    print('alphanumeric count = '+str(al))
    print('line count = '+str(lc))
    print('word count = '+str(wc))
    print('BoW = '+str(Bagofword(N)))
# 6330170421 (22.95) 30 (2021-03-21 17:50)
# Ask for the input file and the hashing choice; the answer in `fh` is
# validated further down, after the file has been processed.
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
# Words of the input file that are not stop words (filled in below).
nsw = []
def remove_punc(t):
    """Return *t* with the characters " ' / \\ ( ) . , ; : deleted."""
    return ''.join(ch for ch in t if ch not in "\"\'/\\().,;:")
# Load the stop words (lower-cased, whitespace separated) into sw.
fn = open("stopwords.txt", "r")
sw = []
for line in fn:

    line = line.lower()
    line = line.strip().split()
    for i in range (len(line)):

        sw.append(line[i])

fn.close()

fs = open(file_name,"r")
sp = []      # stripped lines with punctuation removed
char = []    # stripped lines, unmodified
line_count = 0

for line in fs:
    line = line.strip()
    char.append(line)
    line1 = remove_punc(line)
    sp.append(line1)
    line_count += 1

fs.close()

# Character count = total length of the stripped lines.
chx = ''.join(char)        
char_count = len(chx)    

# Words are the whitespace-separated tokens left after remove_punc();
# alpha_count is the total length of those tokens.
x = ' '.join(sp)
y = x.split()
z = ''.join(y)
word_count = len(y)
alpha_count = len(z)


# Lower-case the words and keep those that are not stop words.
yl = []
for i in range (len(y)):
    yl.append(y[i].lower())

for i in range (len(yl)):
    if yl[i] not in sw:

        nsw.append(yl[i])
def bowf(c):
    """Return how many entries of the module-level list ``nsw`` equal *c*."""
    return sum(1 for entry in nsw if c == entry)
def fhashf(c):
    """Return how many entries of the module-level list ``b2`` equal *c*."""
    return sum(1 for entry in b2 if c == entry)




# Accept only the four exact answers.  The old test `fh not in 'y,Y,n,N'`
# was a substring check, so '', ',' or 'y,Y' slipped through and the
# program then printed nothing at all.
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')

if fh == 'y' or fh == 'Y':
    M = input('M = ')
    def fhash(w,M):
        """Return sum(ord(w[i]) * 37**i) modulo int(M)."""
        nfh = 0
        for i in range (len(w)):
            nfh += (int(ord(w[i]))*(37**i))
        nfh = nfh%int(M)
        return nfh

    # b2: hash of every non-stop word; b3: distinct hashes in order of
    # first appearance; fhashf() counts occurrences inside b2.
    BoW = []
    b2 = [fhash(w, M) for w in nsw]
    b3 = []
    for h in b2:
        if h not in b3:
            b3.append(h)
    for h in b3:
        BoW.append([h, fhashf(h)])
else:
    # b1: distinct non-stop words in order of first appearance;
    # bowf() counts occurrences inside nsw.
    b1 = []
    BoW = []
    for w in nsw:
        if w not in b1:
            b1.append(w)
    for w in b1:
        BoW.append([w, bowf(w)])

print('-------------------')
print('char count = '+str(char_count))
print('alphanumeric count = '+str(alpha_count))
print('line count = '+str(line_count))
print('word count = '+str(word_count))
print('BoW = '+str(BoW))
        
         


        
        
        
        
        
    






        
    
        


# 6330171021 (15.00) 31 (2021-03-21 13:00)
#-------------------------------------------
def wordAlpha_count(file_name) :
    """Return (words, alphanumeric_count) for the input text.

    The text is taken from the module-level ``readFilename`` string; the
    *file_name* argument is accepted but not used.  A word is a maximal run
    of characters from a-z / 0-9, and the count is the total length of all
    words.
    """
    eng = "abcdefghijklmnopqrstuvwxyz0123456789"
    list_txt = []
    txt = ''
    for i in readFilename:
        if i in eng:
            txt += i
        elif txt != '' :
            list_txt.append(txt)
            txt = ''
    # Flush the last run: text that ends in a letter or digit (no trailing
    # newline or punctuation) used to lose its final word.
    if txt != '' :
        list_txt.append(txt)
    return list_txt, len(''.join(list_txt))
#----------------------------------------------
def line_count(file_name) :
    """Return the number of lines in the module-level ``readFilename`` text.

    The *file_name* argument is accepted but not used.
    """
    lines = readFilename.splitlines()
    return len(lines)
#----------------------------------------------
def char_count(file_name) :
    """Return the number of characters in ``readFilename``, line breaks excluded.

    The *file_name* argument is accepted but not used.
    """
    return sum(len(row) for row in readFilename.splitlines())
#----------------------------------------------
def list_stopword(file_name) :
    """Return the list of stop words found in the module-level ``readfh`` text.

    The *file_name* argument is accepted but not used.  A stop word is a
    maximal run of characters from a-z / 0-9.
    """
    eng = "abcdefghijklmnopqrstuvwxyz0123456789"
    list_txt = []
    txt = ''
    for i in readfh :
        if i in eng:
            txt += i
        elif txt != '' :
            list_txt.append(txt)
            txt = ''
    # Flush the last run so a file without a trailing newline keeps its
    # final stop word.
    if txt != '' :
        list_txt.append(txt)
    return list_txt
#----------------------------------------------
def bow_N(new_word) :
    """Sort *new_word* in place and return ``[word, count]`` pairs in sorted order."""
    new_word.sort()
    seen = []
    pairs = []
    for item in new_word:
        if item in seen :
            continue
        seen.append(item)
        pairs.append([item, new_word.count(item)])
    return pairs
#----------------------------------------------
def bow_Y(new_word) :
    """Return ``[hash, count]`` pairs for *new_word*, ordered by hash value.

    Each word is hashed with ``fhash(word, M)`` using the module-level
    ``M``.  The hashes are sorted as integers; the previous version sorted
    their string form, which put 10 before 2.  *new_word* is not modified
    (the earlier in-place sort had no effect on the result).
    """
    hashes = sorted(fhash(w, M) for w in new_word)
    listBowY = []
    for h in hashes :
        # Equal hashes are adjacent after sorting, so extend the last pair.
        if listBowY and listBowY[-1][0] == h :
            listBowY[-1][1] += 1
        else :
            listBowY.append([h, 1])
    return listBowY
#-----------------------------------
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo *M*."""
    total = 0
    weight = 1  # 37**i for the current position
    for ch in w :
        total += ord(ch) * weight
        weight *= 37
    return total % M

#----------------main----------------

file_name = open(input('File_name = '),"r")
fh = open('stopwords.txt','r')
ufh = input('Use feature hashing ? (y,Y,n,N) ')
# Exact answers only.  With the old string 'yYnN' the membership test was a
# substring check, so an empty answer was accepted and BoW was never set.
yesno = ('y', 'Y', 'n', 'N')
while ufh not in yesno :
    print('Try again.')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
# The helper functions below read these module-level strings rather than
# their file_name argument.
readFilename = file_name.read().lower()
word,alpha = wordAlpha_count(file_name)
line = line_count(file_name)
char = char_count(file_name)
readfh = fh.read().lower()
stopword = list_stopword(file_name)
new_word = [r for r in word if r not in stopword]
if ufh == 'n' or ufh == 'N' :
    BoW = bow_N(new_word)
else :
    M = int(input('M = '))
    BoW = bow_Y(new_word)
#---------------print----------------
print('-------------------')
print('char count =',char)
print('alphanumeric count =',alpha)
print('line count =',line)
print('word count =',len(word))
print('BoW =',BoW)
#-------------close-----------------
file_name.close()
fh.close()
# 6330172721 (26.00) 32 (2021-03-20 18:42)
def fhash(word,m) :
    """Return sum(ord(word[i]) * 37**i) modulo *m*."""
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(word)) % m
def bow(_lst, m) :                  # have fhash
    """Return ``[hash, count]`` pairs for the words in *_lst*.

    Each word is placed in bucket ``fhash(word, m)``; only non-empty
    buckets are reported, in increasing bucket order.
    """
    buckets = [0 for _ in range(m)]
    for ele in _lst :
        buckets[fhash(ele,m)] += 1
    return [[idx, hits] for idx, hits in enumerate(buckets) if hits != 0]
def bow_wrd(_lst) :                 # no fhash
    """Return sorted ``[word, count]`` pairs for the words in *_lst*."""
    distinct = []
    for ele in _lst :
        if ele in distinct :
            continue
        distinct.append(ele)
    return sorted([ele, _lst.count(ele)] for ele in distinct)
            

file_name = input('File name = ')
wrd_lst = []
alpnm_cnt = 0
ch_cnt = 0
lne_cnt = 0
wrd_cnt = 0
with open(file_name, "r") as f :
    for line in f :
        lne = ''
        # Count the characters without the line terminator.  `len(line)-1`
        # was one short whenever the last line had no trailing newline.
        ch_cnt += len(line.rstrip('\n'))
        lne_cnt += 1
        for ch in line.lower() :
            if ch in 'abcdefghijklmnopqrstuvwxyz0123456789' :
                lne += ch
                alpnm_cnt += 1
            else :
                lne += ' '
        wrd_lst += lne.split()
wrd_cnt = len(wrd_lst)

stp_lst = []                        # filter stopwords
with open("stopwords.txt","r") as stp_wrd :
    for line in stp_wrd :
        stp_lst += line.lower().split()
wrd_lst = [wrd for wrd in wrd_lst if wrd not in stp_lst]

# Keep asking until the answer is one of y, Y, n, N.
while True :
    alter = input('Use feature hashing ? (y,Y,n,N) ')
    if alter == 'n' or alter == 'N' :
        alter_tmp = 0
        break
    elif alter == 'y' or alter == 'Y' :
        alter_tmp = 1
        M = int(input('M = '))
        break
    print("Try again.")
print('-------------------')
print("char count =",ch_cnt)
print("alphanumeric count =",alpnm_cnt)
print("line count =",lne_cnt)
print("word count =",wrd_cnt)
if alter_tmp  :
    Bw = bow(wrd_lst,M)
else :
    Bw = bow_wrd(wrd_lst)
print("BoW =",Bw)



# 6330173321 (22.20) 33 (2021-03-21 18:57)

def remove(t):
    """Return *t* with the characters " ' / \\ ( ) . , ; : - > deleted."""
    return ''.join(e for e in t if e not in "\"\'/\\().,;:->")
def fhash(t,m):
    """Return sum(ord(t[i]) * 37**i) modulo int(m).

    The previous version summed the plain character codes and left out the
    positional weight 37**i, so anagrams always collided and the values
    differed from the other fhash implementations in this file.
    """
    total = 0
    for i, ch in enumerate(t):
        total += ord(ch) * (37 ** i)
    return total % int(m)
        
file_name = input("File name = ")
use_hash = input("Use feature hashing ? (y,Y,n,N) ")
k = ['y', 'Y', 'n', 'N']    # every accepted answer
r = ['y' , 'Y']             # the answers that turn hashing on
while use_hash not in k :
    print("Try again.")
    use_hash = input("Use feature hashing ? (y,Y,n,N) ")
if use_hash in r:
    m = int(input("M = "))
get = []     # lines of the input file without the trailing newline
stp = []     # lines of the stop-word file without the trailing newline
chcount = 0
file1 = open("stopwords.txt", "r")
file2 = open(file_name, "r")
for line in file1 :
    if "\n" in line:
        stp.append(line[0:-1:])
    else :
        stp.append(line)
for line in file2 :
    if "\n" in line:
        get.append(line[0:-1:])
    else :
        get.append(line)
file1.close()
file2.close()
print("-------------------")
alp = 0
linecount = len(get)
for i in get:
    chcount += len(i)
print("char count =",chcount)
word = []        # all words of the input file
stopword = []    # all stop words
remain = []      # words that are not stop words
BoW = []         # [word, count] pairs
BoW2 = []        # [hash, count] pairs
wa = []          # words already present in BoW
hashx = []       # [hash, count] per distinct word, before merging
h2 = []          # hashes already present in BoW2
# Words are the whitespace-separated tokens left after remove() and lower().
for i in get:
    i = remove(i)
    i = i.lower()
    x = i.split()
    word += x
# The alphanumeric count is the total length of those tokens.
for i in word:
    alp += len(i)
print("alphanumeric count =",alp)
print("line count =", linecount)
# Stop words go through remove() but are not lower-cased here.
for i in stp:
    i = remove(i)
    x = i.split()
    stopword += x
for i in word:
    if i not in stopword:
        remain.append(i)
print("word count =",len(word))
n = 0
# After sorting, equal words are adjacent, so a repeated word always
# belongs to the last BoW entry.
remain.sort()
for i in remain :
    if i not in wa:
        wa.append(i)
        BoW.append([i,1])
    else:
        fix = BoW[-1]
        t = int(fix[1])
        BoW.remove(BoW[-1])
        BoW.append([i,t+1])
if use_hash in r :
    # Hash each distinct word, sort by hash and merge equal hashes the same
    # way as above.  Note that this loop rebinds the name `word`.
    for [word, num] in BoW:
        hashx.append([fhash(word,m),num])
    hashx.sort()
    for [a,b] in hashx:
        if a not in h2:
            h2.append(a)
            BoW2.append([a,b])
        else:
            fix = BoW2[-1]
            t = int(fix[1])
            BoW2.remove(BoW2[-1])
            BoW2.append([a,t+b])    
if use_hash not in r:
    print("BoW =",BoW)
elif use_hash in r :
    print("BoW =",BoW2)


# 6330174021 (18.43) 34 (2021-03-18 14:47)

def is_alpha(n):
    """Return True when the character *n* is an ASCII letter or digit.

    The previous version probed with int() inside a bare ``except``; int()
    also accepts non-ASCII decimal digits, and the bare except hid every
    other error.  Plain range comparisons avoid both problems.
    """
    return '0' <= n <= '9' or 'a' <= n <= 'z' or 'A' <= n <= 'Z'
def stopword():
    """Return the whitespace-separated words of 'stopwords.txt' as a list."""
    f = open("stopwords.txt", 'r')
    stp = []
    for line in f:
        stp.extend(line.strip().split())
    f.close()
    return stp
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo *M*."""
    acc = 0
    # Horner evaluation from the last character back to the first.
    for ch in reversed(w):
        acc = acc * 37 + ord(ch)
    return acc % M
def display(c, a, l, w, BoW):
    """Print the report: separator, the four counts and the bag of words."""
    print('-' * 19)
    print('char count', c, sep=' = ')
    print('alphanumeric count', a, sep=' = ')
    print('line count', l, sep=' = ')
    print('word count', w, sep=' = ')
    print('BoW', BoW, sep=' = ')


stp = stopword()

file_name = input("File name = ")
fh = input("Use feature hashing ? (y,Y,n,N) ")
while fh not in ('y', 'Y', 'n', 'N'):
    print("Try again.")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
if fh in ('Y', 'y'):
    M = int(input("M = "))

n_cha = 0
n_alnum = 0
n_line = 0
words = []
BoW = []

with open(file_name.strip(), 'r') as f:
    for line in f:
        # Count the characters without the line terminator; `len(line) - 1`
        # was one short when the last line had no trailing newline.
        n_cha += len(line.rstrip('\n'))
        n_line += 1
        line = line.strip()
        word = ""
        for c in line:
            if is_alpha(c):
                n_alnum += 1
                word += c
            elif word != "":
                words.append(word)
                word = ""
        # Flush the word that runs up to the end of the line; it used to be
        # dropped, so "hello world" produced only "hello".
        if word != "":
            words.append(word)

n_words = len(words)
words = [word.lower() for word in words if word.lower() not in stp]
if fh in ('Y', 'y'): words = [fhash(word, M) for word in words]

# One [item, count] pair per distinct item, sorted.
for word in words:
    mem = [word, words.count(word)]
    if mem not in BoW:
        BoW.append(mem)
BoW.sort()


display(n_cha, n_alnum, n_line, n_words, BoW)
# 6330176221 (19.05) 35 (2021-03-21 17:46)
#Prog_08: Bag-of-words
#6330176221 (19.05) Natthawut Sapwatthanaphaisan
def flas_h(w, M) :
    """Return sum(ord(w[i]) * 37**i) modulo *M*."""
    total = sum(ord(ch) * 37 ** idx for idx, ch in enumerate(w))
    return total % M
def strr_1(line) :
    """Return *line* with each listed punctuation character replaced by a space."""
    return ''.join(' ' if e in "\"\'/\\,.:;()?!#><-_~" else e for e in line)
def stop_words() :
    """Return the words of the input file that are not stop words.

    Stop words are the whitespace-separated tokens of 'stopwords.txt'.  The
    input file, named by the module-level ``inn1``, is lower-cased, cleaned
    with strr_1() and split on whitespace.
    """
    stop = open('stopwords.txt', 'r')
    s = [token for row in stop for token in row.strip().split()]
    file_name = open(inn1, 'r')
    text = file_name.read().lower()
    Bow = [token for token in strr_1(text).split() if token not in s]
    file_name.close()
    stop.close()
    return Bow
def BO_W(Bow) :
    """Return ``[item, count]`` pairs for the items of *Bow*.

    The output order is unchanged from the previous version: items that
    occur exactly once come first, in order of appearance, followed by the
    repeated items in order of first appearance.  An empty *Bow* now gives
    an empty list instead of raising IndexError.
    """
    counts = {}
    for item in Bow :
        counts[item] = counts.get(item, 0) + 1
    singles = [[item, 1] for item in Bow if counts[item] == 1]
    # dict keeps insertion order, i.e. order of first appearance.
    repeated = [[item, n] for item, n in counts.items() if n > 1]
    return singles + repeated
def prin_t() :
    """Print the separator and the four module-level counters."""
    print('-' * 19)
    print('char count', char_count, sep=' = ')
    print('alphanumeric count', alphanumeric_count, sep=' = ')
    print('line count', line_count, sep=' = ')
    print('word count', word_count, sep=' = ')

#-------------------------------------
inn1 = input('File name = ')
file_name = open(inn1, 'r')
word = []
# Starts at 1 and adds len(line)-1 per line.
# NOTE(review): presumably assumes the last line has no trailing newline - confirm.
char_count = 1
line_count = 0
for line in file_name :
    char_count += len(line)-1 
    strr = strr_1(line)
    word += strr.split()
    line_count += 1
# Total length of the tokens left after strr_1() and split().
alphanumeric_count = len(''.join(word))
word_count = len(word)
inn2 = input('Use feature hashing ? (y,Y,n,N) ')
while inn2 != 'y' and inn2 != 'Y' and inn2 != 'n' and inn2 != 'N' :
    print('Try again.')
    inn2 = input('Use feature hashing ? (y,Y,n,N) ')
if inn2 == 'y' or inn2 == 'Y' :
    M = int(input('M = '))
    prin_t()
    # stop_words() re-reads the input file and returns its non-stop words.
    bow = stop_words()
    Bow = []
    for i in bow :
        i = flas_h(i, M)
        Bow.append(i)
    BoW = BO_W(Bow)
    print('BoW =', BoW)        

if inn2 == 'n' or inn2 == 'N' :
    prin_t()
    Bow = stop_words()
    BoW = BO_W(Bow)
    print('BoW =', BoW)
    
file_name.close()
# 6330177921 (30.00) 36 (2021-03-21 21:22)
file_name = input('File name = ')
use = input('Use feature hashing ? (y,Y,n,N) ')
# Both files are opened here and closed at the very end of the script.
fn = open(file_name,'r')
stw = open('stopwords.txt','r')
# Working buffers; the *_stw names hold the same data for the stop-word file.
char = []; char_stw = []      # lower-cased characters, newlines excluded
charw = []; charw_stw = []    # same characters plus one ' ' after every line
word = ''; word_stw = ''      # cleaned text, later replaced by the word list
alnum = []                    # alphanumeric characters of the input file
line_c = 0                    # line count
same = []; bow = []; c = 1    # non-stop words, result pairs, run length
sbo = []                      # hash values of the non-stop words
def fhash(a,m):
    """Return sum(ord(a[i]) * 37**i) modulo *m*."""
    total = 0
    for pos, ch in enumerate(a):
        total += ord(ch) * (37 ** pos)
    return total % m

# Keep asking until the answer is exactly one of n, N, y, Y.
while use not in ['n','N','y','Y']:
    print('Try again.')
    use = input('Use feature hashing ? (y,Y,n,N) ')

# The two answers used to be handled by two almost identical copies of the
# code below; the only real differences were the M prompt, the hashing step
# and a leftover debug print of the raw character list (now removed).
hashing = use in ['y','Y']
if hashing:
    m = int(input('M = '))
print('-------------------')
alnum_chars = 'abcdefghijklmnopqrstuvwxyz0123456789'

#char count
for line in fn:
    for e in line:
        if e != '\n':
            char.append(e.lower())
            charw.append(e.lower())
    # A space after every line keeps words of adjacent lines apart.
    charw.append(' ')
    line_c += 1
print('char count =',len(char))

#alphanumeric count
for e in char:
    if e in alnum_chars:
        alnum.append(e)
print('alphanumeric count =',len(alnum))

print('line count =',line_c)

#word count
for e in charw:
    word += e if e in alnum_chars else ' '
word = word.split()
print('word count =',len(word))

#stop words: tokenised the same way as the input file
for line in stw:
    for e in line:
        if e != '\n':
            charw_stw.append(e.lower())
    charw_stw.append(' ')
for e in charw_stw:
    word_stw += e if e in alnum_chars else ' '
word_stw = word_stw.split()

#BoW
for e in word:
    if e not in word_stw:
        same.append(e)
same.sort()
# Count runs of equal keys; the keys are sorted, so equal keys are adjacent.
keys = sorted(fhash(e,m) for e in same) if hashing else same
for key in keys:
    if bow and bow[-1][0] == key:
        bow[-1][1] += 1
    else:
        bow.append([key,1])
print('BoW =',bow)
fn.close()
stw.close()

    
    

        
    
# 6330178521 (24.35) 37 (2021-03-21 17:43)

file_name = input('File name = ')
fha = input('Use feature hashing ? (y,Y,n,N) ')
while not fha in ['y','Y','n','N']:
    print('Try again.')
    fha = input('Use feature hashing ? (y,Y,n,N) ')

linecount = 0
char = 0
alpha = 0
x = ''    # whole text with every non-alphanumeric character turned into ' '
with open(file_name,'r') as file:
    for line in file:
        linecount += 1
        for ch in line:
            # Letters are tested as two separate ranges: the single range
            # 'A' <= ch <= 'z' also matched [ \ ] ^ _ and the backquote.
            is_alnum = '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
            if ch != '\n':
                char += 1
                if is_alnum:
                    alpha += 1
            x += ch if is_alnum else ' '
# Computed once after the loop so that both names exist for an empty file.
words = x.lower().strip().split()
wordcount = len(words)

# Stop words, lower-cased and whitespace separated.
r = []
with open('stopwords.txt','r') as file:
    for line in file:
        r.extend(line.lower().strip().split())
        
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M).

    *M* may be an int or a numeric string.  The modulo is taken once after
    the loop; taking it inside the loop left the result variable unbound
    (UnboundLocalError) for an empty word.
    """
    r = 0
    for i in range(len(w)):
        r += ord(w[i])*(37**i)
    return r % int(M)
# Words of the input file that are not stop words (r holds the stop words).
word = []
for i in range(len(words)):
        if not words[i] in r:
            word.append(words[i])
            
            

if fha == 'n' or fha == 'N':
    print('-------------------')
    print('char count =', char)
    print('alphanumeric count =', alpha)
    print('line count =', linecount)
    print('word count =', wordcount)     
    z = []    # distinct words in order of first appearance
    b = []    # count for the word at the same index in z
    p = []    # resulting [word, count] pairs
    for i in range(len(word)):
        if word[i] in z:
            h = z.index(word[i])
            b[h] += 1
        else:
            z.append(word[i])
            b.append(1)
    for i in range(len(z)):
        p.append([z[i],b[i]])
    print('BoW =', p)
else:
    # M stays a string here; fhash() converts it with int().
    M = input('M = ')
    print('-------------------')
    print('char count =', char)
    print('alphanumeric count =', alpha)
    print('line count =', linecount)
    print('word count =', wordcount)
    
    
    z2 = []    # distinct hash values in order of first appearance
    b2 = []    # count for the hash at the same index in z2
    p2 = []    # resulting [hash, count] pairs
    for i in range(len(word)):
        if fhash(word[i],M) in z2:
            h = z2.index(fhash(word[i],M))
            b2[h] += 1
        else:
            z2.append(fhash(word[i],M))
            b2.append(1)
    for i in range(len(z2)):
        p2.append([z2[i],b2[i]])       
    print('BoW =', p2)
# 6330179121 (17.85) 38 (2021-03-21 17:08)
# Path of the text file to analyse; it is opened further down the script.
x = str(input('File name = '))
def yes() :
    """Ask for M, then print the report with a hashed bag of words.

    Reads the module-level ``file_name`` (the list of input lines).
    Returns None.
    """
    m = int(input('M = '))
    print('-------------------')
    for label, counter in (
        ('char count =', char_count),
        ('alphanumeric count =', alphanumeric_count),
        ('line count =', line_count),
        ('word count =', word_count),
    ) :
        print(label, counter(file_name))
    print('BoW =', byes(file_name, m))
def no() :
    """Print the report with a plain (unhashed) bag of words.

    Reads the module-level ``file_name`` (the list of input lines).
    Returns None.
    """
    print('-------------------')
    for label, counter in (
        ('char count =', char_count),
        ('alphanumeric count =', alphanumeric_count),
        ('line count =', line_count),
        ('word count =', word_count),
    ) :
        print(label, counter(file_name))
    print('BoW =', bno(file_name))
def open_stop() :
    """Return the whitespace-separated words of 'stopwords.txt' as a list."""
    file = open('stopwords.txt','r')
    tokens = file.read().split()
    file.close()
    return list(tokens)
def word_count(file_name) :
    """Return the number of whitespace-separated tokens in the lines *file_name*."""
    return sum(len(row.split()) for row in file_name)
def char_count(file_name) :
    """Return the total length of the strings in *file_name*."""
    return sum(len(row) for row in file_name)
def line_count(file_name) :
    """Return the number of entries (lines) in *file_name*."""
    return len(file_name)
def alphanumeric_count(file_name) :
    """Return how many characters of the lines *file_name* are in a-z / 0-9.

    Matching is case-insensitive.  Two defects of the previous version are
    fixed: its alphabet string had no 'n' (an 'i' was typed twice), and a
    run of alphanumerics at the very end of the text was never added.
    """
    c = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return sum(1 for line in file_name for ch in line.lower() if ch in c)
def byes(file_name,m) :
    """Return sorted ``[hash, count]`` pairs for the lines *file_name*.

    Words are lower-cased runs of a-z / 0-9; words returned by open_stop()
    are skipped and the rest are hashed with ``fhash(word, m)``.

    Fixes over the previous version: the alphabet string had no 'n', lines
    were joined without a separator (gluing the last word of a line to the
    first word of the next), and a word at the very end was dropped.
    """
    c = 'abcdefghijklmnopqrstuvwxyz0123456789'
    o = open_stop()
    counts = {}
    for line in file_name :
        # Processing line by line keeps words of adjacent lines apart.
        cleaned = ''.join(ch if ch in c else ' ' for ch in line.lower())
        for w in cleaned.split() :
            if w in o :
                continue
            h = fhash(w, m)
            counts[h] = counts.get(h, 0) + 1
    return sorted([h, n] for h, n in counts.items())
def bno(file_name) :
    """Return sorted ``[word, count]`` pairs for the lines *file_name*.

    Words are lower-cased runs of a-z / 0-9; words returned by open_stop()
    are skipped.

    Fixes over the previous version: the alphabet string had no 'n', lines
    were joined without a separator (gluing the last word of a line to the
    first word of the next), and a word at the very end was dropped.
    """
    c = 'abcdefghijklmnopqrstuvwxyz0123456789'
    o = open_stop()
    counts = {}
    for line in file_name :
        # Processing line by line keeps words of adjacent lines apart.
        cleaned = ''.join(ch if ch in c else ' ' for ch in line.lower())
        for w in cleaned.split() :
            if w in o :
                continue
            counts[w] = counts.get(w, 0) + 1
    return sorted([w, n] for w, n in counts.items())
def fhash(word,m) :
    """Return sum(ord(word[i]) * 37**i) modulo *m*."""
    g = 37
    acc = 0
    # Horner evaluation from the last character back to the first.
    for ch in reversed(word) :
        acc = acc * g + ord(ch)
    return acc % m

# Despite its name, file_name holds the list of lines of the input file;
# the helper functions above read it as a module-level variable.
file_name = []
file = open(x,'r')
# Splitting on '\n' keeps an empty last entry when the text ends with a newline.
line = file.read().split('\n')
file_name += line
file.close()
# i is the loop flag: '1' means "ask again", '0' means "done".
i = '1'
while i == '1' :
    has = str(input('Use feature hashing ? (y,Y,n,N) '))
    if has == 'Y' or has == 'y' :
        ans = yes()
        i = '0'
    elif has == 'N' or has == 'n' :
        ans = no()
        i = '0'
    else :
        print('Try again')
        i = '1'
# 6330180721 (30.00) 39 (2021-03-18 17:39)
#Prog-08: Bag-of-words
#6330180721 (30.00) Nichakul Pichitwutikorn
def fhash(w,m):
    """Return sum(ord(w[i]) * 37**i) modulo *m*."""
    total = 0
    weight = 1  # 37**i for the current position
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % m
def num(lis,word):
    """Return how many items of *lis* are equal to *word*."""
    return sum(1 for t in lis if t == word)
def cut_repeat(listt):
    """Return the items of listt without duplicates, in first-seen order.

    Works for unhashable items (e.g. [key, count] lists) because it relies
    on equality tests rather than hashing.
    """
    unique = []
    for item in listt:
        if item in unique:
            continue
        unique.append(item)
    return unique

# Interactive part: ask for the file and whether to hash the words.
file_name = input('File name = ')
h = input('Use feature hashing ? (y,Y,n,N) ')
while h not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    h = input('Use feature hashing ? (y,Y,n,N) ')
if h in 'yY':
    m= input('M = ')

# Scan the text once. al counts ASCII letters/digits, char_al counts every
# other character except newlines, l counts lines. sen is the text with all
# non-alphanumeric characters replaced by spaces.
alnum_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
char_al = 0; al = 0;l = 0
pieces = []
with open(file_name,'r') as book:
    for line in book:
        l+=1
        for i in line:
            if i in alnum_chars:
                al+=1
                pieces.append(i)
            else:
                # Newlines are separators but are not part of the char
                # count; counting them and then subtracting "l - 1" gave 1
                # for an empty file and one too many when the file ended
                # with a newline.
                if i != '\n':
                    char_al+=1
                pieces.append(' ')
sen = ''.join(pieces).lower().split()

with open('stopwords.txt','r') as stop:
    st = stop.read().lower().split()

# Words that survive stop-word removal.
bow = [p for p in sen if p not in st]

print('-------------------')
print('char count =',al+char_al)
print('alphanumeric count =',al)
print('line count =',l)
print('word count =',len(sen))
if h in 'yY':
    modulus = int(m)
    ans = [fhash(j,modulus) for j in bow]
    rrr = cut_repeat([[q,num(ans,q)] for q in ans])
else:
    rrr = cut_repeat([[j,num(bow,j)] for j in bow])
rrr.sort()
print('BoW =',rrr)
# 6330181321 (21.05) 40 (2021-03-22 23:49)

# Ask for the input file, then keep asking about feature hashing until the
# answer is one of y, Y, n, N.
file_name = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ')
while not (x == 'y' or x == 'Y' or x == 'n' or x == 'N'):
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
def fhash(w,m):
    """Feature hashing: return the base-37 polynomial hash of w modulo m.

    The character at index i contributes ord(char) * 37**i. An empty word
    hashes to 0.
    """
    ans=0
    for i in range(len(w)):
        ans+=ord(w[i])*37**i
    # Reduce once after the loop; doing it inside the loop left the result
    # unbound for an empty word.
    return ans%m
if x=='y' or x=='Y':
    m=int(input('M = '))

# Item 3: load the stop words (whitespace separated, any number per line).
with open('stopwords.txt','r') as stop:
    s1=stop.read().split()

# Item 4: scan the input file once, character by character.
separators=[' ',',', '"', "'", '-', '_', '=', '.', '(', ')', '>', '<', ';', ':']
t=''    # every character except newlines (for the char count)
al=''   # every character that is neither a newline nor a separator
w=''    # the text with newlines and separators turned into spaces
n=0     # number of lines
with open(file_name,'r') as f:
    for line in f:
        n+=1
        for i in line:
            if i!='\n':
                t+=i
            if i=='\n' or i in separators:
                w+=' '
            else:
                al+=i
                w+=i

lww=w.lower().split()
wlww=[word for word in lww if word not in s1]
# Count every remaining word, keeping first-appearance order.
counts={}
for word in wlww:
    counts[word]=counts.get(word,0)+1
answ=[[word,count] for word,count in counts.items()]

print('-------------------')
print('char count =',len(t))
print('alphanumeric count =',len(al))
print('line count =',n)
print('word count =',len(w.split()))
if x=='n' or x=='N':
    print('BoW =',answ)
if x=='y' or x=='Y':
    # Words that collide on the same hash value have their counts summed.
    # Building the result from a dict also handles an empty bag, where the
    # previous answw[-1] lookup raised IndexError.
    merged={}
    for word,count in answ:
        hashed=fhash(word,m)
        merged[hashed]=merged.get(hashed,0)+count
    answww=[[hashed,merged[hashed]] for hashed in sorted(merged)]
    print('BoW =',answww)
        
# 6330182021 (14.55) 41 (2021-03-22 00:09)
file_name = input('File name = ',)
x=input('Use feature hashing ? (y,Y,n,N) ',)
# Keep asking until the answer is valid; previously 'Try again.' was printed
# once at the very end and the program stopped without asking again.
while x not in ('y','Y','n','N'):
    print('Try again.')
    x=input('Use feature hashing ? (y,Y,n,N) ',)

# li is the whole text with newlines removed, c is the number of lines.
c=0
li=''
with open(file_name, 'r') as read_file:
    for l in read_file:
        c+=1
        li+=l.replace('\n','')

with open('stopwords.txt','r') as st:
    sw=st.read().split()

# Characters treated as word separators (and left out of the alphanumeric
# count).
separators=' "\',.!?/:;'
ao=''.join(e for e in li if e not in separators)
ww=''.join(' ' if e in separators else e for e in li)
wc=ww.lower().split()

# t: words without stop words; q: the same words, each listed once.
t=[e for e in wc if e not in sw]
q=[]
for e in t:
    if e not in q:
        q.append(e)
n=[[e,t.count(e)] for e in q]

use_hash = x=='y' or x=='Y'
if use_hash:
    M = int(input('M = ',))
print('char count =',len(li))
print('alphanumeric count =',len(ao))
print('line count =',c)
print('word count =',len(wc))
if use_hash:
    G=37
    # Hash every word: sum of ord(char) * G**position, modulo M. The value
    # is kept in hashed_word so the name fhash is not rebound to an int.
    yn=[]
    for word in t:
        hashed_word=sum(ord(e)*(G**nn) for nn,e in enumerate(word))
        yn.append(hashed_word%M)
    y=[]
    for value in yn:
        if value not in y:
            y.append(value)
    yy=[[value,yn.count(value)] for value in y]
    print('BoW =',yy)
else:
    print('BoW =',n)
# 6330183621 (13.00) 42 (2021-03-21 15:07)
#-------------------------------------------------
def fhash(word,M):
    """Return the base-37 polynomial hash of word, reduced modulo M."""
    total = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(word))
    return total % M

#-------------------------------------------------
def remove_stopword(list_of_data_lower):
    """Return the words of list_of_data_lower that are not stop words.

    The stop words are read from 'stopwords.txt' in the current directory
    (whitespace separated, lower-cased before comparison).
    """
    handle = open('stopwords.txt')
    stop_words = []
    for raw_line in handle:
        stop_words.extend(raw_line.lower().split())
    kept = [word for word in list_of_data_lower if word not in stop_words]
    handle.close()

    return kept  # list of data without stop words

#-------------------------------------------------
def check(a):
    """Return a copy of a with every listed punctuation mark replaced by a space."""
    punctuation = ['/','\\','"',"'",'(',')','-','.','>','<',';',':',',']
    return ''.join(' ' if ch in punctuation else ch for ch in a)

#-------------------------------------------------
def char_count(file_name):
    """Return the total length of all lines of the file after strip()."""
    data = open(file_name)
    total = sum(len(line.strip()) for line in data)
    data.close()

    return total

#-------------------------------------------------
def alphanum_count(file_name):
    """Count the characters of the file that are not in the excluded set.

    Each line is stripped first; the excluded set holds the space and the
    punctuation marks listed below.
    """
    excluded = [',','.',"'",'"',':',';','[',']','{','}','\\','/','-','_','=','*','^','!',' ']
    data = open(file_name)
    text = ''
    for line in data:
        text += line.strip()
    kept = [ch for ch in text if ch not in excluded]
    data.close()

    return len(kept)

#-------------------------------------------------
def line_count(file_name):
    """Return the number of lines in the file."""
    data = open(file_name)
    # A line yielded by the file iterator is never empty, so this counts
    # every line.
    n = sum(1 for line in data if line)
    data.close()

    return n

#-------------------------------------------------
def word_count(file_name):
    """Return the number of words in the file named file_name.

    Lines are stripped and concatenated, punctuation is replaced by spaces
    through check(), and the result is split on whitespace.
    """
    # Open the requested file; a hard-coded 'sample.txt' was opened here
    # before, so the argument was ignored.
    with open(file_name) as data:
        a=''
        for line in data:
            a+=line.strip()
    list_of_word=check(a).split()

    return len(list_of_word)

#-------------------------------------------------
def bow(file_name,M):
    """Return the sorted bag-of-words of the file as [key, count] pairs.

    Lines are stripped, lower-cased and concatenated; stop words are dropped
    with remove_stopword() and punctuation is replaced through check(). When
    M is not None each word is replaced by fhash(word, M) before counting.
    """
    data = open(file_name)
    text = ''
    for line in data:
        text += line.strip().lower()
    without_stop = remove_stopword(text.split())
    tokens = check(' '.join(without_stop)).split()
    if M != None:
        tokens = [fhash(token, M) for token in tokens]

    distinct = []  # each token once, in first-seen order
    for token in tokens:
        if token not in distinct:
            distinct.append(token)
    data.close()

    pairs = [[token, tokens.count(token)] for token in distinct]
    pairs.sort()

    return pairs
    
#-------------------------------------------------
    
def run():
    """Print the separator line and the four statistics of the global file_name."""
    print('-------------------')
    print(f'char count = {char_count(file_name)}')
    print(f'alphanumeric count = {alphanum_count(file_name)}')
    print(f'line count = {line_count(file_name)}')
    print(f'word count = {word_count(file_name)}')
    
#-------------------------------------------------    

# Ask for the file, then loop until the hashing question gets a y/n answer
# (case-insensitive). again is the loop flag: 0 = ask again, 1 = done.
file_name = input( 'File name = ')
again=0
while again==0:
    yes_or_no=input( 'Use feature hashing ? (y,Y,n,N) ' )
    choice=yes_or_no.lower()
    if choice != 'y' and choice != 'n':
        again=0
        print('Try again.')
        continue
    again=1
    # M stays None when hashing is not wanted.
    M=int(input('M = ')) if choice == 'y' else None
    run()
    print('BoW =',bow(file_name,M))

# 6330184221 (17.45) 43 (2021-03-22 19:29)

def fhash(w,M) :
    """Return the hashed bag-of-words for the list of words w.

    Every non-empty word is hashed as sum(ord(char) * 37**index) % M.

    Args:
        w: list of words (strings).
        M: modulus of the hash.

    Returns:
        A list of [hash, count] pairs sorted by hash. An empty list gives [].
    """
    G = 37
    s = []
    for e in w :
        if not e :
            # Empty words never produced a hash; keep skipping them.
            continue
        a = sum(ord(y) * (G ** n) for n, y in enumerate(e))
        s.append(a % M)
    s.sort()

    # Run-length count over the sorted hashes. This also covers 0 or 1
    # hashes, where s[0] raised IndexError and the trailing s[i] lookup used
    # an unbound loop variable.
    P = []
    for value in s :
        if P and P[-1][0] == value :
            P[-1][1] += 1
        else :
            P.append([value, 1])
    return P
def nhash(a) :
    """Return the plain bag-of-words of the word list a.

    Args:
        a: list of words; it is sorted in place, as before.

    Returns:
        A list of [word, count] pairs sorted by word. An empty list gives [].
    """
    a.sort()
    # Run-length count over the sorted words. This also covers 0 or 1 words,
    # where a[0] raised IndexError and the trailing a[i] lookup used an
    # unbound loop variable.
    s = []
    for item in a :
        if s and s[-1][0] == item :
            s[-1][1] += 1
        else :
            s.append([item, 1])
    return s

file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('y', 'Y', 'n', 'N') :
    print('Try again')
    fh = input('Use feature hashing ? (y,Y,n,N) ')

if fh == 'y' or fh == 'Y' :
    M = int(input('M = '))
print("-------------------")

# s holds every stripped, lower-cased line prefixed with one space, so the
# char count is its length minus one added space per line.
linecount = 0
s = ''
with open(file_name, 'r') as fn :
    for line in fn :
        s += ' '+line.strip().lower()
        linecount += 1
print('char count = '+str(len(s)-linecount))

# Drop the punctuation marks (they are removed, not replaced by spaces).
word = ''.join(e for e in s if e not in "\"\'/\\().,;:")
data = word.split()

alphanumeric = word.replace(' ', '')
print('alphanumeric count = '+str(len(alphanumeric)))
print('line count = '+str(linecount))
print('word count = '+str(len(data)))

# Stop words are compared as whole words. They used to be concatenated into
# one string, so "e in stopwords" was a substring test that also removed any
# word contained in a longer stop word. The file is closed by the with-block
# (the old "sw.close" lacked the call parentheses).
with open('stopwords.txt','r') as sw :
    stopwords = sw.read().lower().split()
BoW = [e for e in data if e not in stopwords]

if fh == 'y' or fh == 'Y' :
    print('BoW = '+str(fhash(BoW,M)))
else :
    # Same 'BoW = ' label as the hashing branch; it was missing here.
    print('BoW = '+str(nhash(BoW)))

# 6330185921 (8.35) 44 (2021-03-22 23:08)

# Ask for the path of the text to analyse and open it for reading.
# Note: file_name is the open file object, Filename is the path string.
Filename = input("File name = ")
file_name = open(Filename, "r")
def removelist(the_list, val):
   """Return a new list holding the items of the_list that differ from val."""
   kept = []
   for item in the_list:
      if item != val:
         kept.append(item)
   return kept

use = input("Use feature hashing ? (y,Y,n,N) ")
while use not in ('Y', 'y', 'N', 'n'):
    print('Try again.')
    use = input("Use feature hashing ? (y,Y,n,N) ")
# From here on use is a mode flag: 1 = feature hashing, 2 = plain words.
if use =='Y' or use == 'y':
    use = 1
    M = input("M = ")
else:
    use = 2

# Read the stop words and the whole text once. Both handles are closed right
# away; the names stay bound because the end of the program closes them
# again, which is harmless.
stops = open('stopwords.txt', "r")
a = stops.read()
stops.close()
b = a.split()
c = file_name.read()
file_name.close()

#1 char count: every character except newlines
print('-------------------')
t = c.replace('\n','')

charcount=int(len(t))
print('char count =',charcount)

#2 alphanumeric count: ASCII letters and digits. The digit test used to be
# '1'..'9', which left out '0'. g is always defined now; before, it only
# existed when the text contained punctuation.
g=''
for e in t:
    if 'a'<=e<='z' or 'A' <= e <= 'Z' or '0'<= e <='9':
        g=g+e
t=g
alphanumericcount=int(len(t))
print('alphanumeric count =',alphanumericcount)

#3 line count: one per newline, plus a last line without a newline
line_count = c.count('\n')
if c and not c.endswith('\n'):
    line_count += 1
print('line count =',line_count)

#4 word count: newlines and punctuation become spaces, then split
# (the backslash is escaped so the literal has no invalid escape sequence)
p='''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
t = c.replace('\n',' ')
for e in p:
    t=t.replace(e,' ')
word=t.split()
wordcount=len(word)
print('word count =',wordcount)

#bow
if use ==2:
    t = (c.replace('\n',' ')).lower()
    for e in p:
        t=t.replace(e,' ')
    t = [w for w in t.split() if w not in b]
    t.sort()

    # Run-length count over the sorted words; this also works for an empty
    # list, where t[-1] raised IndexError.
    bow=[]
    for w in t:
        if bow and bow[-1][0]==w:
            bow[-1][1]+=1
        else:
            bow.append([w,1])
    print('BoW =',bow)

#bow fh
def flash(a,M):
    """Return the base-37 polynomial hash of the word a, modulo M.

    The character at index i contributes ord(char) * 37**i.
    """
    y=0
    for i in range (len(a)):
        y=y+(ord(a[i])*(37**i))
    # The unused local that shadowed the function name was removed.
    return y%M


if use == 1:
    # Re-read the text and the stop words with handles that close themselves.
    with open(Filename, "r") as source:
        c = source.read()
    with open('stopwords.txt', "r") as stop_file:
        stop_words = stop_file.read().split()

    # Newlines and punctuation become spaces; the text is lower-cased.
    t = c.replace('\n',' ').lower()
    p='''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
    for e in p:
        t=t.replace(e,' ')
    t = [w for w in t.split() if w not in stop_words]

    r = sorted(flash(w,int(M)) for w in t)
    # Run-length count over the sorted hashes; this also works for an empty
    # list, where r[-1] raised IndexError.
    bow=[]
    for value in r:
        if bow and bow[-1][0]==value:
            bow[-1][1]+=1
        else:
            bow.append([value,1])
    print('BoW =',bow)

# Close the handles opened by the earlier parts of the program.
stops.close()
file_name.close()
# 6330186521 (22.99) 45 (2021-03-22 23:30)


file_name = input('File name = ')
yesno = input('Use feature hashing ? (y,Y,n,N) ')
while yesno not in ['y','Y','n','N']:
    print('Try again.')
    yesno = input('Use feature hashing ? (y,Y,n,N) ')
if yesno == 'y' or yesno == 'Y':
    M = int(input('M = '))
print('-------------------')

chcount = 0   # characters excluding newlines
alnum = 0     # characters for which str.isalnum() is true
lncount = 0   # lines
wlist=[]      # words: maximal runs of alphanumeric characters
with open(file_name,'r') as fn:
    for line in fn:
        lncount +=1
        w = ''
        for ch in line:
            if ch != '\n':
                chcount += 1
            if ch.isalnum():
                alnum +=1
                w+=ch
            else:
                if w != '':
                    wlist.append(w)
                w=''
        # Flush the word that ends the line: a last line without a trailing
        # newline used to lose its final word.
        if w != '':
            wlist.append(w)
w=''
wcount = len(wlist)

print('char count = ' + str(chcount))
print('alphanumeric count = ' + str(alnum))
print('line count = ' + str(lncount))
print('word count = ' + str(wcount))
def fhash(w, M):
    """Return the base-37 polynomial hash of w, reduced modulo M."""
    G = 37
    total = 0
    for position, ch in enumerate(w):
        total += ord(ch) * G**position
    return total % M

# Collect the distinct lower-cased stop words.
fstop = open('stopwords.txt','r')
stoplist = []
for line in fstop:
    for word in line.strip().split():
        word = word.lower()
        if word in stoplist:
            continue
        stoplist.append(word)
fstop.close()

# Tally every non-stop word (or its hash when hashing was requested), then
# print the tallies as a sorted list of [key, count] pairs.
hashing = yesno == 'y' or yesno == 'Y'
plain = yesno == 'n' or yesno == 'N'
tally = {}
for w in wlist:
    w = w.lower()
    if w in stoplist:
        continue
    if hashing:
        key = int(fhash(w, M))
    elif plain:
        key = w
    else:
        continue
    tally[key] = tally.get(key, 0) + 1
K = [[key, count] for key, count in tally.items()]
K.sort()
print('BoW = ' + str(K))

# 6330187121 (29.00) 46 (2021-03-21 21:20)

#1 and #2-----------------------------------------------------------
file_name = input('File name = ')
fhornot = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against the four allowed answers. The substring test on 'yYnN'
# also accepted '' and e.g. 'yY', which later reached the hashing branch
# with M undefined.
while fhornot not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fhornot = input('Use feature hashing ? (y,Y,n,N) ')
if fhornot.lower() == 'y':
    M = int(input('M = '))
print('-------------------')

#3------------------------------------------------------------------
#list_of_words------------------------------------------------------
def listword(file_name):
    """Return the words of the file as a list.

    ASCII letters are lower-cased, digits are kept, and every other
    character (newlines included) acts as a separator.
    """
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    digits = '0123456789'
    pieces = []
    fn = open(file_name)
    for line in fn:
        for e in line:
            if e in letters:
                pieces.append(e.lower())
            elif e in digits:
                pieces.append(e)
            else:
                pieces.append(' ')
    fn.close()
    return ''.join(pieces).split()

#stopword-----------------------------------------------------------
def stopword(file_name):
    """Return the whitespace-separated words of the stop-word file as a list.

    Args:
        file_name: path of the stop-word file.
    """
    # The with-block closes the file; it was left open before.
    with open(file_name) as fn:
        return fn.read().split()

#removestopword-----------------------------------------------------
def removestopword(file_name):
    """Return the sorted words of the file that are not stop words.

    The words come from listword(file_name) and the stop words from
    stopword('stopwords.txt').
    """
    wordlist = listword(file_name)
    stoplist = stopword('stopwords.txt')
    # Compare whole words. The stop words used to be concatenated into one
    # string, so the test was a substring search that also dropped any word
    # appearing inside or across the joined stop words.
    codelist = [x for x in wordlist if x.lower() not in stoplist]
    codelist.sort()
    return codelist

#4------------------------------------------------------------------
#char_count---------------------------------------------------------
def char_count(file_name):
    """Return the number of characters in the file, newlines excluded.

    Args:
        file_name: path of the text file.
    """
    # Count per line without its newline. The previous "total - lines + 1"
    # shortcut assumed the last line has no newline, so it returned 1 for an
    # empty file and one too many for a file ending with a newline.
    with open(file_name) as fn:
        return sum(len(line.rstrip('\n')) for line in fn)
    
#alphabet_count-----------------------------------------------------
def alphabet_count(file_name):
    """Return how many characters of the file are ASCII letters or digits."""
    total = 0
    fn = open(file_name)
    for line in fn:
        for e in line:
            is_letter = e.lower() in 'abcdefghijklmnopqrstuvwxyz'
            if is_letter or e in '0123456789':
                total += 1
    fn.close()
    return total

#line_count---------------------------------------------------------
def line_count(file_name):
    """Return the number of lines in the file."""
    fn = open(file_name)
    c = sum(1 for _ in fn)
    fn.close()
    return c

#word_count---------------------------------------------------------
def word_count(file_name):
    """Return the number of words listword() finds in the file."""
    return len(listword(file_name))

#BoW----------------------------------------------------------------
def BoWnofh(file_name):
    """Return the bag-of-words of the file without feature hashing.

    Returns:
        A list of [word, count] pairs, one per distinct word, in the sorted
        order delivered by removestopword().
    """
    codelist = removestopword(file_name)
    # The unused joined string, the no-op "[] * n" preallocation and the
    # duplicate bookkeeping list were removed.
    nnn = []
    for m in codelist:
        ann = [m, codelist.count(m)]
        if ann not in nnn:
            nnn.append(ann)

    return nnn

#fh-----------------------------------------------------------------
def fh(word):
    """Return the base-37 polynomial hash of word modulo the global M."""
    fhw = sum(ord(ch) * (37 ** i) for i, ch in enumerate(word))
    return fhw % M

#BoWwithfh----------------------------------------------------------
def BoWwithfh(file_name):
    """Return the bag-of-words of the file using feature hashing.

    Every word kept by removestopword() is hashed with fh().

    Returns:
        A list of [hash, count] pairs sorted by hash.
    """
    codelist = removestopword(file_name)
    ana = sorted(fh(m) for m in codelist)
    # The unused joined string, the no-op "[] * M" expression and the
    # duplicate bookkeeping list were removed.
    nnn = []
    for e in ana:
        ann = [e, ana.count(e)]
        if ann not in nnn:
            nnn.append(ann)

    return nnn

#-------------------------------------------------------------------

# Print the four statistics, then the bag-of-words built by the function
# that matches the user's hashing choice.
for label, counter in (('char count =', char_count),
                       ('alphanumeric count =', alphabet_count),
                       ('line count =', line_count),
                       ('word count =', word_count)):
    print(label, counter(file_name))
bow_builder = BoWnofh if fhornot.lower() == 'n' else BoWwithfh
print('BoW =', bow_builder(file_name))
# 6330188821 (14.00) 47 (2021-03-22 23:18)


file_name = input("File name = ")

BoW = input("feature hashing ? (y,Y,n,N) ")
M = - 1
# Compare against the four allowed answers; the substring test on "nNyY"
# also accepted an empty answer and e.g. "nN".
while BoW not in ("n", "N", "y", "Y"):
    print ("try again")
    BoW = input("feature hashing ? (y,Y,n,N) ")
# From here on BoW is a flag: True = feature hashing, False = plain words.
if BoW in ("Y", "y"):
    M = int(input("M = "))
    BoW = True
else:
    BoW = False
print("-------------------")

# a: distinct lower-cased stop words.
a = []
with open("stopwords.txt" , "r") as stop:
    for line in stop:
        for x in line.strip().split():
            x = x.lower()
            if x not in a:
                a.append(x)

len1 = 0        # characters excluding newlines
len2 = 0        # ASCII letters and digits
linecount = 0
words = []      # maximal runs of ASCII letters/digits

with open(file_name , "r") as file:
    for line in file:
        linecount += 1
        word = ''
        for b in line:
            if b != "\n":
                len1 += 1
            if ("A"<= b <= "Z") or  ("a"<= b <="z") or ("0" <= b <= "9"):
                len2 += 1
                word += b
            else:
                if len(word) != 0:
                    words.append(word)
                word = ""
        # Flush the word that ends the line: a last line without a trailing
        # newline used to lose its final word.
        if len(word) != 0:
            words.append(word)
def get(words, stopWords, isBoW, M):
    """Build the bag-of-words for words.

    Args:
        words: list of words; each is lower-cased before use.
        stopWords: collection of lower-case words to ignore.
        isBoW: True to count feature hashes instead of the words themselves.
        M: modulus of the feature hash (only used when isBoW is true).

    Returns:
        A list of [key, count] pairs in first-appearance order, where key is
        the word or, when hashing, sum(ord(char) * 37**index) % M.
    """
    G = 37
    k = []
    for p in words:
        p = p.lower()
        if p in stopWords:
            continue
        # The flag and the current word are used here; before, the code read
        # the globals BoW and x, and the plain-word branch was unreachable.
        if isBoW:
            key = sum(ord(ch) * (G**i) for i, ch in enumerate(p)) % M
        else:
            key = p
        for entry in k:
            if entry[0] == key:
                entry[1] += 1
                break
        else:
            # No existing entry matched: start a new count.
            k.append([key, 1])
    return k
                
# Report the statistics. Every label now uses the same "name =" form; the
# first lacked the space before "=" and the second lacked the "=" entirely.
print("char count =", len1)
print("alphanumeric count =", len2)
print("line count =", linecount)
print("word count =", len(words))
print("BoW =", get(words, a, BoW, M))











    





# 6330189421 (30.00) 48 (2021-03-22 22:34)
def fhash(w,M):
    """Return the base-37 polynomial hash of w, reduced modulo M."""
    total = 0
    for position, ch in enumerate(w):
        total += ord(ch) * (37 ** position)
    return total % M
n=input("File name = ")
file_name= open(n,"r")
m=input("Use feature hashing ? (y,Y,n,N) ")
while m not in ("y", "Y", "n", "N"):
    print("Try again.")
    m=input("Use feature hashing ? (y,Y,n,N) ")

# Stop words, whitespace separated. The with-blocks close both files, which
# were previously left open.
with open("stopwords.txt","r") as stops:
    st=stops.read().split()

alnum_chars="abcdefghijklmnopqrstuvwxyz0123456789"
count=0    # characters excluding newlines
alpha=0    # ASCII letters and digits
lcount=0   # lines
sti=""     # lower-cased text with every other character turned into a space
with file_name:
    for line in file_name:
        for i in line.lower():
            if i != "\n":
                count+=1
            if i in alnum_chars:
                alpha+=1
                sti+=i
            else:
                sti+=" "
        lcount+=1
wcount=len(sti.split())
word=[i for i in sti.split() if i not in st]

use_hash = m == "Y" or m == "y"
if use_hash:
    M=int(input("M = "))
print("-------------------")
print("char count =",count)
print("alphanumeric count =",alpha)
print("line count =",lcount)
print("word count =",wcount)

# Count the words (or their hashes) in first-appearance order.
keys=[fhash(i,M) for i in word] if use_hash else word
tally={}
for key in keys:
    tally[key]=tally.get(key,0)+1
bow=[[key,total] for key,total in tally.items()]
print("BoW =",bow)
        

    
        
            





        
# 6330190021 (19.20) 49 (2021-03-21 14:24)

#-------------------------------------------
def char_count( s ) :
    """Return the number of ASCII letters (A-Z, a-z) in s."""
    count = 0
    for e in s :
        # Two separate ranges: the single range 'A'..'z' also matched the
        # six characters [ \ ] ^ _ ` that sit between 'Z' and 'a'.
        if 'A' <= e <= 'Z' or 'a' <= e <= 'z' : count += 1
    return count
    
def num_count( s ) :
    """Return the number of ASCII digits (0-9) in s."""
    return sum(1 for e in s if '0' <= e <= '9')
def BoW( l, stopwords, condi ) :
    """Return the sorted bag-of-words of the word list l.

    Words are lower-cased and those found in stopwords are dropped. When
    condi is found in 'yY' every word is replaced by fhash(word, M), with M
    taken from the global scope, before counting.

    Returns:
        A sorted list of [key, count] pairs.
    """
    lowered = sorted(e.lower() for e in l)
    kept = [e for e in lowered if e not in stopwords]
    if condi in 'yY' :
        kept = [fhash(e, M) for e in kept]
    distinct = []
    for e in kept :
        if e not in distinct :
            distinct.append(e)
    pairs = [[e, kept.count(e)] for e in distinct]
    pairs.sort()
    return pairs
def fhash( w, M ) :
    """Return the base-37 polynomial hash of w, reduced modulo M."""
    total = sum(ord(ch) * 37**i for i, ch in enumerate(w))
    return total % M

#-------------------------------------------

file_name = input('File name = ')
hashing = input('Use feature hashing ? (y,Y,n,N) ').strip()
# Compare against the four allowed answers; the substring test on 'yYnN'
# also accepted an empty answer and e.g. 'yY'.
while hashing not in ('y', 'Y', 'n', 'N') :
    print('Try again.')
    hashing = input('Use feature hashing ? (y,Y,n,N) ').strip()
if hashing in 'yY' :
    M = int(input('M = '))

with open('stopwords.txt', 'r') as fl :
    stopwords = [e.lower() for line in fl for e in line.strip().split()]

# Keep every line without its newline. Slicing off the last character of
# each line and re-adding line[-1] afterwards crashed on an empty file and
# counted a trailing newline as a character.
line_count = 0
lines = []
with open(file_name, 'r') as fl :
    for line in fl :
        lines.append(line.rstrip('\n'))
        line_count += 1
words_info = ''.join(lines)

print('-------------------')
print('char count =',len(words_info))
alp_count = char_count(words_info)+num_count(words_info)
print('alphanumeric count =',alp_count)
print('line count =',line_count)

# Split into words on the line-separated text so that the last word of a
# line is not merged with the first word of the next one.
special_char = '!@#$%^&*()_+-*/{}[]():;\"\',.?<>'
new_words = ''
for e in ' '.join(lines) :
    if e in special_char :
        new_words += ' '
    else:
        new_words += e
new_words = new_words.split()
print('word count =',len(new_words))
print('BoW =',BoW(new_words,stopwords,hashing))
# 6330191621 (18.32) 50 (2021-03-22 12:59)

def remove_n(text):
    """Return text with every newline character replaced by a space."""
    return ''.join(' ' if e == '\n' else e for e in text)
def info(text):
    """Print the char, alphanumeric, line and word counts of text.

    The char count excludes newlines, the line count is the number of
    newline characters, and the word count is the number of pieces obtained
    by splitting the newline-free text on single spaces, minus one.
    """
    alnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    c1 = sum(1 for item in text if item in alnum)
    c2 = sum(1 for i in range(len(text)) if text[i:i+1] == '\n')
    wordlst = remove_n(text).split(' ')
    print(f'char count = {len(text) - c2}')
    print(f'alphanumeric count = {c1}')
    print(f'line count = {c2}')
    print(f'word count = {len(wordlst)-1}')
def frq(item,list):
    """Return how many elements of list are equal to item."""
    return sum(1 for element in list if element == item)
def remove_punction(text):
    """Return text with each of the characters "'/\\().,;: replaced by a space."""
    punctuation = '\"\'/\\().,;:'
    pieces = []
    for e in text :
        pieces.append(' ' if e in punctuation else e)
    return ''.join(pieces)
def get_unique(list):
    """Return the items of list without duplicates, in first-seen order."""
    ulst = []
    for item in list:
        if item in ulst:
            continue
        ulst.append(item)
    return ulst
def bow(text,stopword):
    """Return the bag-of-words of text as [word, count] pairs.

    Punctuation is replaced by spaces, the text is case-folded and split,
    and words found in the whitespace-separated stopword string are left
    out. Pairs appear in first-appearance order, each word once.
    """
    wordlst = remove_punction(text).casefold().split()
    stopwordlst = stopword.split()
    bowlst = [[word, frq(word, wordlst)]
              for word in wordlst if word not in stopwordlst]
    return get_unique(bowlst)
def fhash(w,M):
    """Return the base-37 polynomial hash of w, reduced modulo M."""
    total = sum(ord(ch) * (37 ** i) for i, ch in enumerate(w))
    return total % M
def bowf(text,stopword,M):
    """Return the feature-hashed bag-of-words of text as [hash, count] pairs.

    The text is cleaned like in bow(); every remaining word is replaced by
    fhash(word, M). Pairs appear in first-appearance order, each hash once.
    """
    wordlst = remove_punction(text).casefold().split()
    stopwordlst = stopword.split()
    flst = [fhash(word, M) for word in wordlst if word not in stopwordlst]
    fhlst = [[x, frq(x, flst)] for x in flst]
    return get_unique(fhlst)
def main():
    """Run the interactive session: read the files, ask about hashing, print the report."""
    fn = open(input('File name = '),'r')
    fn2 = open('stopwords.txt','r')
    t = ''.join(fn)    # whole input text
    s = ''.join(fn2)   # whole stop-word file
    # Ask until the answer is one of the four accepted letters.
    while True:
        n = input('Use feature hashing ? (y,Y,n,N) ')
        if n in ['y','Y','n','N']:
            break
        print('Try again.')
    hashing = n in 'Yy'
    if hashing:
        M = int(input('M = '))
    print('-------------------')
    info(t)
    if hashing:
        print('BoW =', bowf(t,s,M))
    else:
        print('BoW =', bow(t,s))
    fn2.close()
    fn.close()


# Program entry point: run the interactive session defined in main().
main()
# 6330192221 (22.99) 51 (2021-03-22 23:09)
# Running totals filled in while the input file is read.
c = alpha = l = 0          # char count, alphanumeric count, line count
# Separate lists: words of the text, stop words, words kept, (unused) bag.
w, x, b, Bag = [], [], [], []
# Accepted answers to the feature-hashing question.
logic = ['y', 'Y', 'n', 'N']
def char_count(line):
    """Return the length of line minus one (i.e. without its final character)."""
    return len(line) - 1
#-------------------------------
def alphanumeric_count(line):
    """Return the number of ASCII letters and digits in line."""
    return sum(1 for t in line
               if 'a'<=t<='z' or 'A'<=t<='Z' or '0'<=t<='9')
#-------------------------------
def line_count(line):
    """Return 1 for a non-empty line and 0 for an empty string."""
    return 1 if len(line) != 0 else 0
#-------------------------------
def word_count(line):
    """Return the lower-cased words of line as a list.

    A word is a maximal run of ASCII letters and digits; any other character
    separates words.
    """
    word=[]
    s=""
    for t in line:
        if 'a'<=t<='z' or 'A'<=t<='Z' or '0'<=t<='9':
            s+=t
        else:
            if s!="":
                word.append(s.lower())
            s=""
    # Flush the word that ends the line: without a trailing newline or other
    # separator the last word used to be dropped.
    if s!="":
        word.append(s.lower())
    return word
#-------------------------------
def BoW(lis):
    """Return [item, count] pairs for lis, one per distinct item, in first-seen order."""
    pairs = []
    for item in lis:
        pair = [item, lis.count(item)]
        if pair not in pairs:
            pairs.append(pair)
    return pairs
#-----------------------------
def fhash(w,M):
    """Return the base-37 polynomial hash of w modulo M.

    The running sum is reduced modulo M after every character.
    """
    G = 37
    s = 0
    for i, ch in enumerate(w):
        s = (s + ord(ch) * (G**i)) % M
    return s
#-----------------------------
def hashedBoW(wordlist,M):
    """Return BoW() of the fhash(word, M) values of the words in wordlist."""
    return BoW([fhash(word, M) for word in wordlist])
#-----------------------------
file_name=input("File name = ")
# Both files are closed by the with-block; they were never closed before.
with open(file_name,"r") as fin, open("stopwords.txt","r") as fin2:
    a=input("Use feature hashing ? (y,Y,n,N) ")
    for line in fin:
        c+=char_count(line)
        # char_count() drops one character for the newline; give it back
        # when the line has none (last line of the file). The unconditional
        # "+1" after the loop over-counted for files ending with a newline
        # and for empty files.
        if not line.endswith("\n"):
            c+=1
        alpha+=alphanumeric_count(line)
        l+=line_count(line)
        w+=word_count(line)
    for line in fin2:
        x+=line.split()
lw=len(w)
for i in range(len(w)):
    if w[i] not in x:
        b.append(w[i].lower())

# Ask again until the answer is valid, then print the report once.
while a not in logic:
    print("Try again.")
    a=input("Use feature hashing ? (y,Y,n,N) ")
if a=='y' or a=='Y':
    M=int(input("M = "))
print("-------------------")
print("char count =",c)
print("alphanumeric count =",alpha)
print("line count =",l)
print("word count =",lw)
if a=='y' or a=='Y':
    print("BoW =",hashedBoW(b,M))
else:
    print("BoW =",BoW(b))
    







# 6330193921 (30.00) 52 (2021-03-20 22:26)

#--------------------------------------------------
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % int(M)``.

    ``M`` may be an int or a numeric string; it is converted with ``int``.
    """
    total = sum(ord(char) * 37 ** position for position, char in enumerate(w))
    return total % int(M)
def ch_count(file_name):
    """Return the number of characters in the file, not counting newlines.

    The file is opened in a ``with`` block so the handle is released even if
    reading fails part-way through.
    """
    with open(file_name) as file:
        return sum(1 for line in file for char in line if char != '\n')
def ch_num(file_name):
    """Return how many characters of the file are letters (A-Z, any case) or digits.

    The file is opened in a ``with`` block so the handle is released even if
    reading fails part-way through.
    """
    with open(file_name) as file:
        return sum(
            1
            for line in file
            for char in line
            if 'A' <= char.upper() <= 'Z' or '0' <= char <= '9'
        )
def w_count(file_name):
    """Return the number of words in the file.

    Every character that is not a letter or digit acts as a separator; words
    are counted per line.  The file is opened in a ``with`` block so the
    handle is released even if reading fails.
    """
    total = 0
    with open(file_name) as file:
        for line in file:
            cleaned = ''.join(
                char if ('A' <= char.upper() <= 'Z' or '0' <= char <= '9') else ' '
                for char in line
            )
            total += len(cleaned.split())
    return total
def wl_count(words, word):
    """Return how many elements of ``words`` are equal to ``word``."""
    return sum(1 for candidate in words if candidate == word)
        
def l_count(file_name):
    """Return the number of lines in the file.

    The file is opened in a ``with`` block so the handle is released even if
    reading fails.
    """
    with open(file_name) as file:
        return sum(1 for _ in file)
def BoW(file_name):
    """Build the sorted bag-of-words of ``file_name`` without stop words.

    Reads the module-level answer ``YON``: for 'y'/'Y' the entries are
    ``[fhash(word, M), count]`` (``M`` is also module-level); for 'n'/'N'
    they are ``[word, count]``.  Any other answer yields an empty list.
    """
    words = only_words(file_name)
    stopwords = only_words('stopwords.txt')
    kept = [token for token in words if token not in stopwords]
    bag = []
    if YON in ['y', 'Y']:
        hashes = [fhash(token, M) for token in kept]
        # dict.fromkeys keeps each distinct hash once, in first-seen order.
        bag = sorted([code, hashes.count(code)] for code in dict.fromkeys(hashes))
    if YON in ['n', 'N']:
        bag = sorted([token, kept.count(token)] for token in dict.fromkeys(kept))
    return bag
        
def only_words(file_name):
    """Return the lower-cased words of the file as a list.

    Every character that is not a letter or digit (including the newline) is
    treated as a separator.  The file is opened in a ``with`` block so the
    handle is released even if reading fails.
    """
    with open(file_name) as file:
        text = ''.join(
            char.lower() if ('A' <= char.upper() <= 'Z' or '0' <= char <= '9') else ' '
            for line in file
            for char in line
        )
    return text.split()

#------------------------------------------------

# Ask for the file and the hashing choice (re-asking until the answer is one
# of y/Y/n/N), then print the report.
file_name = input('File name = ')
while True:
    YON = input('Use feature hashing ? (y,Y,n,N) ')
    if YON in ('y', 'Y'):
        # M is kept as a string; fhash converts it with int().
        M = input('M = ')
        break
    if YON in ('n', 'N'):
        break
    print('Try again.')

print('-------------------')
print(f'char count = {ch_count(file_name)}')
print(f'alphanumeric count = {ch_num(file_name)}')
print(f'line count = {l_count(file_name)}')
print(f'word count = {w_count(file_name)}')
print(f'BoW = {BoW(file_name)}')
# 6330194521 (13.16) 53 (2021-03-22 00:22)
#Prog-08: Bag-of-words
#6330194521 (13.16) Name Taechit Pornsukasem
# Read every line of the input file into `word`, then ask for the hashing
# choice until one of y/Y/n/N is entered.
fn = input("File name: ")
with open(fn, 'r') as file_name:
    word = file_name.readlines()

while True:
    ufh = input("Use feature hashing? (y,Y,n,N) ")
    if ufh in ('y', 'Y'):
        M = int(input("M = "))
        break
    if ufh in ('n', 'N'):
        break
    print("Try again.")
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(char) * 37 ** position for position, char in enumerate(w)) % M

# Statistics over `word` (the list of raw lines read from the input file).
#
# Fixes relative to the previous version:
#   * punctuation was removed from `alp` with list.remove() while iterating
#     over the same list, which skips elements;
#   * the counts assumed every line ends with a newline;
#   * any symbol outside a short punctuation list counted as alphanumeric;
#   * punctuation was deleted instead of separating words ("a,b" -> "ab");
#   * the stop-word file name differed from the one used by every other
#     program in this file ('stopwords.txt').


def _is_alnum(ch):
    """Return True for an ASCII letter or digit."""
    return ch.isascii() and ch.isalnum()


# Characters on each line, not counting the line terminator.
char_count = sum(len(line.rstrip('\n')) for line in word)
print('char count =', char_count)

alp = [ch for line in word for ch in line if _is_alnum(ch)]
alp_count = len(alp)
print('alphanumeric count =', alp_count)
line_count = len(word)
print('line count =', line_count)

# Lower-cased words; every non-alphanumeric character is a separator.
res = "".join(
    ch.lower() if _is_alnum(ch) else " " for line in word for ch in line
).split()
print('word count =', len(res))

with open('stopwords.txt', "r") as stop:
    stopword = stop.read().lower().split()

# Words that survive stop-word removal.
want = [token for token in res if token not in stopword]
def bow1(want):
    """Return ``[item, count]`` pairs for ``want`` in first-seen order."""
    tally = {}
    for item in want:
        tally[item] = tally.get(item, 0) + 1
    return [[item, count] for item, count in tally.items()]
def bow2(want):
    """Return sorted ``[hash, count]`` pairs for the words in ``want``.

    Each word is hashed with ``fhash(word, M)`` using the module-level ``M``.
    """
    tally = {}
    for item in want:
        code = fhash(item, M)
        tally[code] = tally.get(code, 0) + 1
    return sorted([code, count] for code, count in tally.items())
# Print the hashed bag for y/Y and the plain bag for n/N.
if ufh in ('y', 'Y'):
    print('BoW =', bow2(want))
elif ufh in ('n', 'N'):
    print('BoW =', bow1(want))
# 6330197421 (22.90) 54 (2021-03-21 13:13)
# Ask for the file name and the hashing choice.  After the loop
# `feature_hashing` is a bool, and `M` is set when hashing was chosen.
file_name = input("File name = ")

while True:
    feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
    if feature_hashing in ("y", "Y"):
        feature_hashing = True
        M = int(input("M = "))
        break
    if feature_hashing in ("n", "N"):
        feature_hashing = False
        break
    print("Try again.")

print("-------------------")
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    total = 0
    for position, char in enumerate(w):
        total += ord(char) * 37 ** position
    return total % M

# Compute the statistics and the bag-of-words for `file_name`.
#
# Fixes relative to the previous version:
#   * stop words are actually lower-cased (the old loop rebound its loop
#     variable and changed nothing);
#   * "A" <= e <= "z" also accepted [ \ ] ^ _ and the backtick; only ASCII
#     letters and digits count now;
#   * non-alphanumeric characters separate words, so punctuation-only tokens
#     no longer produce empty words;
#   * the bag is built once after reading instead of once per line, and
#     word_count / BoW are defined even for an empty file;
#   * the stray double space in the "char count" line is gone.
with open("stopwords.txt") as Filehandler:
    stopwords = [word.lower() for line in Filehandler for word in line.split()]

character_count = 0
alphanumeric_count = 0
line_count = 0
words = []

with open(file_name) as Filehandler:
    for line in Filehandler:
        line = line.rstrip("\n")
        character_count += len(line)
        line_count += 1

        cleaned = ""
        for e in line:
            if e.isascii() and e.isalnum():
                alphanumeric_count += 1
                cleaned += e.lower()
            else:
                cleaned += " "
        words += cleaned.split()

word_count = len(words)
file_name_no_stopwords = [word for word in words if word not in stopwords]

# Bag keys are the hashes when hashing is on, otherwise the words themselves.
if feature_hashing:
    keys = [fhash(w, M) for w in file_name_no_stopwords]
else:
    keys = file_name_no_stopwords

counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
BoW = sorted([key, number] for key, number in counts.items())

print("char count =", character_count)
print("alphanumeric count =", alphanumeric_count)
print("line count =", line_count)
print("word count =", word_count)
print("BoW =", BoW)
# 6330198021 (21.80) 55 (2021-03-22 13:36)
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    total = sum(ord(char) * 37 ** position for position, char in enumerate(w))
    return int(total) % M
#x = fhash('football',4)
#print(x)
#กรณีทดสอบ
def remove_punc(t):
    """Return ``t`` with each of the characters ``' " / \\ ( ) . , ; :`` replaced by a space."""
    return ''.join(' ' if e in '\'\"/\\().,;:' else e for e in t)
def remove(u):
    """Return ``u`` with every newline character deleted."""
    # Newlines vanish entirely; nothing is put in their place.
    return ''.join(e for e in u if e != '\n')
def Remove(v):
    """Return ``v`` with every newline character replaced by a space."""
    return ''.join(' ' if e == '\n' else e for e in v)

# Read the file name without a prompt and echo it in the expected format.
file = input()
print('File name = '+file)
file_name = open(file,'r')
file_name2 = open('stopwords.txt','r')
# LINE keeps the raw text; Line is the same text with newlines deleted.
LINE = ''
Line = ''
for line in file_name:
    LINE += line # accumulate each line as read
    Line += remove(line) # accumulate each line with its newline removed
# Same for the stop-word file, except Line2 has newlines replaced by spaces.
LINE2 = ''
Line2 = ''
for line in file_name2:
    LINE2 += line
    Line2 += Remove(line)
 
# Read the hashing answer without a prompt; each answer is echoed after the
# prompt text, and invalid ones are followed by 'Try again.'.
x1 = input()     
while x1 not in ['y','Y','n','N']:
    print('Use feature hashing ? (y,Y,n,N)' ,x1)
    print('Try again.')
    x1 = input()
    if x1 in ['y','Y','n','N']:
        break
if x1 in ['y','Y']:
    print('Use feature hashing ? (y,Y,n,N)' ,x1)
    # m is the modulus later passed to fhash.
    m = int(input())
    print('M =' ,m)
    print('-'*19)
elif x1 in ['n','N']:
    print('Use feature hashing ? (y,Y,n,N)' ,x1)
    print('-'*19)
    
# Words of the input file.  Newlines are turned into spaces (Remove) before
# splitting; the previous version split the newline-free text `Line`, which
# glued the last word of a line to the first word of the next one.
x2 = remove_punc(Remove(LINE)).lower().split()

# Words of stopwords.txt.
x3 = remove_punc(Line2).lower().split()

# `Line` has its newlines removed, so its length is the character count.
char_count = len(Line)

print('char count =' ,char_count)

Line = Line.lower()
alphanumeric = sum(1 for e in Line if e in 'abcdefghijklmnopqrstuvwxyz0123456789')

print('alphanumeric count =' ,alphanumeric)

# One line per newline, plus a final line that has no terminating newline.
# The previous version started at 1 unconditionally, so it was one too high
# for files ending in a newline and reported 1 for an empty file.
line_count = LINE.count('\n')
if LINE and not LINE.endswith('\n'):
    line_count += 1

print('line count =' ,line_count)

word_count = len(x2)

print('word count =' ,word_count)

if x1 in ['n','N']: # build the BoW from the words that survive stop-word removal
    f = [w for w in x2 if w not in x3] # surviving words

    # Count each distinct word once.  The previous version removed duplicates
    # by comparing result[i] with result[i-1]; at i == 0 that wraps around to
    # the last element, so the bag came out empty whenever only one distinct
    # word was present.
    counts = {}
    for r1 in f:
        counts[r1] = counts.get(r1, 0) + 1

    BoW = sorted([r1, k] for r1, k in counts.items())

    print('BoW =' ,BoW)
    
if x1 in ['y','Y']:
    f = [w for w in x2 if w not in x3] # surviving words

    # Count each distinct hash once.  The previous version removed duplicates
    # by comparing real_Result[i] with real_Result[i-1]; at i == 0 that wraps
    # around to the last element, so the bag came out empty whenever only one
    # distinct hash was present.
    counts = {}
    for x4 in f:
        code = fhash(x4, m)
        counts[code] = counts.get(code, 0) + 1

    BOW = sorted([code, k] for code, k in counts.items())

    print('BoW =' ,BOW)

file_name.close()
file_name2.close()
# 6330199721 (0.00) 56 (2021-03-22 17:35)

# Ask for the file name and the hashing choice.
# The answer is compared against the four allowed values explicitly; the
# previous `a in 'nN'` / `a in 'yY'` substring tests also accepted '' and
# multi-character strings such as 'nN'.
file_name = input('File name = ')
fn = open(file_name.strip(), 'r')
a = input('Use feature hashing ? (y,Y,n,N) ')
while a not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    a = input('Use feature hashing ? (y,Y,n,N) ')
if a in ('y', 'Y'):
    # M stays a string; fhash converts it with int().
    M = input('M = ')
print('-------------------')


# Character, alphanumeric, line and word counts over the open file `fn`.
s = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890'
cc = 0
ac = 0
lc = 0
wc = 0
for i in fn:
    lc+=1
    # Count the line without its terminator.  The old "start at 1, subtract
    # the line count" arithmetic was only right when the last line had no
    # trailing newline.
    i = i.rstrip('\n')
    cc += len(i)
    for b in range(len(i)):
        if i[b] in s:
            ac+= 1
            # A word ends at an alphanumeric character that is last on the
            # line or followed by a non-alphanumeric one.  The old code read
            # i[b+1] unconditionally and raised IndexError at end of file.
            if b + 1 == len(i) or i[b+1] not in s:
                wc+=1
# This handle is exhausted; the BoW sections below reopen the file.
fn.close()
print('char count =',cc)
print('alphanumeric count =',ac)
print('line count =',lc)
print('word count =',wc)



# Plain bag-of-words (no hashing).
if a in ('n', 'N'):
    s = 'abcdefghijklmnopqrstuvwxyz1234567890'
    with open(file_name.strip(), 'r') as fn:
        # Every non-alphanumeric character, including the newline, becomes a
        # space.  The previous version stripped the newline first, which glued
        # the last word of a line to the first word of the next.
        w = ''.join(i if i in s else ' ' for l in fn for i in l.lower()).split()
    with open("stopwords.txt", 'r') as st:
        stop = {i for l in st for i in l.split()}
    w = [i for i in w if i not in stop]

    # [word, count] pairs in first-seen order.
    counts = {}
    for i in w:
        counts[i] = counts.get(i, 0) + 1
    bow = [[i, x] for i, x in counts.items()]

    print('BoW =',bow)


def fhash(x, M):
    """Return ``sum(ord(x[i]) * 37**i) % int(M)`` as a string."""
    total = sum(ord(char) * pow(37, position) for position, char in enumerate(x))
    return str(total % int(M))



# Hashed bag-of-words.
if a in ('y', 'Y'):
    s = 'abcdefghijklmnopqrstuvwxyz1234567890'
    with open(file_name.strip(), 'r') as fn:
        # Every non-alphanumeric character, including the newline, becomes a
        # space, so words on adjacent lines are no longer glued together.
        w = ''.join(e if e in s else ' ' for l in fn for e in l.lower()).split()
    with open("stopwords.txt", 'r') as st:
        stop = {i for l in st for i in l.split()}

    # fhash returns the hash as a string.  The previous `j += fhash(i, M)`
    # extended the list with the individual digit characters of that string,
    # so a hash such as "12" was counted as "1" and "2".
    counts = {}
    for i in w:
        if i not in stop:
            code = int(fhash(i, M))
            counts[code] = counts.get(code, 0) + 1
    bow = sorted([code, x] for code, x in counts.items())
    print('BoW =',bow)

# Closing an already-closed file is a no-op, so these stay safe.
fn.close()
st.close()
# 6330200621 (19.80) 57 (2021-03-22 00:15)

# Ask for the file name and the hashing choice.
# The answer is compared against the four allowed values explicitly; the
# previous `fh in 'yYnN'` substring tests also accepted '' and multi-character
# strings such as 'yY'.
file_name = input('File name = ')
fn = open(file_name.strip(), 'r')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
if fh in ('y', 'Y'):
    # M stays a string; fhash converts it with int().
    M = input('M = ')
#------------------------------------------------------------   
def everything(fn):
    """Return ``(alphanumeric count, char count, line count, word count)``.

    ``fn`` is an iterable of lines (e.g. an open text file).  Each line is
    stripped of newline characters and lower-cased before counting.  Any
    character other than a-z / 0-9 separates words, and so does the end of
    each line -- previously the last word of a line was glued to the first
    word of the next one.
    """
    ac = set('abcdefghijklmnopqrstuvwxyz1234567890')
    anc = 0
    cc = 0
    lc = 0
    pieces = []
    for line in fn:
        line = line.strip('\n').lower()
        lc += 1
        cc += len(line)
        for e in line:
            if e in ac:
                anc += 1
                pieces.append(e)
            else:
                pieces.append(' ')
        # The line break itself separates words.
        pieces.append(' ')
    wc = len(''.join(pieces).split())
    return anc,cc,lc,wc
#-------------------------------------------------------------

# Compute the four counts from the open file and print the report.
anc, cc, lc, wc = everything(fn)
print('-' * 19)
print(f'char count = {cc}')
print(f'alphanumeric count = {anc}')
print(f'line count = {lc}')
print(f'word count = {wc}')
fn.close()

#---------------------------------------------------
def bow1():
    """Return ``[word, count]`` pairs for the input file, in first-seen order.

    Reads the module-level ``file_name`` and 'stopwords.txt'.  Words are
    lower-cased runs of a-z / 0-9; everything else, including the line break,
    is a separator (previously words on adjacent lines were glued together).
    Words listed in stopwords.txt are left out.
    """
    ac = set('abcdefghijklmnopqrstuvwxyz1234567890')
    with open(file_name.strip(), 'r') as fn:
        text = ''.join(e if e in ac else ' ' for line in fn for e in line.lower())
    with open("stopwords.txt", 'r') as st:
        stop = {e for line in st for e in line.split()}

    counts = {}
    for e in text.split():
        if e not in stop:
            counts[e] = counts.get(e, 0) + 1
    return [[e, z] for e, z in counts.items()]
#---------------------------------------------------

# Plain bag-of-words when hashing was declined.
if fh in ('n', 'N'):
    bow = bow1()
    print('BoW =', bow)
    
#---------------------------------------------------
def fhash(a, M):
    """Return ``sum(ord(a[i]) * 37**i) % int(M)`` as a string."""
    total = sum(ord(char) * 37 ** position for position, char in enumerate(a))
    return str(total % int(M))
#----------------------------------------
def bow2():
    """Return sorted ``[hash, count]`` pairs for the input file.

    Reads the module-level ``file_name`` and ``M`` and 'stopwords.txt'.
    Words are lower-cased runs of a-z / 0-9; everything else, including the
    line break, is a separator (previously words on adjacent lines were glued
    together).  Stop words are left out and every remaining word is hashed
    with ``fhash(word, M)``.
    """
    ac = set('abcdefghijklmnopqrstuvwxyz1234567890')
    with open(file_name.strip(), 'r') as fn:
        text = ''.join(e if e in ac else ' ' for line in fn for e in line.lower())
    with open("stopwords.txt", 'r') as st:
        stop = {e for line in st for e in line.split()}

    counts = {}
    for f in text.split():
        if f not in stop:
            # fhash returns the hash as a string.
            code = int(fhash(f, M))
            counts[code] = counts.get(code, 0) + 1
    # Sort once at the end instead of after every insertion.
    return sorted([code, z] for code, z in counts.items())
#-------------------------------------------
# Hashed bag-of-words when hashing was requested.
if fh in ('y', 'Y'):
    bbb = bow2()
    print('BoW =', bbb)
# 6330201221 (24.90) 58 (2021-03-22 19:50)
#Prog-08: Bag-of-words
#6330201221 (24.90) Thatphong Hengchun
# Character classes used by the counting code further down.
num = list('0123456789')
alphabet = list('abcdefghijklmnopqrstuvwxyz')

file_name = input('File name = ').strip()
# `check` ends up as 'y' or 'n'; M is only changed from 0 when hashing is chosen.
check = ''
M = 0
while check == '':
    useHash = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if useHash == 'y':
        M = M + int(input('M = '))
        check = 'y'
    elif useHash == 'n':
        check = 'n'
    else:
        print('Try again')
def nFhash(w):
    """Return ``[item, count]`` pairs for the list ``w`` in first-seen order."""
    tally = {}
    for item in w:
        tally[item] = tally.get(item, 0) + 1
    return [[item, count] for item, count in tally.items()]
def fhash(word, m):
    """Return ``sum(ord(word[i]) * 37**i) % m``."""
    return sum(ord(char) * 37 ** position for position, char in enumerate(word)) % m
def yFhash(w, n):
    """Return ``[hash, count]`` pairs for the words in ``w``.

    Each word is hashed with ``fhash(word, n)`` and the hashes are counted by
    ``nFhash``.  The hashes go into a new list; previously the caller's list
    was overwritten in place.
    """
    return nFhash([fhash(word, n) for word in w])
def charInLine(s):
    """Return the length of ``s`` after stripping leading and trailing whitespace."""
    stripped = s.strip()
    return len(stripped)

#w = ['shane','likes','football','big','fan','football','team','arsenal']
#for e in w:
#    print(fhash(e,M))
# Load the stop words (lower-cased); the file is closed once read.
with open('stopwords.txt','r') as stFile:
    stop_words = [word.lower() for line in stFile for word in line.strip().split()]

charCount = 0
lineCount = 0
text = ''
words = []
with open(file_name,'r') as txFile:
    for line in txFile :
        words += line.lower().split()
        charCount += charInLine(line)
        lineCount += 1

# Rebuild the text keeping only a-z / 0-9.  Any other character now becomes a
# space, so punctuation separates words instead of being silently dropped
# (which used to turn "a,b" into the single word "ab").
alpnumCount = 0
for word in words:
    for alp in word :
        if alp in alphabet or alp in num :
            text += alp
            alpnumCount += 1
        else :
            text += ' '
    text += ' '

cWords = text.split()
wordCount = len(cWords)
# Words that survive stop-word removal.
dWords = [f for f in cWords if f not in stop_words]
print('-------------------')
print('char count =',charCount)
print('alphanumeric count =',alpnumCount)
print('line count =',lineCount)
print('word count =',wordCount)
if check == 'y':
    print('BoW =',sorted(yFhash(dWords,M)))
else :
    print('BoW =',sorted(nFhash(dWords)))
# 6330202921 (21.05) 59 (2021-03-22 23:21)

def listOfStopWord():
    """Return the whitespace-separated words of 'stopwords.txt' as a list.

    The file is opened in a ``with`` block so the handle is released even if
    reading fails.
    """
    with open('stopwords.txt', 'r') as f:
        return [word for line in f for word in line.split()]
def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``.

    The running value is reduced modulo ``M`` after each character.
    """
    val = 0
    for position, char in enumerate(w):
        val = (val + ord(char) * 37 ** position) % M
    return val
def check(word,l,noHash,M):
    """Add one occurrence of ``word`` to the bag ``l`` unless it is a stop word.

    ``l`` is a list of ``[key, count]`` pairs and is updated in place.  The
    key is the lower-cased word when ``noHash`` is true, otherwise
    ``fhash(lower-cased word, M)``.

    The word is lower-cased before it is counted.  Previously only the
    stop-word test ignored case, so "Word" and "word" got separate entries.
    """
    word = word.lower()
    if word in listOfStopWord():
        return
    if not noHash:
        word = fhash(word,M)
    for p in l:
        if p[0]==word:
            p[1] += 1
            return
    l.append([word,1])
def cal(filename,noHash,M):
    """Print the counts for ``filename`` and return its bag-of-words list.

    Each line is stripped of surrounding whitespace; a word is a maximal run
    of ASCII letters/digits.  Every word is handed to ``check`` together with
    ``noHash`` and ``M``.

    Fixes relative to the previous version:
      * the alphabet used to find word boundaries omitted the letter 'p', so
        words were split at every 'p';
      * the last word on a line lost its final character (``line[tmp:i]``);
      * the stop-word list was loaded here but never used.
    """
    alnum = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    l = []
    charcount = 0
    alphanumericcount = 0
    linecount = 0
    wordcount = 0
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            charcount += len(line)
            token = ''
            for ch in line:
                if ch in alnum:
                    alphanumericcount += 1
                    token += ch
                elif token:
                    # A non-alphanumeric character ends the current word.
                    check(token,l,noHash,M)
                    wordcount += 1
                    token = ''
            if token:
                # Word that runs to the end of the line.
                check(token,l,noHash,M)
                wordcount += 1
            linecount += 1
    print('char count =',charcount)
    print('alphanumeric count =',alphanumericcount)
    print('line count =',linecount)
    print('word count =',wordcount)
    return l


def main():
    """Prompt for the file and hashing choice, then print the report.

    Fixes relative to the previous version: the separator line is printed
    once, after all input has been read (it used to be printed before the
    ``M`` prompt and again on every retry); an invalid answer prints
    'Try again.' and repeats the original question; and the local variable no
    longer shadows the module-level ``check`` function.
    """
    filename = input('File name = ')
    answer = input('Use feature hashing ? (y,Y,n,N) ')
    while answer.lower() not in ('y', 'n'):
        print('Try again.')
        answer = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = answer.lower() == 'y'
    M = int(input('M = ')) if use_hash else 0
    print('-------------------')
    BoW = cal(filename, not use_hash, M)
    print('BoW =',BoW)
main()
# 6330203521 (30.00) 60 (2021-03-22 13:03)

def char_count(file_name):
    """Return the number of characters in the lines of ``file_name``, ignoring newlines.

    ``file_name`` is an iterable of lines (the caller passes an open file).
    """
    # Each line read from a file carries a hidden "\n" at its end; skip it.
    return sum(1 for line in file_name for c in line if c != "\n")
def alpha_count(file_name):
    """Return how many characters in the lines of ``file_name`` are A-Z, a-z or 0-9."""
    return sum(
        1
        for line in file_name
        for c in line
        if ("A" <= c <= "Z") or ("a" <= c <= "z") or (c in "0123456789")
    )
def line_count(file_name):
    """Return the number of items (lines) produced by iterating ``file_name``."""
    return sum(1 for _ in file_name)
def word_count(file_name):
    """Return the lower-cased words found in the lines of ``file_name``.

    Despite the name this returns the list of words, not a number.  Every
    character other than A-Z, a-z and 0-9 acts as a separator.
    """
    sen = "".join(
        c if ("A" <= c <= "Z") or ("a" <= c <= "z") or (c in "0123456789") else " "
        for line in file_name
        for c in line
    )
    return sen.lower().strip().split()
def show_result(a):
    """Print the char, alphanumeric, line and word counts of the file ``a``.

    The file is reopened for every measurement because each counting function
    consumes the file object it is given.  Each handle lives in a ``with``
    block so it is closed even if a counting function raises.
    """
    reports = (
        ("char count =", char_count),
        ("alphanumeric count =", alpha_count),
        ("line count =", line_count),
        ("word count =", lambda handle: len(word_count(handle))),
    )
    for label, measure in reports:
        with open(a, "r") as file_name:
            print(label, measure(file_name))
def BoW(a):
    """Return the words of file ``a`` that are not listed in 'stopwords.txt'.

    Words come from ``word_count``; stop words are lower-cased before the
    comparison.  Both files are opened in ``with`` blocks so the handles are
    closed even if reading fails.
    """
    with open(a, "r") as file_name, open("stopwords.txt", "r") as stop:
        data_s = [word for line in stop for word in line.lower().strip().split()]
        data = word_count(file_name)
    return [c for c in data if c not in data_s]
def fhash(w, M):
    """Return the hashed bag-of-words of the word list ``w``.

    Each word is hashed as ``sum(ord(c) * 37**i) % int(M)``; the result is a
    list of ``[hash, count]`` pairs sorted by hash.  An empty ``w`` gives
    ``[]``.

    This replaces the previous sentinel-plus-bare-``except`` approach, which
    hid any error raised while preparing the data, with an explicit
    empty-input check and a plain tally.
    """
    if not w:
        return []
    modulus = int(M)
    counts = {}
    for word in w:
        code = sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % modulus
        counts[code] = counts.get(code, 0) + 1
    return [[code, counts[code]] for code in sorted(counts)]
def nofhash(data):
    """Return ``[word, count]`` pairs for ``data``, sorted by word.

    ``data`` is left untouched.  Previously the caller's list was sorted in
    place and had a " " sentinel appended to it.
    """
    counts = {}
    for word in data:
        counts[word] = counts.get(word, 0) + 1
    return [[word, counts[word]] for word in sorted(counts)]
#-----------------------------------------------------------------------------------
    
# Ask for the file and the hashing choice (re-asking until valid), then print
# the counts followed by the bag-of-words.
a = input("File name = ")
b = input("Use feature hashing ? (y,Y,n,N) ")
while b not in ("y", "Y", "n", "N"):
    print("Try again.")
    b = input("Use feature hashing ? (y,Y,n,N) ")
if b in ("y", "Y"):
    # M stays a string; fhash converts it with int().
    M = input("M = ")
print("-------------------")
show_result(a)
word = BoW(a)
if b in ("y", "Y"):
    ansf = fhash(word, M)
    print("BoW =", ansf)
else:
    ansnf = nofhash(word)
    print("BoW =", ansnf)
# 6330205821 (30.00) 61 (2021-03-22 14:48)
def alc(line):
    """Return how many characters of ``line`` are lower-case a-z or digits 0-9."""
    return sum(1 for ch in line if 'a' <= ch <= 'z' or '0' <= ch <= '9')
#--------------------------------
def charcount(line):
    """Return the length of ``line``."""
    return len(line)
#--------------------------------
def wc(line):
    """Return ``(number_of_words, words)`` for ``line``.

    Only lower-case a-z and 0-9 are word characters; every other character is
    replaced by a space before splitting.
    """
    cleaned = ''.join(
        e if (e in 'abccdefghijklmnopqrstuvwxyz' or e in '0123456789') else ' '
        for e in line
    )
    words = cleaned.strip().split()
    return len(words), words
#-----------------------------------
def BoW(word_all):
    """Return sorted ``[word, count]`` pairs for the words in ``word_all``.

    Words listed in 'stopwords.txt' are left out.  The stop-word file is read
    in a ``with`` block, and the counts are tallied in a single pass instead
    of rescanning the result list and ``word_all`` for every word.
    """
    with open('stopwords.txt','r') as cut:
        stop = {word for l in cut for word in l.strip().split()}
    counts = {}
    for e in word_all:
        if e not in stop:
            counts[e] = counts.get(e, 0) + 1
    return sorted([word, number] for word, number in counts.items())
#-----------------------------------------
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % int(M)``."""
    total = sum(ord(char) * 37 ** position for position, char in enumerate(w))
    return total % int(M)

#--------------------------------------------
def bowfh(word_all,M):
    """Return sorted ``[hash, count]`` pairs for the words in ``word_all``.

    Words listed in 'stopwords.txt' are left out; the rest are hashed with
    ``fhash(word, M)``.  The stop-word file is read in a ``with`` block, the
    unused ``found`` flag is gone, and counts are tallied in one pass.
    """
    with open('stopwords.txt','r') as cut:
        stop = {word for l in cut for word in l.strip().split()}
    counts = {}
    for e in word_all:
        if e not in stop:
            code = fhash(e,M)
            counts[code] = counts.get(code, 0) + 1
    return sorted([code, number] for code, number in counts.items())
           
# Ask for the file and the hashing choice, re-asking until the answer is valid.
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
if fh in ('y', 'Y'):
    # m stays a string; fhash converts it with int().
    m = input('M = ')
print('-------------------')

a = 0
cha = 0
l = 0
word = 0
word_all = []
# `with` closes the file even if one of the counting helpers raises.
with open(file_name, 'r') as f:
    for line in f:
        line = line.strip().lower()
        a += alc(line)
        cha += charcount(line)
        l += 1
        # Split the line once and reuse both results.
        words_in_line, line_words = wc(line)
        word += words_in_line
        word_all += line_words
print('char count =',cha)
print('alphanumeric count =',a)
print('line count =',l)
print('word count =',word)
# Build only the bag that is going to be printed.
if fh in ('n', 'N'):
    bow = BoW(word_all)
    print('BoW =',bow)
else:
    print('BoW =',bowfh(word_all,m))
    
    

# 6330206421 (30.00) 62 (2021-03-21 22:44)
# Character classes and counters used by the processing code further down.
alphabet = list('abcdefghijklmnopqrstuvwxyz')
num = list('0123456789')
alc = 0
# check: 0 = no valid answer yet, 1 = hashing, 2 = no hashing.
check = 0
f1 = input("File name = ").strip()
while check == 0:
    sel = input("Use feature hashing ? (y,Y,n,N) " ).strip()
    if sel in ("y", "Y"):
        M = int(input("M = "))
        check = 1
    elif sel in ("n", "N"):
        check = 2
    else:
        print("Try again")
        
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    chars = list(w)
    total = 0
    for position, char in enumerate(chars):
        total += ord(char) * 37 ** position
    return total % M

# Load stopwords.txt: one stripped, lower-cased string per line in `stw`,
# then every whitespace-separated word in `stopword`.
stop = open("stopwords.txt", "r")
stw = [line.strip().lower() for line in stop]
listToStr = ' '.join(stw)
stopword = listToStr.split()
stop.close()

# Read the input file: `k` holds the stripped, lower-cased lines, `t` is
# their concatenation and `forlist` is the same text with a space before
# every line, so words on adjacent lines stay apart.
k = []
infile = open(f1, "r")
for line in infile:
    k.append(line.strip().lower())
t = "".join(k)
forlist = "".join(" " + point for point in k)

list1 = list(forlist)
# Letters and digits add to the alphanumeric counter.
alc += sum(1 for i in list1 if i in alphabet or i in num)

# Replace every other character by a space and split into words.
list2 = list1
newworld = [i if (i in alphabet or i in num) else " " for i in list2]
mother = ''.join(newworld)
wordlist = mother.split()

line = len(wordlist)
char = len(t)
print ("-"*19)
print ("char count = "+str(char))
print ("alphanumeric count = "+str(alc))
print ("line count = "+str(len(k)))
print ("word count = "+str(line))

# Plain bag-of-words: [word, count] pairs in first-seen order.
new = []
if check == 2 :
    new = [i for i in wordlist if i not in stopword]
    # One pair per occurrence; duplicate pairs are dropped just below.
    new2 = [[i, new.count(i)] for i in new]
    new3 = []
    for pair in new2:
        if pair not in new3:
            new3.append(pair)
    print ("BoW = "+str(new3))
    
# Hashed bag-of-words: sorted [hash, count] pairs.
new = []
if check == 1 :
    new = [i for i in wordlist if i not in stopword]
    # Hashes are kept as strings while counting and converted back for output.
    countfhash = [str(fhash(i, M)) for i in new]
    new2 = [[int(i), countfhash.count(i)] for i in countfhash]
    new3 = []
    for pair in new2:
        if pair not in new3:
            new3.append(pair)
    new3.sort()
    print("BoW = "+str(new3))

infile.close()

# 6330208721 (26.50) 63 (2021-03-21 11:12)

# Read the input file into `text`: each line loses its newline and gains one
# trailing space.  `line_count` is the number of lines read.
with open(input("File name = "), "r") as sample:
    pieces = [line.strip('\n') + " " for line in sample]
text = ''.join(pieces)
line_count = len(pieces)

# `stopword` is the raw content of stopwords.txt as a single string.
with open('stopwords.txt', "r") as stop:
    stopword = stop.read()

# Punctuation characters that remove_nonal replaces by spaces.
non_alpha = ['(', ')', '-', '_', '[', ']', '"', "'", ';', ':', '>', '<', '.',',','~','^','*','$','#','@','+','=','{','}']
def remove_nonal(a):
    """Return ``a`` with every character listed in ``non_alpha`` replaced by a space, then stripped."""
    replaced = ''.join(" " if ch in non_alpha else ch for ch in a)
    return replaced.strip()
def remove_stop(a):
    """Return the lower-cased words of ``a`` that are not stop words, joined by spaces.

    ``a`` is first passed through ``remove_nonal``.  The module-level
    ``stopword`` is the raw text of the stop-word file, so it is split into
    whole words here; the previous ``i not in stopword`` was a substring
    test and also dropped any word that merely occurs inside a stop word.
    """
    stop_words = set(stopword.lower().split())
    kept = [i for i in remove_nonal(a).lower().split() if i not in stop_words]
    return " ".join(kept)
def bow(a):
    """Return sorted ``[word, count]`` pairs for the lower-cased, whitespace-split words of ``a``."""
    tally = {}
    for token in a.lower().split():
        tally[token] = tally.get(token, 0) + 1
    return sorted([token, count] for token, count in tally.items())
def fhash(w, M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    total = 0
    for position, char in enumerate(w):
        total += ord(char) * 37 ** position
    return total % M
def bow_hash(a,M):
    """Build a hashed bag of words from the text `a`.

    `a` is split on whitespace (the original note says it is expected to be
    already-preprocessed text) and each word is mapped to a bucket with
    `fhash(word, M)`.

    Returns a list of [bucket, count] pairs sorted by bucket.
    """
    counts = {}
    for token in a.split():
        bucket = fhash(token, M)
        counts[bucket] = counts.get(bucket, 0) + 1
    return sorted([bucket, n] for bucket, n in counts.items())

# Ask whether to use feature hashing. The answer is compared against whole
# strings: a substring test such as `yn not in "YyNn"` would also accept the
# empty string and multi-character answers like "Yy".
yn = input("Use feature hashing ? (y,Y,n,N) ")
while yn not in ("Y", "y", "N", "n"):
    print("Try again.")
    yn = input("Use feature hashing ? (y,Y,n,N) ")
if yn in ("Y", "y"):
    M = int(input("M = "))

print("-------------------")
# `text` has one extra space per line, hence the subtraction.
print("char count = " + str(len(text)-line_count))
refine_text = remove_nonal(text).split()
print('alphanumeric count =',len("".join(refine_text)))
print("line count =", line_count)
print("word count =",len(refine_text))
if yn in ("Y", "y"):
    print("BoW =",bow_hash(remove_stop(text),M))
else:
    print("BoW =",bow(remove_stop(text)))

# 6330209321 (22.65) 64 (2021-03-22 21:03)

# Load every whitespace-separated token of stopwords.txt into `stopword`.
with open('stopwords.txt') as f:
    data = f.readlines()
    stopword = []
    for raw_line in data:
        stopword.extend(raw_line.strip().split())
def fhash(w,M):
    """Hash `w` case-insensitively into a bucket.

    Computes the sum of ord(lower(w[i])) * G**i over all positions i and
    returns it modulo `M`.
    """
    G = 37  # base of the polynomial hash
    return sum(ord(ch.lower()) * G**i for i, ch in enumerate(w)) % M

# Lower-case letters and digits: the characters treated as alphanumeric.
# Input characters are lower-cased before being compared against it.
word = "qwertyuiopasdfghjklzxcvbnm0123456789"

with open(input('File name = '),'r',encoding='utf8') as f:
    rawData = f.readlines()
lineCount = len(rawData)

# Lines without their newline; empty lines contribute nothing to the counts.
data = [line.strip('\n') for line in rawData if line.strip('\n') != ""]
# ---------------------------------
charCount = 0
alphaNum = 0
wordCount = 0
betterNoStopWord = []
# ---------------------------------
for line in data:
    for j, ch in enumerate(line):
        if ch.lower() in word:
            alphaNum += 1
        # A word ends where an alphanumeric character is followed by a
        # non-alphanumeric one. The previous character is lower-cased too,
        # otherwise words ending in a capital letter are not counted.
        elif j > 0 and line[j-1].lower() in word:
            wordCount += 1
        charCount += 1
    # A word that runs to the end of the line has no following separator.
    if line[-1].lower() in word:
        wordCount += 1
    # Tokenise the same way the word count does: every non-alphanumeric
    # character is a separator. Stop words are filtered after punctuation is
    # gone, so "the," is recognised as "the" and no empty token is produced.
    spaced = "".join(ch.lower() if ch.lower() in word else " " for ch in line)
    for token in spaced.split():
        if token not in stopword:
            betterNoStopWord.append(token)
# -------------------------------------------------
useFhash = True
M = 0
while True:
    x = input("Use feature hashing ? (y,Y,n,N) ")
    if x in ['y','Y']:
        M = int(input("M = "))
        break
    elif x in ['n','N']:
        useFhash = False
        break
    else:
        print("Try again.")
# -------------------------------------------------
# Count occurrences per key: the hash bucket when hashing, else the word.
counts = {}
for token in betterNoStopWord:
    key = fhash(token, M) if useFhash else token
    counts[key] = counts.get(key, 0) + 1
bow = sorted([key, n] for key, n in counts.items())
# -------------------------------------------------
print("-------------------")
print("char count =",charCount)
print("alphanumeric count =",alphaNum)
print("line count =",lineCount)
print("word count =",wordCount)
print("BoW =",bow)

# 6330210921 (22.35) 65 (2021-03-19 00:14)

# Characters that file_list() replaces with a space before splitting a file
# into words (punctuation plus the newline character).
alp = "!@$%^&*()_+-={}[]:;\"\'<,>.?/\n"
###################################
def count_alpnum(s) :
    """Return how many characters of `s` are ASCII digits or ASCII letters."""
    return sum(
        1
        for ch in s
        if "0" <= ch <= "9" or "a" <= ch <= "z" or "A" <= ch <= "Z"
    )
def file_list(file_name) :
    """Return the lower-cased words of the file `file_name`.

    Every character listed in `alp` is treated as a separator; the remaining
    text is lower-cased and split on whitespace.
    """
    pieces = []
    # The context manager closes the file even if reading fails.
    with open(file_name) as fn :
        for line in fn :
            for ch in line :
                pieces.append(" " if ch in alp else ch.lower())
    return "".join(pieces).split()
def check_str_in_list(str_list,not_in) :
    """Return the items of `str_list`, in order, that do not occur in
    `not_in`."""
    return [item for item in str_list if item not in not_in]
def count_words_in_list(words,lis) :
    """Return how many elements of `lis` are equal to `words`."""
    return sum(1 for item in lis if item == words)
def find_num_flash(s,M) :
    """Return the feature hash of `s`: the sum of ord(s[i]) * 37**i over all
    positions i, modulo `M`."""
    weighted = [ord(ch) * 37**pos for pos, ch in enumerate(s)]
    return sum(weighted) % M
def count_num(n,l) :
    """Return how many elements of `l` are equal to `n`."""
    matches = [item for item in l if item == n]
    return len(matches)
            
###################################
def char_count(file_name) :
    """Return the number of characters in the file, counting each line only
    up to (and excluding) its newline character."""
    total = 0
    with open(file_name) as fn :
        for line in fn :
            # Text before the first newline, or the whole line if none.
            total += len(line.partition("\n")[0])
    return total
def alphanumeric(file_name) :
    """Return the total count_alpnum() value over all lines of the file."""
    with open(file_name) as fn :
        return sum(count_alpnum(line) for line in fn)
def line_count(file_name) :
    """Return the number of lines in the file."""
    with open(file_name) as fn :
        return sum(1 for _ in fn)
def word_count(file_name) :
    """Return the number of whitespace-separated tokens in the file."""
    total = 0
    with open(file_name) as fn :
        for line in fn :
            total += len(line.split())
    return total
def n_bow(file_sample,file_stop) :
    """Return the bag of words of `file_sample` without the words found in
    `file_stop`, as a list of [word, count] pairs in sorted word order."""
    kept = check_str_in_list(file_list(file_sample), file_list(file_stop))
    kept.sort()
    seen = []
    pairs = []
    for w in kept :
        if w in seen :
            continue
        seen.append(w)
        pairs.append([w, count_words_in_list(w, kept)])
    return pairs
def feture_hashing(sample,M) :
    """Return the hashed bag of words of the file `sample`.

    Words found in "stopwords.txt" are removed, every remaining word is
    mapped to a bucket with find_num_flash(word, M), and the result is a list
    of [bucket, count] pairs in ascending bucket order.
    """
    # Read the file the caller asked for; the name used to be hard-coded to
    # "sample.txt", which ignored the `sample` argument.
    words = check_str_in_list(file_list(sample),file_list("stopwords.txt"))
    counts = {}
    for w in words :
        bucket = find_num_flash(w,M)
        counts[bucket] = counts.get(bucket, 0) + 1
    return [[bucket, counts[bucket]] for bucket in sorted(counts)]

###################################
print("File name = ", end="")
file_name = input()

# Keep asking until the answer is exactly one of y, Y, n, N.
func = None
while func not in ("y", "Y", "n", "N") :
    if func is not None :
        print("Try again.")
    print("Use feature hashing ? (y,Y,n,N) ", end="")
    func = input()

wants_hashing = func in ("y", "Y")
if wants_hashing :
    print("M = ", end="")
    M = int(input())

print("-------------------")
# Each statistic is printed as "<label><value>", label first.
for label, measure in (
    ("char count = ", char_count),
    ("alphanumeric count = ", alphanumeric),
    ("line count = ", line_count),
    ("word count = ", word_count),
) :
    print(label, end="")
    print(measure(file_name))

print("BoW = ", end="")
if wants_hashing :
    print(feture_hashing(file_name, M))
else :
    print(n_bow(file_name, "stopwords.txt"))
# 6330211521 (7.63) 66 (2021-03-22 16:51)
# Interactive text-statistics script: asks for a file and whether to use
# feature hashing, then prints character / alphanumeric / line / word counts
# and a bag of words built from the words that are not stop words.
File_name = input('File name = ').strip()
feature_hashing = input('Use feature hashing ? (y, Y, n, N) ')
while feature_hashing not in ['y', 'Y', 'n', 'N'] :
    print("Try again")
    feature_hashing = input('Use feature hashing ? (y, Y, n, N) ')
use_hashing = feature_hashing in ('y', 'Y')
if use_hashing :
    # Convert once, so an invalid M fails before any file is processed.
    M = int(input('M = '))

# Characters replaced by a space before text is split into words.
c = "\"\'/\\,.:;"

char_count = 0
alphanumeric_count = 0
line_count = 0
pieces = []
with open(File_name, "r") as fin :
    for line in fin :
        line_count += 1
        # Count the line without its newline. Subtracting (line_count - 1)
        # from the raw total was off by one when the file ended with a
        # newline, and reported 1 for an empty file.
        char_count += len(line.rstrip('\n'))
        for ch in line :
            if 'a'<=ch<='z' or 'A'<=ch<='Z' or '0'<=ch<='9' :
                alphanumeric_count += 1
            pieces.append(' ' if ch in c else ch)
sample = ''.join(pieces)
word_count = len(sample.split())

with open("stopwords.txt", "r") as fin_1 :
    stopword = ''.join(' ' if ch in c else ch for ch in fin_1.read()).split()
stop_set = set(stopword)

# Lower-cased words of the sample that are not stop words.
sample1_1 = [w for w in sample.lower().split() if w not in stop_set]

# [word, frequency] pairs in order of first appearance.
counts = {}
for w in sample1_1 :
    counts[w] = counts.get(w, 0) + 1
BoW2 = [[w, n] for w, n in counts.items()]

print("-------------------")
print("char_count =", char_count)
print("alphanumeric_count =", alphanumeric_count)
print("line_count =", line_count)
print("word_count =", word_count)
if not use_hashing :
    print("BoW =", BoW2)
else :
    # [bucket, frequency] pairs in order of first appearance of the bucket.
    hashed = {}
    for w, n in BoW2 :
        # The accumulator restarts for every word; previously it carried over
        # from one word to the next and corrupted every hash after the first.
        total = 0
        for j, ch in enumerate(w) :
            total += ord(ch)*(37**j)
        bucket = total % M
        # Add the word's frequency, not 1, so a bucket counts occurrences.
        hashed[bucket] = hashed.get(bucket, 0) + n
    BoW4 = [[bucket, n] for bucket, n in hashed.items()]
    print("BoW =", BoW4)
            
            

# 6330212121 (25.20) 67 (2021-03-22 14:40)
k='yYnN'
#p=' .,:;?()[]\"\' -_\\!'
engandmath='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
p = '''!()-[]{};:'"\,<>./?@#$%^&*_~ '''
p2 = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
def fhash(w,m):
    """Return the feature hash of `w`: sum of ord(w[i]) * 37**i, modulo `m`."""
    acc = 0
    for pos, ch in enumerate(w):
        acc += ord(ch) * (37 ** pos)
    return acc % m
def bow(f_name,m):
    """Print the bag of words of the file `f_name`.

    Words are lower-cased, split on whitespace and on the characters in `p2`,
    and words listed in "stopwords.txt" are dropped.

    m == -1 prints sorted [word, count] pairs; any other `m` prints sorted
    [fhash(word, m), count] pairs. Returns None.
    """
    # Context managers close both files even if reading fails part-way.
    with open('stopwords.txt', "r") as infile:
        stopwords = {e for line in infile for e in line.split()}

    pieces = []
    with open(f_name, "r") as infile:
        for line in infile:
            for e in line.strip().lower():
                pieces.append(' ' if e in p2 else e)
            # Separate the last word of this line from the next line's first.
            pieces.append(' ')
    n_word = [e for e in ''.join(pieces).split() if e not in stopwords]

    # Single counting pass keyed by the word itself or by its hash bucket.
    counts = {}
    for e in n_word:
        key = e if m == -1 else fhash(e, m)
        counts[key] = counts.get(key, 0) + 1
    print('BoW =', sorted([key, c] for key, c in counts.items()))
def easy(f_name):
    """Print the separator line followed by the char, alphanumeric, line and
    word counts of the file `f_name`.

    Lines are stripped before counting; characters in `p2` act as word
    separators; alphanumeric characters are those found in `engandmath`.
    """
    total_chars = 0
    total_alnum = 0
    total_lines = 0
    total_words = 0
    infile = open(f_name, "r")
    for raw in infile:
        stripped = raw.strip()
        total_chars += len(stripped)
        total_lines += 1
        spaced = ''.join(' ' if ch in p2 else ch for ch in stripped)
        total_words += len(spaced.split())
        total_alnum += sum(1 for ch in stripped if ch in engandmath)
    infile.close()

    print('-------------------')
    print('char count =',total_chars)
    print('alphanumeric count =',total_alnum)
    print('line count =',total_lines)
    print('word count =',total_words)
def main():
    """Ask for the file name and hashing choice, then print the statistics
    (via easy) and the bag of words (via bow)."""
    file_name=input('File name = ')
    method=input('Use feature hashing ? (y,Y,n,N) ')
    # Compare against whole answers. A substring test on 'yYnN' also accepts
    # the empty string and answers such as "yY".
    while method not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        method=input('Use feature hashing ? (y,Y,n,N) ')

    # -1 tells bow() to count plain words instead of hash buckets.
    m = int(input('M= ')) if method in ('y', 'Y') else -1
    easy(file_name)
    bow(file_name,m)






# Run the interactive program as soon as this code is executed.
main()


# 6330213821 (15.00) 68 (2021-03-22 23:59)
def fhash(a,m):
    """Return the feature hash of the string `a`: the sum of
    ord(a[i]) * 37**i over all positions i, modulo `m`."""
    # The accumulator must start at 0 and the loop must run over `a`; the
    # earlier version read both `b` and `d` before assigning them and raised
    # on every call.
    d = 0
    for i in range(len(a)):
        d += ord(a[i]) * 37**i
    return d % m
file_name = input('File name = ')
j = input('Use feature hashing ? (y,Y,n,N) ')
# Repeat until the answer is valid (no fixed cap on the number of retries).
while j not in ['y','Y','n','N']:
    print('try again.')
    j = input('Use feature hashing ? (y,Y,n,N) ')

# `j == 'n' or 'N'` is always true because 'N' is a non-empty string, so M
# was never requested. Test the answer explicitly instead.
if j in ('y', 'Y'):
    m = int(input('M = '))
def stop_word(file):
    """Return the whitespace-separated words of 'stopwords.txt' as a list.

    The `file` argument is not used; it is kept so existing calls still work.
    """
    # The context manager closes the file; the earlier close() call sat after
    # the return statement and never ran.
    with open('stopwords.txt', 'r') as sw:
        return sw.read().split()
a = 0
wc = 0
d = []
# `f` stays open here; it is closed further down, just before the report.
f = open(file_name, 'r')
# n is the number of lines in the input file.
n = sum(1 for _ in f)
def char_count(file_name):
    """Return the number of characters in the file, excluding the newline
    that ends each line."""
    total = 0
    with open(file_name) as sw:
        for line in sw:
            # Strip the newline per line. The former "chars - lines + 1"
            # formula over-counted by one when the file ended with a newline
            # and returned 1 for an empty file.
            total += len(line.rstrip('\n'))
    return total
def rmsw(n):
    """Unfinished placeholder: opens the input file and closes it again.

    `n` is not used and nothing is returned.
    """
    # `f.close` without parentheses never closed the file; the context
    # manager does.
    with open(file_name, 'r'):
        pass
    

    
def word_count(n):
    """Return the number of whitespace-separated words in the input file
    (the module-level `file_name`).

    `n` is not used; it is kept so existing calls still work.
    """
    wc = 0
    with open(file_name, 'r') as f:
        for line in f:
            # Count the words of every line. Calling f.readline() inside the
            # loop skipped every other line, and the old total added the
            # number of lines seen so far rather than the words of the line.
            wc += len(line.split())
    return wc
def ac(file_name):
    """Return the number of alphanumeric characters (ASCII letters of either
    case and ASCII digits) in the file."""
    a = 0
    # The context manager closes the file; the earlier close() call sat after
    # the return statement and never ran.
    with open(file_name, 'r') as f:
        for line in f:
            for e in line:
                if e.lower() in 'abcdefghijklmnopqrstuvwxyz':
                    a += 1
                if e in '0123456789':
                    a += 1
    return a
    
    
    
    

f.close()
# Print the statistics in a fixed order; each value is computed just before
# its line is printed.
for label, compute in (
    ('char count =', lambda: char_count(file_name)),
    ('alphanumeric count =', lambda: ac(file_name)),
    ('line count =', lambda: n),
    ('word count =', lambda: word_count(n)),
):
    print(label, compute())
# NOTE(review): the bag of words itself is not computed; only the label is
# printed.
print('BoW =')













    

    
# 6330214421 (26.00) 69 (2021-03-21 17:32)
def charcount(a):
    """Return the total length of all lines of file `a`, each line stripped
    of leading and trailing whitespace first."""
    with open(a,"r") as infile:
        return sum(len(line.strip()) for line in infile)
def alcount(a):
    """Return the number of ASCII letters and digits in file `a`."""
    total = 0
    with open(a,"r") as infile:
        for line in infile:
            for ch in line.strip():
                is_letter = "a" <= ch <= "z" or "A" <= ch <= "Z"
                if is_letter or "0" <= ch <= "9":
                    total += 1
    return total
def linecount(a):
    """Return the number of lines in file `a`."""
    with open(a,"r") as infile:
        return len(infile.readlines())
def wordcount(a):
    """Return the number of words in file `a`, where a word is a maximal run
    of ASCII letters and digits."""
    def keep(ch):
        # True for the characters that may be part of a word.
        return "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9"

    total = 0
    with open(a,"r") as infile:
        for line in infile:
            spaced = "".join(ch if keep(ch) else " " for ch in line.strip())
            total += len(spaced.split())
    return total
def fhash(a,f):
    """Return the feature hash of `a`: sum of ord(a[i]) * 37**i, modulo `f`."""
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(a)) % f

file_name = input("File name = ")
b = input("Use feature hashing ? (y,Y,n,N) ")
while b not in ["y","Y","n","N"]:
    print("Try again.")
    b = input("Use feature hashing ? (y,Y,n,N) ")
hashing = b in ["Y","y"]
if hashing:
    m = int(input("M = "))

print("-------------------")
print("char count =",charcount(file_name))
print("alphanumeric count =",alcount(file_name))
print("line count =",linecount(file_name))
print("word count =",wordcount(file_name))

# Stop words: every whitespace-separated token of stopwords.txt.
with open("stopwords.txt" , "r") as x1:
    y1 = {w for line in x1 for w in line.split()}

# Lower-cased words of the input: maximal runs of letters and digits.
o = []
with open(file_name , "r") as x2:
    for line in x2:
        spaced = "".join(
            ch if ("a" <= ch <= "z" or "0" <= ch <= "9") else " "
            for ch in line.strip().lower()
        )
        o.extend(spaced.split())

# Count the words that are not stop words, keyed by the word itself or by its
# hash bucket. A dictionary also covers the case where nothing is left; the
# earlier adjacent-pairs scan indexed the last element of an empty list.
counts = {}
for w in o:
    if w in y1:
        continue
    key = fhash(w,m) if hashing else w
    counts[key] = counts.get(key, 0) + 1
print("BoW =",sorted([key, c] for key, c in counts.items()))
# 6330215021 (30.00) 70 (2021-03-22 05:06)

def cha_count(x):
    """Return the combined length of all items in `x`."""
    return sum(len(item) for item in x)
    
def alpha_count(x):
    """Return how many characters across the strings in `x` are digits or
    lower-case ASCII letters."""
    allowed = "0123456789" + "abcdefghijklmnopqrstuvwxyz"
    total = 0
    for text in x:
        total += sum(1 for ch in text if ch in allowed)
    return total
def create_y(x):
    """Split every string in `x` into words made only of digits and
    lower-case ASCII letters; any other character is a separator.

    Returns one flat list with the words of all strings, in order.
    """
    allowed = "0123456789" + "abcdefghijklmnopqrstuvwxyz"
    found = []
    for text in x:
        spaced = "".join(ch if ch in allowed else " " for ch in text)
        found.extend(spaced.split())
    return found
            
def Fhash(w, M):
    """Return the feature hash of `w`: sum of 37**i * ord(w[i]), modulo `M`."""
    total = 0
    power = 1  # 37**i for the current position
    for ch in w:
        total += power * ord(ch)
        power *= 37
    return total % M
def BOW(x, d, M):
    """Build a bag of words from the word list `x`.

    d == "y": words are hashed with Fhash(word, M); the result is a list of
    [bucket, count] pairs in ascending bucket order.
    otherwise: the result is a list of [word, count] pairs in order of first
    appearance.
    """
    counts = {}
    if d == "y":
        # Count per bucket in one pass. Scanning range(M) and calling
        # list.count for every bucket made the cost grow with M itself.
        for w in x:
            bucket = Fhash(w, M)
            counts[bucket] = counts.get(bucket, 0) + 1
        return [[bucket, counts[bucket]] for bucket in sorted(counts)]
    for w in x:
        counts[w] = counts.get(w, 0) + 1
    return [[w, c] for w, c in counts.items()]
    
file_name = input("File name = ")
# NOTE: `fhash` holds the lower-cased y/n answer here, not a hash function.
fhash = input("Use feature hashing ? (y,Y,n,N) ").lower()
while fhash not in ['y','n']:
    print("Try again.")
    fhash = input("Use feature hashing ? (y,Y,n,N) ").lower()
M = 1
if fhash == "y":
    M = int(input("M = "))

# Each file is closed as soon as it has been read, even if reading fails.
with open("stopwords.txt", "r") as fn:
    stopwords = [c.lower().strip() for c in fn]
# Flat list of every stop word.
lsw = [k for i in stopwords for k in i.split()]

with open(file_name, "r") as file:
    # One lower-cased, stripped string per line of the input file.
    words = [c.lower().strip() for c in file]
lw = [k for i in words for k in i.split()]

# Words of the input (letters and digits only) that are not stop words.
y = create_y(lw)
lnsw = [i for i in y if i not in lsw]

print("-------------------")    
print("char count =", cha_count(words))     
print("alphanumeric count =", alpha_count(words))
print("line count =", len(words))
print("word count =", len(create_y(words)))
print("BoW =", sorted(BOW(lnsw, fhash, M)))








# 6330216721 (30.00) 71 (2021-03-22 23:58)

# Switches: enable the self-checks / run the interactive program.
debug_assertion = False
run = True

def debug_assert(exp):
    """
    Assert `exp` (or the result of calling it) when `debug_assertion` is on;
    do nothing otherwise.

    :type exp: bool | () -> bool
    """
    if debug_assertion:
        outcome = exp() if callable(exp) else exp
        assert outcome

def is_lower_alnum(w: str, exclude_empty=True):
    """Return True when every character of `w` is a lower-case ASCII letter
    or a digit. The empty string is rejected unless `exclude_empty` is
    false."""
    if exclude_empty and w == '':
        return False
    for c in w:
        if not ('a' <= c <= 'z' or '0' <= c <= '9'):
            return False
    return True

def fhash(w: str, m: int, g=37):
    """Return the feature hash of `w`: sum of ord(w[n]) * g**n, modulo `m`.

    Each power of `g` is reduced modulo `m` before it is multiplied in.
    """
    debug_assert(lambda: is_lower_alnum(w))
    total = 0
    for n, c in enumerate(w):
        total += ord(c) * pow(g, n, m)
    return total % m

def tests():
    """Self-checks for fhash and word_split; each check goes through
    debug_assert."""
    hash_cases = [
        ('shane', 3),
        ('football', 3),
        ('team', 3),
        ('likes', 0),
        ('big', 2),
        ('fan', 1),
        ('arsenal', 2),
    ]
    for text, expected in hash_cases:
        debug_assert(lambda text=text, expected=expected: fhash(text, 4) == expected)

    split_cases = [
        ("", []),
        ("a", ['a']),
        ("ab c", ['ab', 'c']),
        ("ab cdef1 g2 3 4", ['ab', 'cdef1', 'g2', '3', '4']),
        ('Abc:a18 ("Okay")', ['Abc', 'a18', 'Okay']),
    ]
    for text, expected in split_cases:
        debug_assert(lambda text=text, expected=expected: word_split(text) == expected)

    # Disabled check (needs an "empty.txt" fixture):
    # debug_assert(lambda: process("empty.txt", None, []) == (0,0,0,0,[]))


def handle_input():
    """Prompt for the file name and the hashing choice.

    Returns (file_name, m), where m is the integer entered for M when hashing
    was chosen and None otherwise.
    """
    file_name = input("File name = ")

    should_feature_hash = None
    while should_feature_hash is None:
        answer = input("Use feature hashing ? (y,Y,n,N) ").lower()
        if answer == "y":
            should_feature_hash = True
        elif answer == "n":
            should_feature_hash = False
        else:
            print("Try again.")

    if should_feature_hash:
        m = int(input("M = "))
    else:
        m = None

    return file_name, m

def read_stopwords():
    """Return every whitespace-separated token of "stopwords.txt",
    lower-cased, as a list."""
    with open("stopwords.txt", "r") as file:
        return [token.lower() for line in file for token in line.split()]

def word_split(string: str):
    """Return the maximal runs of alphanumeric characters in `string`, in
    order of appearance."""
    words = []
    current = ""
    for c in string:
        if c.isalnum():
            current += c
        elif current:
            # A separator closes the word collected so far.
            words.append(current)
            current = ""
    if current:
        words.append(current)
    return words

def process(file_name: str, m, stopwords):
    """
    Compute the statistics and bag of words of a text file.

    Returns (char_count, alphanum_count, line_count, word_count, bow).
    char_count excludes newline characters. bow is a list of [key, count]
    pairs in order of first appearance; the key is the lower-cased word, or
    fhash(word, m) when `m` is not None. Words found in `stopwords` are left
    out of bow but are still counted in word_count.

    :type m: int | None
    :type stopwords: list[str]
    """
    line_count = 0
    char_count = 0
    alphanum_count = 0
    word_count = 0
    # Dictionaries keep insertion order, so first-appearance order is kept.
    counts = {}

    # The file is read once; lines are counted in the same pass.
    with open(file_name, "r") as file:
        for line in file:
            line_count += 1
            char_count += len(line.replace('\n', ''))
            alphanum_count += sum(c.isalnum() for c in line)

            words = word_split(line)
            word_count += len(words)
            for w in words:
                word = w.lower()
                if word in stopwords:
                    continue
                key = word if m is None else fhash(word, m)
                counts[key] = counts.get(key, 0) + 1

    bow = [[key, n] for key, n in counts.items()]
    return char_count, alphanum_count, line_count, word_count, bow

def print_info(char_count, alphanum_count, line_count, word_count, bow):
    """Print the four counts and the bag of words, one per line.

    `bow` is sorted in place by descending count before it is printed.
    """
    bow.sort(key=lambda pair: -pair[1])
    labels = ("char count", "alphanumeric count", "line count", "word count", "BoW")
    values = (char_count, alphanum_count, line_count, word_count, bow)
    print("\n".join("{} = {}".format(label, value) for label, value in zip(labels, values)))

# Run the self-checks only when the debug switch is on.
if debug_assertion:
    tests()

# Interactive run: read the inputs, then print the separator and the report.
if run:
    file_name, m = handle_input()
    stopwords = read_stopwords()

    print("-------------------")
    info = process(file_name, m, stopwords)
    print_info(*info)

# 6330217321 (30.00) 72 (2021-03-22 21:25)

def fhash(w, M):
    """Return the feature hash of `w`: sum of ord(w[i]) * 37**i, modulo `M`."""
    terms = (ord(ch) * 37 ** idx for idx, ch in enumerate(w))
    return sum(terms) % M
def bow(w):
    """Build a bag of words from the text `w`.

    The text is lower-cased and every character other than a digit or a
    lower-case ASCII letter is a separator. Words found in the module-level
    `stopword` collection are dropped.

    Returns a list of [word, count] pairs sorted by word.
    """
    spaced = "".join(
        e if ("0" <= e <= "9" or "a" <= e <= "z") else " " for e in w.lower()
    )
    counts = {}
    for token in spaced.split():
        if token not in stopword:
            # Count whole words. Calling str.count(token) on the full text
            # also counted the token inside longer words ("he" in "the").
            counts[token] = counts.get(token, 0) + 1
    return sorted([token, fre] for token, fre in counts.items())
def bhash(w, M):
    """Build a hashed bag of words from the text `w`.

    The text is lower-cased and split on every character that is not a digit
    or lower-case ASCII letter; words in `stopword` are dropped; each
    remaining word is mapped to fhash(word, M).

    Returns a list of [bucket, count] pairs sorted by bucket.
    """
    spaced = "".join(
        e if ("0" <= e <= "9" or "a" <= e <= "z") else " " for e in w.lower()
    )
    hashes = [fhash(tok, M) for tok in spaced.split() if tok not in stopword]
    pairs = []
    for h in hashes:
        pair = [h, hashes.count(h)]
        if pair not in pairs:
            pairs.append(pair)
    pairs.sort()
    return pairs

print("File name = ", end = "")
file_name = input()

# Stop words: every whitespace-separated token of the file, lower-cased.
with open("stopwords.txt", "r") as stop:
    stopword = stop.read().lower().split()

char = 0   # characters, newlines excluded
al = 0     # digits and letters (counted after lower-casing)
word = 0   # maximal runs of digits/letters
alll = ""  # every whitespace-separated token, each followed by one space
li = []    # the lower-cased lines
with open(file_name, "r") as fn:
    for line in fn:
        line = line.lower()
        alll += "".join(token + " " for token in line.split())
        spaced = []
        for e in line:
            if e != "\n":
                char += 1
            if "0" <= e <= "9" or "a" <= e <= "z":
                al += 1
                spaced.append(e)
            else:
                # Anything else, including space and newline, is a separator.
                spaced.append(" ")
        word += len("".join(spaced).split())
        li.append(line)

print("Use feature hashing ? (y,Y,n,N) ", end = "")
f = input()

# Compare against whole answers. A substring test on "y Y n N" also accepted
# "", " " and "y Y", after which no output was produced at all.
while f not in ("y", "Y", "n", "N"):
    print("Try again.")
    print("Use feature hashing ? (y,Y,n,N) ", end = "")
    f = input()

if f in ("y", "Y"):
    print("M = ", end = "")
    M = int(input())

print("-------------------")
print("char count =",char)
print("alphanumeric count =",al)
print("line count =",len(li))
print("word count =",word)
if f in ("y", "Y"):
    print("BoW =",bhash(alll, M))
else:
    print("BoW =",bow(alll))
# 6330219621 (12.00) 73 (2021-03-21 16:48)
def stopwords() :
    """Return every whitespace-separated token of "stopwords.txt" as a list."""
    with open("stopwords.txt","r") as spfile :
        return [s for line in spfile for s in line.split()]
def readfile(file_name) :
    """Return the lines of the file as a list, each stripped of leading and
    trailing whitespace."""
    with open(file_name,"r") as file :
        return [line.strip() for line in file]
def cutsymbol(word_list) :
    """Return the lower-cased words found in the strings of `word_list`.

    Each string is stripped, the listed symbols are replaced by spaces, and
    the text is split on whitespace.
    """
    symbols = '\\\"^&*(){}[]<>?%$#@!+-=_|.,:;/\'~'
    pieces = []
    for e in word_list :
        for k in e.strip() :
            pieces.append(' ' if k in symbols else k.lower())
        # Separate consecutive strings; without this the last word of one
        # line merged with the first word of the next.
        pieces.append(' ')
    return ''.join(pieces).split()
def count(word_list) :
    """Return the combined length of all items in `word_list`."""
    return sum(len(w) for w in word_list)
def cutstop(word_list) :
    """Return the items of `word_list`, in order, that are not stop words."""
    # Load the stop words once; calling stopwords() inside the loop re-read
    # the file for every single word.
    stop_set = set(stopwords())
    return [w for w in word_list if w not in stop_set]
def fhash(word_list,M) :
    """Return the feature hash of every word in `word_list`, in order.

    A word's hash is the sum of ord(char) * 37**position, modulo `M`.
    """
    return [
        sum(ord(ch) * (37**pos) for pos, ch in enumerate(e)) % M
        for e in word_list
    ]
def B_O_W(word_list) :
    """Return [item, count] pairs for `word_list`, ordered by item.

    `word_list` is sorted in place. An empty list gives an empty result.
    """
    word_list.sort()
    # Nothing to count; indexing the last element below would fail.
    if not word_list :
        return []
    BOW = list()
    run = 1  # length of the current run of equal items
    for i in range(len(word_list)-1) :
        if word_list[i] == word_list[i+1] :
            run += 1
        else :
            BOW.append([word_list[i],run])
            run = 1
    # The final run is closed after the loop.
    BOW.append([word_list[-1],run])
    return BOW
def main() :
    """Ask for the file name and hashing choice, then print the char,
    alphanumeric, line and word counts and the bag of words."""
    file_name = input("File name = ").strip()
    # Read and tokenise the file once; the values are reused for every count.
    lines = readfile(file_name)
    words = cutsymbol(lines)
    relist = cutstop(words)
    feature = input("Use feature hashing ? (y,Y,n,N) ").strip()
    while feature not in ['y','Y','n','N'] :
        print("Try again")
        feature = input("Use feature hashing ? (y,Y,n,N) ").strip()
    if feature in ['y','Y'] :
        M = int(input("M = "))
        relist = fhash(relist,M)
    print("-------------------")
    print("char count =",count(lines))
    print("alphanumeric count =",count(words))
    print("line count =",len(lines))
    print("word count =",len(words))
    print("BoW =",B_O_W(relist))
# Run the interactive program as soon as this code is executed.
main()
# 6330221821 (25.00) 74 (2021-03-18 20:46)

def stopw(wordlist):
    """Return the items of `wordlist`, in order, that are not listed in
    'stopwords.txt'."""
    file = open('stopwords.txt', 'r')
    stop_list = [token for line in file for token in line.split()]
    file.close()
    return [e for e in wordlist if e not in stop_list]

def read(name):
    """Read the file `name`.

    Returns (text, line_count): `text` is the file content with every
    character other than an ASCII letter or digit replaced by a space
    (newlines included), and `line_count` is the number of lines.
    """
    keep = ('abcdefghijklmnopqrstuvwxyz'
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            '0123456789')
    linec = 0
    pieces = []
    with open(name, 'r') as file:
        for line in file:
            linec += 1
            pieces.extend(e if e in keep else ' ' for e in line)
    return ''.join(pieces), linec

def counter(string, linec):
    """Return (char count, alphanumeric count, word count) for `string`.

    The char count is len(string) - linec + 1; the alphanumeric count is the
    number of non-whitespace characters; the word count is the number of
    whitespace-separated tokens.
    """
    tokens = string.split()
    char_total = len(string) - linec + 1
    alnum_total = sum(len(token) for token in tokens)
    return char_total, alnum_total, len(tokens)

def bow(string):
    """Return the bag of words of `string` as [word, count] pairs sorted by
    word.

    The text is lower-cased and split on whitespace, and stop words are
    removed with stopw(). An empty result is returned when no word is left.
    """
    counts = {}
    for word in stopw(string.lower().split()):
        counts[word] = counts.get(word, 0) + 1
    # A dictionary also covers the case where nothing is left; the earlier
    # adjacent-pairs scan indexed the last element of an empty list.
    return sorted([word, n] for word, n in counts.items())

def fhash(word, M):
    """Return the feature hash of `word`: sum of ord(word[i]) * 37**i,
    modulo `M`."""
    total = 0
    for position, ch in enumerate(word):
        total += ord(ch) * (37 ** position)
    return total % M

def bowfhash(string, M):
    """Return the hashed bag of words of `string` as [bucket, count] pairs
    sorted by bucket.

    The text is lower-cased and split on whitespace, stop words are removed
    with stopw(), and each word is mapped to fhash(word, M). An empty result
    is returned when no word is left.
    """
    counts = {}
    for word in stopw(string.lower().split()):
        bucket = fhash(word, M)
        counts[bucket] = counts.get(bucket, 0) + 1
    # A dictionary also covers the case where nothing is left; the earlier
    # adjacent-pairs scan indexed the last element of an empty list.
    return sorted([bucket, n] for bucket, n in counts.items())
def main(file_name):
    """Ask whether to hash, then print the statistics and BoW of a file.

    Args:
        file_name: path of the text file to analyse.
    """
    y = input('Use feature hashing ? (y,Y,n,N) ')
    # Compare against the four exact answers. A substring test such as
    # ``y not in 'NnYy'`` also lets '' and 'Nn' through.
    while y not in ('N', 'n', 'Y', 'y'):
        print('Try again.')
        y = input('Use feature hashing ? (y,Y,n,N) ')
    use_hash = y in ('Y', 'y')
    if use_hash:
        M = int(input('M = '))
    string, linec = read(file_name)
    charc, alpnumc, wordc = counter(string, linec)
    print('-------------------')
    print('char count =',charc)
    print('alphanumeric count =',alpnumc)
    print('line count =',linec)
    print('word count =',wordc)
    if use_hash:
        print('BoW =', bowfhash(string, M))
    else:
        print('BoW =', bow(string))


# Script entry point: ask for the input file, then run the report on it.
file_name = input('File name = ')
main(file_name)
# 6330222421 (15.60) 75 (2021-03-21 11:00)

# Ask for the input file and the hashing choice, then gather the raw
# statistics and the sorted, stop-word-free word list ``bow``.
file_name = input("File name = ")
feature = input("Use feature hashing ? (y,Y,n,N) ")

# Compare against the four exact answers; a substring test such as
# ``feature in 'Yy'`` would also accept '' and 'Yy'.
while feature not in ('y', 'Y', 'n', 'N'):
    print("Try again")
    feature = input("Use feature hashing ? (y,Y,n,N) ")

check = feature in ('y', 'Y')
if check:
    M = int(input("M = "))

# ASCII letters (both cases) and digits.
ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

wordd = ''
char = []
line_count = 0

with open(file_name, "r") as fin:
    for line in fin:
        line_count += 1

        # Keep alphanumerics (lower-cased); anything else separates words.
        for e in line:
            if e in ALNUM:
                wordd += e.lower()
            else:
                wordd += ' '

        # Only the line terminator is excluded from the character count;
        # leading and trailing blanks are real characters.
        char += line.rstrip('\n')

word = wordd.split()

words_count = len(word)
char_count = len(char)

# Count every alphanumeric character, upper case and the very last
# character included.
alpha_count = sum(1 for c in char if c in ALNUM)

with open('stopwords.txt', "r") as fin:
    stp = fin.read().split()

# Built after the whole stop-word file is read, so it exists (and is
# correct) even when that file is empty.
bow = sorted(i for i in word if i not in stp)

def fhash(bow,M) :
    """Hash the word ``bow`` to ``sum(ord(bow[i]) * 37**i) % M``."""
    total = 0
    # Horner's scheme from the last character back yields the same sum.
    for ch in reversed(bow) :
        total = total * 37 + ord(ch)
    return total % M
def BoW(bow) :
    """Count runs of equal neighbours in the sorted list ``bow``.

    Args:
        bow: sorted list of words.

    Returns:
        A list of ``[word, count]`` pairs in the order of ``bow``. An empty
        input yields an empty list (previously ``IndexError``).
    """
    b = []
    for item in bow :
        # Equal items are adjacent in a sorted list.
        if b and b[-1][0] == item :
            b[-1][1] += 1
        else :
            b.append([item, 1])
    return b
            
def ffhash(bow,M) :
    """Hash every word of ``bow`` with ``fhash`` and count equal hashes.

    Args:
        bow: list of words.
        M: modulus passed on to ``fhash``.

    Returns:
        A list of ``[hash, count]`` pairs in ascending hash order. An empty
        input yields an empty list (previously ``IndexError``).
    """
    f = sorted(fhash(i, M) for i in bow)
    b = []
    for code in f :
        # Equal hashes are adjacent after sorting.
        if b and b[-1][0] == code :
            b[-1][1] += 1
        else :
            b.append([code, 1])
    return b

# Choose the hashed or the plain bag of words, then print the report.
bbbb = ffhash(bow,M) if check == True else BoW(bow)

for label, value in (
    ("char count = ", char_count),
    ("alphanumeric count = ", alpha_count),
    ("line count = ", line_count),
    ("word count = ", words_count),
    ("BoW = ", bbbb),
) :
    if label == "char count = " :
        print("-------------------")
    print(label , value)
# 6330223021 (30.00) 76 (2021-03-21 18:50)

def read_stopwords():
    """Return every whitespace-separated token of ``stopwords.txt``.

    The file is read from the current directory and is closed even when
    reading fails.
    """
    with open('stopwords.txt', 'r') as file:
        return file.read().split()
def read_file(file_name):
    """Return the text of ``file_name`` prepared for word splitting.

    Each line is stripped and followed by one space, every character for
    which ``str.isalnum`` is false is replaced by a space, and the result
    is lower-cased.

    Args:
        file_name: path of the text file.

    Returns:
        The cleaned, lower-cased text as a single string.
    """
    with open(file_name, 'r') as file:
        temp = ''.join(line.strip() + ' ' for line in file)
    words = ''.join(ch if ch.isalnum() else ' ' for ch in temp)
    return words.lower()
def alphanum_count(file_name):
    """Count the alphanumeric characters in the cleaned text of the file."""
    return sum(1 for ch in read_file(file_name) if ch.isalnum())
def line_count(file_name):
    """Return the number of lines in ``file_name``.

    The file is closed even when reading fails.
    """
    with open(file_name, 'r') as file:
        return sum(1 for _ in file)
def char_count(file_name):
    """Return the number of characters in ``file_name``, newlines excluded.

    The file is closed even when reading fails.
    """
    with open(file_name, 'r') as file:
        # A line read from a text file ends with at most one '\n'.
        return sum(len(line.rstrip('\n')) for line in file)
def main(file_name):
    """Print the separator line and the four statistics of ``file_name``."""
    print('-------------------')
    # Each statistic is computed right before it is printed, in this order.
    report = (
        ('char count =', char_count),
        ('alphanumeric count =', alphanum_count),
        ('line count =', line_count),
        ('word count =', lambda name: len(read_file(name).split())),
    )
    for label, compute in report:
        print(label, compute(file_name))
def words_nostop(file_name):
    """Return the words of the file without stop words, joined by spaces.

    Args:
        file_name: path of the text file.

    Returns:
        One string holding the surviving words in file order, separated by
        single spaces.
    """
    # Read the stop-word file once instead of once per word.
    stop_words = set(read_stopwords())
    words = [i for i in read_file(file_name).split() if i not in stop_words]
    return ' '.join(words)
def BoW(file_name):
    """Return the sorted ``[word, count]`` pairs of the file's non-stop words."""
    tally = {}
    for word in words_nostop(file_name).split():
        tally[word] = tally.get(word, 0) + 1
    pairs = [[word, count] for word, count in tally.items()]
    pairs.sort()
    return pairs
def fhash(w, M):
    """Hash ``w`` to ``sum(ord(w[i]) * 37**i) % M``."""
    total = 0
    power = 1  # 37**i for the current position
    for ch in w:
        total += ord(ch) * power
        power *= 37
    return total % M
def BoW_fhash(file_name, M):
    """Return the sorted ``[hash, count]`` pairs of the file's non-stop words.

    Args:
        file_name: path of the text file.
        M: modulus passed on to ``fhash``.

    Returns:
        A list of ``[hash, count]`` pairs in ascending hash order.
    """
    # Tokenise once; the files used to be re-read for every word index.
    words = words_nostop(file_name).split()
    tally = {}
    for word in words:
        code = fhash(word, M)
        tally[code] = tally.get(code, 0) + 1
    return sorted([code, count] for code, count in tally.items())

#-----------------------------------------------------

# Script entry point: read the file name and the hashing choice, then
# print the statistics followed by the requested bag of words.
file_name = input('File name = ')
use_fh = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against the four exact answers; ``use_fh not in 'nNyY'`` is a
# substring test and lets '' or 'nN' through.
while use_fh not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    use_fh = input('Use feature hashing ? (y,Y,n,N) ')
if use_fh in ('y', 'Y'):
    M = int(input('M = '))
    main(file_name)
    print('BoW =', BoW_fhash(file_name, M))
else:
    main(file_name)
    print('BoW =', BoW(file_name))
# 6330224721 (29.00) 77 (2021-03-21 21:38)

# Read the file name and the hashing choice (plus M when hashing).
file_name = input("File name = ")
method = input("Use feature hashing ? (y,Y,n,N) ")
# Compare against the four exact answers; ``method not in "yYnN"`` is a
# substring test and lets '' or 'Yn' through.
while method not in ("y", "Y", "n", "N"):
    print("Try again.")
    method = input("Use feature hashing ? (y,Y,n,N) ")
if method in "yY":
    M = int(input("M = "))
print("-------------------")

#===================================================
# Stop words: every whitespace-separated token of stopwords.txt.
stopWords = []
with open("stopwords.txt", "r") as infile:
    for line in infile:
        stopWords += line.split()

#===================================================
# One pass over the input file gathers all counts and the word list.
charCount = 0
alphaCount = 0
lineCount = 0
wordList = []

# ``with`` closes the input file, which was previously left open.
with open(file_name, "r") as infile2:
    for line in infile2:
        lineCount += 1
        # Only the line terminator is excluded, so the count is right
        # whether or not the last line ends with a newline.
        charCount += len(line.rstrip('\n'))
        cleaned = ''
        for e in line:
            if ('0' <= e <= '9') or ('a' <= e <= 'z') or ('A' <= e <= 'Z'):
                alphaCount += 1
                cleaned += e.lower()
            else:
                # Any other character separates words.
                cleaned += ' '
        wordList += cleaned.split()

wordCount = len(wordList)


#========Compute BoW========
# Drop the stop words, keeping file order.
finishCutWordList = []
for candidate in wordList:
    if candidate not in stopWords:
        finishCutWordList.append(candidate)

# ``items`` is what gets counted: hash indices or the words themselves.
if method in 'yY':
    wordToNumberList = []
    for word in finishCutWordList:
        number = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(word))
        wordToNumberList.append(number % M)
    items = wordToNumberList
elif method in 'nN':
    items = finishCutWordList
else:
    items = []

BoW = []
seen = []
for item in items:
    if item not in seen:
        seen.append(item)
        BoW.append([item, items.count(item)])

BoW.sort()


#========Output========
for label, value in (
    ("char count =", charCount),
    ("alphanumeric count =", alphaCount),
    ("line count =", lineCount),
    ("word count =", wordCount),
    ("BoW =", BoW),
):
    print(label, value)


# 6330225321 (22.90) 78 (2021-03-22 00:28)
# Read the inputs and build ``n`` (sorted non-stop words) and ``BoW``.
file_name = input('File name = ')
a = input('Use feature hashing ? (y,Y,n,N) ')
x = open('stopwords.txt','r')
y = open(file_name,'r')
while a not in ('n', 'N', 'y', 'Y') :
    print('Try again.')
    a = input('Use feature hashing ? (y,Y,n,N) ')

# Lower-case the text and turn every non-alphanumeric character into a
# separator. The digit range starts at '0'.
o = ''
for q in y :
    for e in q.lower() :
        if 'a'<=e<='z' or '0'<=e<='9':
            o += e
        else :
            o += ' '
p = o.split()

# Stop words: every whitespace-separated token of the file.
t = []
for w in x :
    t += w.split()

# Both files are fully read; closing again later is harmless.
x.close()
y.close()

n = sorted(e for e in p if e not in t)

# Count whole words (``list.count``), not substrings of a joined string,
# and start a new entry at index 0 without wrapping to ``n[-1]``.
BoW = []
for i in range(len(n)) :
    if i == 0 or n[i-1] != n[i] :
        BoW.append([n[i], n.count(n[i])])
def fhash(w,M) :
    """Hash ``w`` to ``sum(ord(w[i]) * 37**i) % int(M)``.

    Args:
        w: the word to hash; the empty word hashes to 0.
        M: modulus, an int or a string accepted by ``int``.

    Returns:
        The hash index as an int.
    """
    B = 0
    for i in range(len(w)) :
        B += ord(w[i])*(37**i)
    # Reduce once after the loop so the result is also defined for ''.
    return B % int(M)
# Hashed bag of words (only when requested), then the printed report.
if a== 'y' or a=='Y' :
    M = input('M = ',)
    # Sort once after hashing every word.
    Bows = sorted(fhash(e, M) for e in n)
    E = []
    for i in range(len(Bows)) :
        # Index 0 always starts a new entry; ``Bows[i-1]`` would wrap
        # around to the last element.
        if i == 0 or Bows[i-1] != Bows[i] :
            E.append([Bows[i], Bows.count(Bows[i])])
x.close()
y.close()
print('-------------------')

# One pass over the file gathers every statistic.
c = 0            # characters, line terminators excluded
alnum_total = 0  # ASCII letters and digits ('0' included)
line_total = 0   # number of lines actually read
spaced = ''      # text with every separator replaced by a space
with open(file_name,'r') as b :
    for line in b :
        line_total += 1
        # Blank lines no longer stop the count, and inner or edge spaces
        # are counted as characters.
        c += len(line.rstrip('\n'))
        for e in line :
            if 'a'<=e<='z' or 'A'<=e<='Z' or '0'<=e<='9' :
                alnum_total += 1
                spaced += e
            else :
                spaced += ' '
k = spaced.split()
print('char count =',c)
print('alphanumeric count =',alnum_total)
print('line count =',line_total)
print('word count =',len(k))
if a== 'n' or a=='N' :
    print('BoW =',BoW)
if a== 'y' or a=='Y' :
    print('BoW =',E)
# 6330226021 (30.00) 79 (2021-03-22 22:44)
def fhash(word, M):
    """Hash ``word`` to ``sum(ord(word[i]) * 37**i) % int(M)``."""
    modulus = int(M)
    total = 0
    power = 1  # 37**i for the current position
    for ch in word:
        total += ord(ch) * power
        power *= 37
    return total % modulus
def BoW(clause):
    """Count the whitespace-separated tokens of ``clause``.

    Returns:
        A list of ``[token, count]`` pairs in ascending (string) token
        order; an empty list for a blank clause.
    """
    bow = []
    for token in sorted(clause.split()):
        # Equal tokens are adjacent after sorting.
        if bow and bow[-1][0] == token:
            bow[-1][1] += 1
        else:
            bow.append([token, 1])
    return bow

# Read the inputs, load the stop words and print the four statistics.
file_name = input('File name = ')

FH = input('Use feature hashing ? (y,Y,n,N) ')
while FH not in ['y','Y','n','N']:
    print('Try again.')
    FH = input('Use feature hashing ? (y,Y,n,N) ')
if FH in ['y','Y']:
    M = input('M = ')

# Stop words: every whitespace-separated token of the file.
stopwords = open('stopwords.txt', 'r')
list_of_stopwords = stopwords.read().split()
stopwords.close()

open_file = open(file_name, 'r')

line_count = 0
kept_chars = []    # every character except '\n'
alnum_chars = []   # lower-case letters and digits only
spaced_chars = []  # same as the text, separators replaced by ' '
for raw_line in open_file:
    line_count += 1
    for ch in raw_line.lower():
        if ch != '\n':
            kept_chars.append(ch)
        if 'a' <= ch <= 'z' or '0' <= ch <= '9':
            alnum_chars.append(ch)
            spaced_chars.append(ch)
        else:
            spaced_chars.append(' ')
open_file.close()

char_count = ''.join(kept_chars)
alphanumeric_count = ''.join(alnum_chars)
# From here on ``word_count`` is the list of words, not a number.
word_count = ''.join(spaced_chars).split()

print('-------------------')
print('char count =', len(char_count))
print('alphanumeric count =', len(alphanumeric_count))
print('line count =', line_count)
print('word count =', len(word_count))

# Build and print the bag of words from the words that are not stop words.
no_stop = [e for e in word_count if e not in list_of_stopwords]

if FH in ['y','Y']:
    # BoW() works on text, so the hash indices go through it as strings.
    Fhash = ' '.join(str(fhash(word, M)) for word in no_stop)
    bow = [[int(fh), n] for fh, n in BoW(Fhash)]
    # BoW() sorted the indices as strings ('12' < '3'); order them
    # numerically now that they are ints again.
    bow.sort()
else:
    bow = BoW(' '.join(no_stop))
print('BoW =', bow)
# 6330227621 (30.00) 80 (2021-03-22 17:11)
# Read the file name, then keep asking until the answer is y or n
# (case-insensitive; ``ans`` ends up lower-cased).
print("File name = ",end='')
file_name = input()
while True:
    print("Use feature hashing ? (y,Y,n,N) ",end='')
    ans = input().lower()
    if ans in ['y','Y','n','N']:
        break
    print('Try again.')
#+++++++++++++++++NONONONONONO+++++++++++++++++++++++++++
# Both answers share the statistics pass; only the BoW construction differs.
if ans in ('n', 'y'):
    if ans=='y':
        print('M = ',end='')
        M=input()
    print('-------------------')

    # ---- statistics -------------------------------------------------
    source=open(file_name, "r")
    CHAR=0
    ALPHA=0
    L=0
    cleaned=[]
    for row in source:
        # The line terminator is not counted as a character.
        CHAR+= len(row)-1 if row[-1]=='\n' else len(row)
        for ch in row.lower():
            if 'a'<=ch<='z' or '0'<=ch<='9':
                ALPHA+=1
                cleaned.append(ch)
            else:
                cleaned.append(' ')
        L+=1
    source.close()
    x1=''.join(cleaned).split()
    W=len(x1)
    print('char count =',CHAR)
    print('alphanumeric count =',ALPHA )
    print('line count =',L )
    print('word count =',W )

    # ---- stop-word filtering ---------------------------------------
    stop_file=open('stopwords.txt','r')
    s=[]
    for row in stop_file:
        s+=row.split()
    stop_file.close()
    x2=[token for token in x1 if token not in s]

    if ans=='n':
        # Word counts in order of first appearance.
        y=[]
        z=[]
        for token in x2:
            if token not in y:
                y.append(token)
                z.append([token,0])
        for token in x2:
            z[y.index(token)][1]+=1
        print('BoW =',z)
    else:
        # One bucket per hash index 0..M-1; empty buckets are not printed.
        m1=list(range(int(M)))
        m2=[[index,0] for index in m1]
        A=[]
        for token in x2:
            a=0
            for j,ch in enumerate(token):
                a+=(ord(ch)*(37**j))
            A.append(a%int(M))
        for code in A:
            m2[m1.index(code)][1]+=1
        m3=[bucket for bucket in m2 if bucket[1]!=0]

        print('BoW =',m3)
#..................................................

# 6330228221 (22.80) 81 (2021-03-20 22:24)
# Load the input file, ask for the hashing choice and load the stop words.
file_name = input('File name = ').strip()

with open( file_name , 'r') as f :
    lines = f.readlines()
count = len(lines)                                 # number of lines
char_count = sum(len(line) for line in lines)      # newlines included
file = "".join(lines)                              # whole text as one string
########################################################
x = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against the four exact answers; ``x not in 'y,Y,n,N'`` is a
# substring test and lets ',' or '' through.
while x not in ('y', 'Y', 'n', 'N') :
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
########################################################
with open('stopwords.txt' ,'r') as stop :
    stopword = stop.read().split()
########################################################
def BoW (w):
    """Return ``[word, count]`` pairs for the non-stop words of ``w``.

    The pairs follow the order in which each word first appears.
    """
    tokens = word_list_no_stopword(w)
    seen = []
    pairs = []
    for token in tokens :
        if token not in seen :
            seen.append(token)
            pairs.append([token, tokens.count(token)])
    return pairs
def fhash (w,M):
    """Hash ``w`` to ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w)) % M
def word_list (w):
    """Split the text ``w`` into lower-case alphanumeric words.

    Every character that is not an ASCII letter or digit separates words.
    Previously only a short list of punctuation marks did, so characters
    such as '?', '-' or '[' stayed glued to the words.

    Args:
        w: the text to split.

    Returns:
        The list of words in text order.
    """
    cleaned = ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in w.lower()
    )
    return cleaned.split()
def word_list_no_stopword (w):
    """Return the words of ``w`` that are not in the global ``stopword`` list."""
    return [token for token in word_list(w) if token not in stopword]

##this is ans
# Derive the four statistics, then print the report for the chosen mode.
tokens = word_list(file)
word_count = len(tokens)
alphanumeric_count = sum(len(token) for token in tokens)
# ``char_count`` still includes the newlines; remove one per line break.
char_count = char_count - count + 1
line_count = count
# NOTE: this rebinds the name BoW from the function to its result.
BoW = BoW(file)
##this is ans

if x in ('y', 'Y') :
    M = int(input('M = '))
    print('-------------------')
    codes = [fhash(token, M) for token in word_list_no_stopword(file)]
    # Hash indices with their counts, in order of first appearance.
    ans = []
    seen = []
    for code in codes :
        if code not in seen :
            seen.append(code)
            ans.append([code, codes.count(code)])
    print('char count =',char_count)
    print('alphanumeric count =',alphanumeric_count)
    print('line count =',line_count)
    print('word count =',word_count)
    print('BoW =',ans)

if x in ('n', 'N') :
    print('-------------------')
    print('char count =',char_count)
    print('alphanumeric count =',alphanumeric_count)
    print('line count =',line_count)
    print('word count =',word_count)
    print('BoW =',BoW)
# 6330229921 (30.00) 82 (2021-03-20 23:40)

def fhash(w, M):
    """Hash ``w`` to ``sum(ord(w[i]) * 37**i) % M``."""
    total = 0
    position = 0
    while position < len(w):
        total += ord(w[position]) * (37 ** position)
        position += 1
    return total % M
def countrepeat(w, words):
    """Return how many elements of ``words`` are equal to ``w``.

    Args:
        w: the value to look for.
        words: iterable of values to scan.

    Returns:
        The number of matches as an int.
    """
    # The former first loop built a de-duplicated list that was never
    # used; only the count matters.
    return sum(1 for e in words if e == w)

# Read the inputs, then collect the raw counts and the non-stop words.
file_name = input('File name = ')
ans = input('Use feature hashing ? (y,Y,n,N) ')

while ans not in ['Y','y','N','n']:
    print('Try again.')
    ans = input('Use feature hashing ? (y,Y,n,N) ')

# Both files stay open here; they are closed at the end of the script.
stopwords = open('stopwords.txt', 'r')
file = open(file_name, 'r')

# Stop words, lower-cased.
sw = []
for swline in stopwords:
    sw += swline.lower().split()

Letters = 'abcdefghijklmnopqrstuvwxyz'
Nums = '0123456789'
char_count = 0   # raw characters, newlines included
line_f = 0       # number of lines
sen = ''         # lower-cased text, separators replaced by spaces

for fline in file:
    for ch in fline.lower():
        sen += ch if ch in 'abcdefghijklmnopqrstuvwxyz0123456789' else ' '
    line_f += 1
    char_count += len(fline)
    sen += ' '

alpha = sum(1 for ch in sen if ch in Letters or ch in Nums)

w = sen.split()
words = [e for e in w if e not in sw]

# Word counts in order of first appearance.
distinct = []
for word in words:
    if word not in distinct:
        distinct.append(word)
same = [[word, countrepeat(word, words)] for word in distinct]

if ans in 'Nn':
    print('-------------------')
    print('char count =', char_count - line_f + 1)
    print('alphanumeric count =', alpha)
    print('line count =', line_f)
    print('word count =', len(w))
    print('BoW =', same)

elif ans in 'Yy':
    M = int(input('M = '))

    # Hash every word, then count each distinct index in ascending order.
    n = [fhash(word, M) for word in words]
    g = []
    for code in n:
        if code not in g:
            g.append(code)
    g.sort()
    fh = [[code, countrepeat(code, n)] for code in g]

    print('-------------------')
    print('char count =', char_count - line_f + 1)
    print('alphanumeric count =', alpha)
    print('line count =', line_f)
    print('word count =', len(w))
    print('BoW =', fh)


stopwords.close()
file.close()
# 6330230421 (21.40) 83 (2021-03-18 21:41)

def run():
    """Prompt for the inputs and print the statistics and BoW of the file."""
    source = open(input("File name = "),"r")
    text,n_line = reading(source)
    f = input("Use feature hashing ? (y,Y,n,N) ")
    while f not in ('y','Y','n','N'):
        print("Try again.")
        f = input("Use feature hashing ? (y,Y,n,N) ")

    # M is only meaningful when hashing; otherwise a placeholder is passed.
    if f in ('y','Y'):
        M = int(input("M = "))
    else:
        M = ""
    print('-'*19)
    print("char count =",len(text))
    print("alphanumeric count =",alpha_count(text))
    print("line count =",n_line)
    print("word count =",w_count(text))
    print("BoW =",BoW(text,f,M))

    source.close()
    
def reading(file):
    """Join the stripped, lower-cased lines of ``file``.

    Args:
        file: iterable of lines (for example an open text file).

    Returns:
        ``(text, line_count)``. The lines are joined without any separator.
    """
    pieces = [line.strip().lower() for line in file]
    return "".join(pieces), len(pieces)
    
def fhash(w,M):
    """Hash ``w`` to ``sum(ord(w[i]) * 37**i) % M``."""
    total = 0
    # Horner's scheme from the last character back yields the same sum.
    for ch in reversed(w):
        total = total * 37 + ord(ch)
    return total % M
    
def alpha_count(text):
    """Count the characters of ``text`` found in the globals ``alpha`` or ``num``."""
    return sum(1 for c in text if c in alpha or c in num)
def w_count(text):
    """Return the number of words in ``text`` as a string.

    Characters outside the globals ``alpha`` and ``num`` separate words.
    """
    spaced = "".join(ch if (ch in alpha or ch in num) else " " for ch in text)
    return str(len(spaced.split()))
def frequency(word,f,M):
    """Count the words, or their hash indices, in the list ``word``.

    Args:
        word: list of words to count.
        f: the user's answer; 'n'/'N' counts words, 'y'/'Y' counts hashes.
        M: modulus passed to ``fhash`` (only used when hashing).

    Returns:
        For 'n'/'N': ``[word, count]`` pairs in order of first appearance.
        For 'y'/'Y': ``[hash, count]`` pairs in ascending hash order.
        ``None`` for any other ``f``.
    """
    # A dict tally replaces the nested rescans of the whole list; dicts
    # keep insertion order, which gives the first-appearance order.
    if f == "n" or f == "N":
        tally = {}
        for w in word:
            tally[w] = tally.get(w, 0) + 1
        return [[w, count] for w, count in tally.items()]
    if f == "y" or f == "Y":
        tally = {}
        for w in word:
            code = fhash(w, M)
            tally[code] = tally.get(code, 0) + 1
        return [[code, tally[code]] for code in sorted(tally)]
        
def BoW(text,f,M):
    """Tokenise ``text``, drop the stop words and delegate to ``frequency``.

    Characters outside the globals ``alpha`` and ``num`` separate words;
    words found in the global ``stopwords`` list are removed.
    """
    spaced = "".join(ch if (ch in alpha or ch in num) else " " for ch in text)
    bag_w = [token for token in spaced.split() if token not in stopwords]
    return frequency(bag_w,f,M)
    
#-----------------------------------------
# Character classes used by the counting helpers.
alpha = "abcdefghijklmnopqrstuvwxyz"
num = "0123456789"
#-----------------------------------------
# Stop words: every whitespace-separated token of stopwords.txt.
file_sw = open("stopwords.txt","r")
stopwords = file_sw.read().split()
file_sw.close()

run()
# 6330232721 (30.00) 84 (2021-03-22 21:02)

# Character classes shared by the helpers below:
# upper-case letters, lower-case letters and decimal digits.
alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
alpha2 = alpha.lower()
alpha3 = '1234567890'

#fhash func
def fhash(w,M):
    """Hash ``w`` to ``sum(ord(w[i]) * 37**i) % M``."""
    Fnum = 0
    for exponent, ch in enumerate(w):
        Fnum += ord(ch) * (37 ** exponent)
    return Fnum % M

# character count
def char_count(a):
    """Return the number of characters in the file ``a``, newlines excluded.

    The former formula ``total - (lines - 1)`` assumed that the last line
    has no trailing newline and returned 1 for an empty file.

    Args:
        a: path of the text file.
    """
    with open(a, "r") as inF:
        # A line read from a text file ends with at most one '\n'.
        return sum(len(line.rstrip('\n')) for line in inF)

# alphanumeric count
def alpha_count(a):
    """Count the letters and digits in the file ``a``.

    A character counts when it occurs in one of the globals ``alpha``,
    ``alpha2`` or ``alpha3``.
    """
    inF = open(a, "r")
    alnum = sum(
        1
        for line in inF
        for ch in line
        if ch in alpha or ch in alpha2 or ch in alpha3
    )
    inF.close()
    return alnum

# line count
def line_count(a):
    """Return the number of lines in the file ``a``."""
    inF = open(a, "r")
    total = sum(1 for _ in inF)
    inF.close()
    return total

# change words to list
def cutLetM(a):
    """Return the lower-cased alphanumeric words of the file ``a``.

    Each line is followed by a space; every character that is not in one
    of the globals ``alpha``, ``alpha2`` or ``alpha3`` separates words.
    """
    inF = open(a, "r")
    joined = ''.join(line + ' ' for line in inF.readlines()).lower()
    spaced = ''.join(
        ch if (ch in alpha or ch in alpha2 or ch in alpha3) else ' '
        for ch in joined
    )
    inF.close()
    return spaced.split()

# change stopwords to list
def cutLetS(e):
    """Return the lower-cased alphanumeric words of the stop-word file ``e``.

    The stop-word file is tokenised exactly like the main text, so this
    delegates to ``cutLetM`` instead of repeating its body.

    Args:
        e: path of the stop-word file.

    Returns:
        The list of words in file order.
    """
    return cutLetM(e)

# word count
def word_count(a):
    """Return the number of words ``cutLetM`` finds in the file ``a``."""
    return len(cutLetM(a))

# Final BoW without hashF
def FinalBoW(c,d):
    """Count the words of ``c`` that are not in the stop-word list ``d``.

    Both lists are sorted in place.

    Returns:
        ``[word, count]`` pairs in ascending word order.
    """
    c.sort()
    d.sort()
    g = []
    for word in c:
        if word in d:
            continue
        # ``c`` is sorted, so a repeated word always follows its twin.
        if not g or g[-1][0] != word:
            g.append([word, 0])
        g[-1][1] += 1
    return g

# Final BoW with hash
def FinalBoWH(c,d,M):
    """Count the hash indices of the words of ``c`` that are not in ``d``.

    Both lists are sorted in place; each remaining word is hashed with
    ``fhash(word, M)``.

    Returns:
        ``[hash, count]`` pairs in ascending hash order.
    """
    c.sort()
    d.sort()
    codes = sorted(fhash(word, M) for word in c if word not in d)
    h = []
    for code in codes:
        # Equal hashes are adjacent after sorting.
        if not h or h[-1][0] != code:
            h.append([code, 0])
        h[-1][1] += 1
    return h
            
            
# Script entry point: read the inputs, tokenise both files and print
# the report.
a = input('File Name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
sword = 'stopwords.txt'
c = cutLetM(a)
d = cutLetS(sword)
while b not in ['y','Y','n','N']:
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = b in ['y','Y']
if use_hash:
    M = int(input('M = '))
print('-------------------')
for label, compute in (
    ('char count =', char_count),
    ('alphanumeric count =', alpha_count),
    ('line count =', line_count),
    ('word count =', word_count),
):
    print(label, compute(a))
if use_hash:
    print('BoW =',FinalBoWH(c,d,M))
else:
    print('BoW =',FinalBoW(c,d))
# 6330233321 (21.85) 85 (2021-03-22 17:19)
# Read both files, ask for the hashing choice and print the char count.
file_name = input('File name = ')
with open(file_name,'r') as file:
    line = file.readlines()
with open('stopwords.txt','r') as s:
    stop = s.readlines()

while True:
    # Prompt spelling fixed ("hashing").
    usefh = input('Use feature hashing ? (y,Y,n,N) ')
    if usefh == 'y' or usefh == 'Y':
        fh = True
        break
    elif usefh == 'n' or usefh == 'N':
        fh = False
        break
    else:
        print('Try again.')
if fh == True:
    M = input('M = ')
# 19 dashes, matching the separator used by the other reports in this file.
print('-' * 19)

# stop

stopword = []
for e in stop:
    stopword += e.split()

# char count

# Only the line terminators are excluded; blanks at the edges of a line
# are real characters.
print('char count = '+str(sum(len(e.rstrip('\n')) for e in line)))

# Keep the line breaks in the text used for the alphanumeric and word
# statistics; joining the lines without a separator glued the last word
# of one line to the first word of the next.
t = ''.join(line).lower()

# alphanumeric count
def alpha(texts):
    """Return the report line with the number of alphanumeric characters."""
    total = sum(1 for ch in texts if ch.isalnum())
    return 'alphanumeric count = '+str(total)
# Print the alphanumeric count of the text t.
print(alpha(t))
    
# line count
def line_count(texts):
    """Return the report line with the number of lines.

    Args:
        texts: list of lines as returned by ``readlines``.

    Returns:
        ``'line count = N'`` where N is the number of list elements. The
        former "newlines + 1" formula over-counted a file whose last line
        ends with a newline and reported 1 for an empty file.
    """
    return 'line count = '+str(len(texts))
# Print the line count computed from the list of lines.
print(line_count(line))

# word count
def word_count(texts, i):
    """Split ``texts`` into alphanumeric words.

    Args:
        texts: the text to split.
        i: 0 to get the report line, anything else to get the word list.

    Returns:
        ``'word count = N'`` when ``i == 0``, otherwise the list of words.
    """
    j = []
    m = ''
    for c in texts:
        if c.isalnum():
            m += c
        elif m:
            j.append(m)
            m = ''
    # Flush the last word: a text that ends with an alphanumeric
    # character used to lose it.
    if m:
        j.append(m)
    if i == 0:
        return 'word count = '+str(len(j))
    else:
        return j
# Print the word count of the text t (mode 0 returns the report line).
print(word_count(t, 0))

# BoW if n N
def BoW_nN():
    """Print the bag of words of the global text ``t`` without hashing.

    Words found in the global ``stopword`` list are dropped; the remaining
    words are sorted and counted.

    Returns:
        ``None`` (the result of ``print``).
    """
    new_word = sorted(c for c in word_count(t, 1) if c not in stopword)
    bow = []
    # Count runs of equal neighbours. The former pop-based loop ran only
    # len-1 times and so dropped the last distinct word.
    for x in new_word:
        if bow and bow[-1][0] == x:
            bow[-1][1] += 1
        else:
            bow.append([x, 1])
    return print('BoW = '+str(bow))

# flash
def flash(w, M):
    """Hash ``w`` to ``sum(ord(w[i]) * 37**i) % int(M)``."""
    fla = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return fla % int(M)

# BoW if y Y
    
def BoW_yY():
    """Print the bag of hashed words of the global text ``t``.

    Words found in the global ``stopword`` list are dropped; each
    remaining word is hashed with ``flash(word, M)`` using the global
    ``M``, and the indices are sorted and counted.

    Returns:
        ``None`` (the result of ``print``).
    """
    new_word = [c for c in word_count(t, 1) if c not in stopword]
    flash_word = sorted(flash(c, M) for c in new_word)
    bow = []
    # Count runs of equal neighbours. The former pop-based loop ran only
    # len-1 times and so dropped the last distinct index.
    for x in flash_word:
        if bow and bow[-1][0] == x:
            bow[-1][1] += 1
        else:
            bow.append([x, 1])
    return print('BoW = '+str(bow))




# Print the bag of words in the mode the user asked for.
if usefh in ('n', 'N'):
    BoW_nN()
else:
    BoW_yY()
# 6330234021 (9.87) 86 (2021-03-22 22:03)

def char_count(file_name): ###
    """Return the number of characters in ``file_name``, newlines excluded.

    The file is closed even when reading fails.
    """
    with open(file_name) as fn :
        return sum(1 for line in fn for e in line if e != '\n')
def alphanumeric_count(file_name) : ###
    """Return the number of ASCII letters and digits in ``file_name``.

    The former version counted every character outside a short list of
    punctuation marks, so characters such as '!', '?' or '-' were counted
    as alphanumeric.

    Args:
        file_name: path of the text file.
    """
    alp = 'abcdefghijklmnopqrstuvwxyz0123456789'
    with open(file_name) as fn :
        return sum(1 for line in fn for e in line if e.lower() in alp)
def line_count(file_name) :  ###
    """Return the number of lines in ``file_name``."""
    fn = open(file_name)
    total = len(fn.readlines())
    fn.close()
    return total
def word_count(file_name) : ###
    """Return the number of alphanumeric words in ``file_name``.

    Words are counted line by line; every character that is not an ASCII
    letter or digit separates words.
    """
    alp = 'abcdefghijklmnopqrstuvwxyz0123456789'
    total = 0
    f = open(file_name)
    for line in f :
        spaced = ''.join(g if g.lower() in alp else ' ' for g in line)
        total += len(spaced.split())
    f.close()
    return total
def BoW(file_name , stopwords) : ###
    """Return the sorted bag of words of ``file_name``.

    The text is lower-cased and split at every character that is not an
    ASCII letter or digit; words listed in the ``stopwords`` file are
    dropped. Whole words are counted: the former substring search with
    ``str.find`` also counted 'cat' inside 'catalog' and advanced its
    search offset incorrectly.

    Args:
        file_name: path of the text file.
        stopwords: path of the stop-word file.

    Returns:
        ``[word, count]`` pairs in ascending word order.
    """
    alp = 'abcdefghijklmnopqrstuvwxyz0123456789'
    with open(file_name) as f1 :
        text = ''.join(
            g.lower() if g.lower() in alp else ' ' for g in f1.read()
        )
    with open(stopwords) as f2 :
        stop = set(f2.read().lower().split())

    tally = {}
    for word in text.split() :
        if word not in stop :
            tally[word] = tally.get(word, 0) + 1
    return sorted([word, count] for word, count in tally.items())
def feature_harshing(l,M) :
    """Collapse a bag-of-words into hash buckets.

    *l* is a list of ``[word, count]`` pairs.  Each word is hashed as
    ``sum(ord(ch) * 37**i) % M`` and the counts of words sharing a bucket
    are added together.

    Returns a list of ``[bucket, count]`` pairs sorted by bucket; empty
    buckets are omitted.  *M* is expected to be a positive integer.
    """
    buckets = {}
    for word, count in l:
        code = 0
        for position, ch in enumerate(word):
            code += ord(ch) * (37 ** position)
        bucket = code % M
        # Tally numerically: searching a digit string would match
        # bucket 7 inside "97".
        buckets[bucket] = buckets.get(bucket, 0) + count
    return [[bucket, buckets[bucket]] for bucket in sorted(buckets) if buckets[bucket] != 0]
def display(file_name , stopwords , x) :
    """Print the statistics of *file_name* followed by its bag-of-words.

    *x* selects the output: '0' prints the plain bag-of-words, '1' prints
    the feature-hashed one (using the module-level ``M``).  Any other value
    prints the four counts only.
    """
    print("-------------------")
    reports = (
        ("char_count =", char_count),
        ("alphanumeric_count =", alphanumeric_count),
        ("line_count =", line_count),
        ("word_count =", word_count),
    )
    for label, counter in reports:
        print(label, counter(file_name))
    if x == '0' :
        print('BoW =' , BoW(file_name , stopwords) )
    elif x == '1' :
        print('BoW =' , feature_harshing(BoW(file_name , stopwords),M))
        
# Interactive entry point: ask for the file and the hashing choice,
# repeating the question until one of y/Y/n/N is entered.
file_name = input("File name = ")
while True :
    x = input("use feature hashing ? (y,Y,n,N)")
    if x in ['y','Y','n','N'] :
        break
    print('Try again.')
if x in ['y','Y'] :
    M = int(input("M = "))  # read by display() as a global
    display(file_name , 'stopwords.txt' , '1')
else :
    display(file_name , 'stopwords.txt' , '0')



# 6330235621 (26.00) 87 (2021-03-22 18:15)
def flash(w,M) :
    """Return the base-37 polynomial hash of *w* modulo *M*.

    The hash is ``sum(ord(w[i]) * 37**i) % M``; an empty word hashes to 0.
    """
    a=0
    for i, ch in enumerate(w) :
        a+=ord(ch)*(37**i)
    # Reduce once after the loop so the result is defined for "" as well.
    return a%M
def fea_hash() :
    """Ask once whether feature hashing should be used.

    Returns True for 'y'/'Y', False for 'n'/'N'.  For any other answer it
    prints "Try again." and returns None.
    """
    answer=input("Use feature hashing ? (y,Y,n,N) ")
    if answer in ("Y","y") :
        return True
    if answer in ("N","n") :
        return False
    print("Try again.")
    return None
def remove(x) :
    """Return *x* with every non-alphanumeric (ASCII) character replaced by a space."""
    def keep(e) :
        return "A"<=e<="Z" or "a"<=e<="z" or "0"<=e<="9"
    return "".join(e if keep(e) else " " for e in x)

# Interactive bag-of-words script: prompts for the input file and the hashing
# choice, prints the four counts, then the (optionally hashed) bag-of-words.
file_name=input("File name = ")
xx=fea_hash()
while xx!=True and xx!=False :   # fea_hash() gives None after "Try again."
    xx=fea_hash()
if xx==True :
    M=int(input("M = "))
print("-"*19)

# Stop words: every whitespace-separated token of stopwords.txt, lower-cased.
stopwords=[]
with open("stopwords.txt","r") as fst :
    for line in fst :
        stopwords.extend(e.lower() for e in line.split())

char=0
alph_num=0
linecount=0
word=0
wordlist=[]
BoW=[]

with open(file_name,"r") as fin :
    for line in fin :
        # The line terminator is not counted as a character.
        char+=len(line)-1 if "\n" in line else len(line)
        for ch in line :
            if "A"<=ch<="Z" or "a"<=ch<="z" or "0"<=ch<="9" :
                alph_num+=1
        linecount+=1
        tokens=remove(line).split()
        word+=len(tokens)
        for tok in tokens :
            if tok.lower() not in stopwords :
                wordlist.append(tok.lower())
wordlist.sort()
print("char count =",char)
print("alphanumeric count =",alph_num)
print("line count =",linecount)
print("word count =",word)

# Keys to count: the sorted words themselves, or their sorted hash values.
if xx==True :
    keys=sorted(flash(k,M) for k in wordlist)
else :
    keys=wordlist

# Run-length encode the sorted keys; skipped when nothing survived the
# stop-word filter so keys[-1] is never evaluated on an empty list.
if keys :
    q=1
    for i in range(len(keys)-1) :
        if keys[i]==keys[i+1] :
            q+=1
        else :
            BoW.append([keys[i],q])
            q=1
    BoW.append([keys[-1],q])
print("BoW =",BoW)
    
        
    
    

    
    



# 6330236221 (25.15) 88 (2021-03-21 23:32)
# Interactive bag-of-words script.  Reads the input file, asks whether to use
# feature hashing, then prints the four counts and the bag-of-words.

def flash(w,z):
    """Return the base-37 polynomial hash of *w* modulo *z*."""
    sums = 0
    for i in range(len(w)):
        sums += ord(w[i])*(37**i)
    return sums % z


def _alnum_only(text):
    """Return *text* with every character outside A-Z, a-z, 0-9 replaced by a space."""
    return "".join(
        ch if ("A" <= ch <= "Z" or "a" <= ch <= "z" or "0" <= ch <= "9") else " "
        for ch in text
    )


# The file is opened right after the first prompt, before the other questions.
with open(input("File name = "),"r") as file_name:
    text_lines = file_name.readlines()

use_fea = input("Use feature hashing ? (y,Y,n,N) ")
while use_fea not in ["y","Y", "n","N"]:
    print("Try again.")
    use_fea = input("Use feature hashing ? (y,Y,n,N) ")
# M == 0 means "no hashing" in the rest of the script.
if use_fea in ["y","Y"]:
    M = int(input("M = "))
else:
    M = 0

with open("stopwords.txt","r") as stop_word:
    G1 = _alnum_only(stop_word.read()).lower().split()

character_count = 0
alpha_count = 0
word_count = 0
q = []   # lower-cased words that are not stop words, in reading order
for line in text_lines:
    # The line terminator is not counted as a character.
    character_count += len(line) - 1 if "\n" in line else len(line)
    cleaned = _alnum_only(line)
    alpha_count += sum(1 for ch in cleaned if ch != " ")
    words = cleaned.split()
    word_count += len(words)
    # Count words case-insensitively, matching the stop-word filter.
    q.extend(w.lower() for w in words if w.lower() not in G1)
line_count = len(text_lines)

# Count either the words themselves or their hash buckets.
keys = [flash(w, M) for w in q] if M != 0 else q
tally = {}
for key in keys:
    tally[key] = tally.get(key, 0) + 1
X = [[key, tally[key]] for key in sorted(tally)]

print("-------------------")
print("char count = "+str(character_count))
print("alphanumeric count = "+str(alpha_count))
print("line count = "+str(line_count))
print("word count = "+str(word_count))
print("BoW = "+str(X))


# 6330238521 (28.40) 89 (2021-03-22 17:03)

def fhash(w,M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word *w*."""
    G = 37
    return sum(ord(ch) * (G ** pos) for pos, ch in enumerate(w)) % M

# Interactive bag-of-words script: prompts, then prints char / alphanumeric /
# line / word counts and the (optionally hashed) bag-of-words.
file_name = input("File name = ")
x = input("Use feature hashing ? (y,Y,n,N)  ").lower()

while x != "y" and x != "n":
    print("Try again.")
    x = input("Use feature hashing ? (y,Y,n,N)  ").lower()
with open("stopwords.txt","r") as y:
    list_stopwords = y.read().lower().split()
if x == "y":
    M = int(input("M = "))

# Read the input once; every statistic below is derived from these lines.
with open(file_name,"r") as openf:
    lines = openf.readlines()

# Only the line terminator is excluded; leading/trailing spaces are characters.
char_count = sum(len(line.rstrip("\n")) for line in lines)
print("char count =",char_count)

alphanum = sum(1 for line in lines for c in line if c.isalnum())
print("alphanumeric count =",alphanum)

# Blank lines are lines too.
line_count = len(lines)
print("line count =",line_count)

w = 0
for line in lines:
    w += len("".join([c if c.isalnum() else " " for c in line]).split())
print("word count =",w)

# Bag-of-words over lower-cased words (or their hash buckets when x == "y").
tally = {}
for line in lines:
    cleaned = "".join([c if c.isalnum() else " " for c in line.lower()])
    for word in cleaned.split():
        if word in list_stopwords:
            continue
        key = fhash(word, M) if x == "y" else word
        tally[key] = tally.get(key, 0) + 1
bow = sorted([key, n] for key, n in tally.items())
print("BoW =",bow)

# 6330239121 (18.39) 90 (2021-03-21 23:18)
def replace_punctuation(s):
    """Return *s* with each of the characters ``"'/\\,.:;()[]{}`` replaced by a space."""
    return "".join(" " if e in "\"\'/\\,.:;()[]{}" else e for e in s)


# Interactive part 1: prompts, stop-word loading and the four counts.
file_name = input("File name = ")
fh = input("Use feature hashing ? (y,Y,n,N) ")
# Exact membership: a substring test against "yYnN" would also accept ""
# and multi-character answers such as "yY".
while fh not in ("y", "Y", "n", "N"):
    print("Try again.")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
if fh in ("y", "Y"):
    M = input("M = ")  # kept as text; fhash() converts it with int()
print("-------------------")

with open( "stopwords.txt", "r") as stop:
    stopwords = []
    for line in stop:
        stopwords += line.split()

# Read the input once; all statistics are derived from these lines.
with open( file_name , "r") as main:
    lines = main.readlines()

# Count characters without line terminators, whether or not the last line
# ends with a newline.
char_count = sum(len(line.rstrip("\n")) for line in lines)
print ("char count =",char_count )

words_per_line = [replace_punctuation(line).split() for line in lines]
alphanumeric_count = sum(len(word) for wordslist in words_per_line for word in wordslist)
print ("alphanumeric count =",alphanumeric_count )

line_count = len(lines)
print ("line count =",line_count )

word_count = sum(len(wordslist) for wordslist in words_per_line)
print ("word count =",word_count )

# Lower-cased words of the whole file, used later to build the bag-of-words.
Bagofwords = []
for line in lines:
    Bagofwords += replace_punctuation(line.lower()).split()
def fhash(w,x):
    """Return ``sum(ord(w[i]) * 37**i) % int(x)``; *x* may be an int or numeric text."""
    G = 37
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch)*G**(i)
    return total % int(x)
    
# Interactive part 2: print the bag-of-words of the non-stop words, either
# as words (answer n/N) or as hash buckets (answer y/Y).
if fh in "nN":
    keys = [e for e in Bagofwords if e not in stopwords]
elif fh in "yY":
    keys = [fhash(e,M) for e in Bagofwords if e not in stopwords]
else:
    keys = None

if keys is not None:
    tally = {}
    for key in keys:
        tally[key] = tally.get(key, 0) + 1
    BoW = sorted([key, n] for key, n in tally.items())
    print ("BoW =",BoW)
        






# 6330240721 (24.00) 91 (2021-03-22 22:28)

def fhash(w,M) :
    """Return ``sum(ord(w[i]) * 37**i) % int(M)`` for the word *w*."""
    weighted = [ord(ch)*(37**i) for i, ch in enumerate(w)]
    return sum(weighted) % int(M)


# Interactive bag-of-words script: prompts, then prints the four counts and
# the (optionally hashed) bag-of-words.
file_name = input('File name = ')
while True:
    WayV = input('Use feature hashing ? (y,Y,n,N) ')
    if WayV == 'Y' or WayV == 'y':
        M = int(input('M = '))
        yes = True
        break
    if WayV == 'N' or WayV == 'n':
        yes = False
        break
    print('Try again.')

with open(file_name,"r") as TXT:
    line_text = TXT.readlines()
line_count = len(line_text)

# All characters of the file with line terminators removed.
line_textz = ''.join(e.strip('\n') for e in line_text)

# Split the text into lower-cased alphanumeric words; the extra '\n' at the
# end flushes a word that reaches the end of the file.
ALNUM = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890'
Lucas = ''
Daniel = []
nct = 0   # number of alphanumeric characters
for e in ''.join(line_text) + '\n':
    if e in ALNUM:
        Lucas += e
        nct += 1
    elif len(Lucas) != 0:
        Daniel.append(Lucas.lower())
        Lucas = ''
wc = len(Daniel)

with open("stopwords.txt","r") as Wanna_One:
    x = Wanna_One.read().split()
k = [e for e in Daniel if e not in x]

# Count the kept words, or their hash buckets when hashing was requested.
# Building the tally from `keys` works for an empty list as well.
keys = [fhash(e, M) for e in k] if yes else k
tally = {}
for key in keys:
    tally[key] = tally.get(key, 0) + 1
BoW = sorted([key, n] for key, n in tally.items())

print('-------------------')
print('char count = '+str(len(line_textz)) )
print('alphanumeric count = '+str(nct) )
print('line count = '+str(len(line_text)) )
print('word count = '+str(wc) )
print('BoW = '+str(BoW))
# 6330241321 (22.99) 92 (2021-03-22 23:59)

def fhash(w, M) :
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word *w*."""
    total = 0
    power = 1          # 37**i, carried along instead of recomputed
    for ch in w :
        total += ord(ch) * power
        power *= 37
    return total % M




# Interactive bag-of-words script: prompts, then prints the four counts and
# the (optionally hashed) bag-of-words sorted by key.
file_name = input("File name = ")
ans = input("Use feature hashing ? (y,Y,n,N) ")
M = 0
while ans != "n" and ans != "N" and ans != "y" and ans != "Y" :
    print("Try again.")
    ans = input("Use feature hashing ? (y,Y,n,N) ")
if ans.lower() == "n":
    ans = False
else :
    M = int(input("M = "))
    ans = True
print("-"*19)

l1 = 0          # characters, newlines excluded
l2 = 0          # ASCII letters and digits
lineCount = 0
words = []      # alphanumeric runs in original case

with open(file_name, "r") as file_words :
    for line in file_words :
        lineCount = lineCount + 1
        word = ""
        for c in line :
            if c != "\n" :
                l1 = l1 + 1
            if ("0" <= c <= "9") or ("a" <= c <= "z") or ("A" <= c <= "Z") :
                l2 = l2 + 1
                word = word + c
            else :
                if len(word) != 0 :
                    words.append(word)
                word = ""
        # A last line without a trailing newline ends inside a word:
        # flush it so it is not lost.
        if len(word) != 0 :
            words.append(word)

stopwords = []
with open("stopwords.txt", "r") as file_stopwords :
    for line in file_stopwords :
        for w in line.split() :
            w = w.lower()
            if w not in stopwords :
                stopwords.append(w)

# Count each kept word (or its hash bucket) and report the pairs in order.
tally = {}
for c in words :
    c = c.lower()
    if c in stopwords :
        continue
    key = fhash(c, M) if ans else c
    tally[key] = tally.get(key, 0) + 1
a = sorted([key, n] for key, n in tally.items())

print("char count =", l1)
print("alphanumeric count =", l2)
print("line count =", lineCount)
print("word count =", len(words))
print("BoW =", a)
# 6330242021 (0.00) 93 (2021-03-22 23:50)
# Interactive bag-of-words script: prompts, then prints the four counts and
# the (optionally hashed) bag-of-words sorted by key.
AL=['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
NUM=['0','1','2','3','4','5','6','7','8','9']
G=37   # base of the polynomial word hash

file_name=input('File name = ')
fh=input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ['y','Y','n','N']:
    print('Try again.')
    fh=input('Use feature hashing ? (y,Y,n,N) ')
use_hash=fh in ['y','Y']
if use_hash:
    M=int(input('M = '))
print('-------------------')

# All counters accumulate over the whole file.
ch=0    # characters, line terminators excluded
alp=0   # letters and digits
lc=0    # lines
w1=[]   # lower-cased alphanumeric words
with open(file_name,'r') as file:
    for line in file:
        f=line.lower()
        ch+=len(line)-1 if line[-1]=='\n' else len(line)
        lc+=1
        ww=''
        for c in f:
            if c in AL or c in NUM:
                alp+=1
                ww+=c
            else:
                ww+=' '
        w1+=ww.split()
w2=len(w1)

with open('stopwords.txt','r') as s:
    ss=s.read().lower().split()
w3=[w for w in w1 if w not in ss]

# Count the kept words, or their hash buckets sum(ord(c)*G**k) % M.
tally={}
for w in w3:
    if use_hash:
        key=sum(ord(c)*(G**k) for k,c in enumerate(w))%M
    else:
        key=w
    tally[key]=tally.get(key,0)+1
e=sorted([key,n] for key,n in tally.items())

print('char count =',ch)
print('alphanumeric count =',alp)
print('line count =',lc)
print('word count =',w2)
print('BoW =',e)
# 6330243621 (19.05) 94 (2021-03-18 23:40)

def fhash(w, M) :
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word *w*."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(w)) % M
    
def fehas(y) :
    """Return the bag-of-words of the module-level word list ``feh``.

    The argument *y* is not used.  Stop words (module-level ``sw``) are
    removed first; when the module-level answer ``fehash`` is 'y' or 'Y' the
    remaining words are replaced by ``fhash(word, M)`` before counting.
    """
    hashing = fehash in ['y','Y']
    kept = osw(feh,sw)
    keys = [fhash(word, M) for word in kept] if hashing else kept
    return bow(keys)
    
def osw(b1,b2) :
    """Return the items of *b1*, in order, that do not occur in *b2*."""
    return [item for item in b1 if item not in b2]
def texttoword(fs) :
    """Tokenise the lines of *fs* and gather simple statistics.

    Each line is passed through fullword(), lower-cased and split on
    whitespace.

    Returns a tuple ``(words, lines, word_chars, line_chars)``: the list of
    words, the number of lines, the summed length of the words, and the
    summed length of the lines as returned by fullword().
    """
    words = []
    lines = word_chars = line_chars = 0
    for raw in fs :
        cleaned = fullword(raw)
        line_chars += len(cleaned)
        tokens = cleaned.lower().split()
        words.extend(tokens)
        word_chars += sum(len(token) for token in tokens)
        lines += 1
    return words,lines,word_chars,line_chars
def fullword(cc) :
    """Return *cc* with listed punctuation turned into spaces and newlines removed."""
    pieces = []
    for ch in cc :
        if ch in '\'\"\\/-_,.:;()<>' :
            pieces.append(' ')
        elif ch != '\n' :
            pieces.append(ch)
    return ''.join(pieces)
    
def bow(fh) :
    """Return ``[item, count]`` pairs for the items of *fh*, sorted by item.

    *fh* is sorted in place (as before).  An empty list yields ``[]``.
    """
    fh.sort()
    if not fh :
        return []
    c = 1
    p = []
    # Run-length encode the sorted list.
    for i in range(1,len(fh)) :
        if fh[i-1] == fh[i] :
            c += 1
        else :
            p.append( [fh[i-1],c])
            c = 1
    p.append( [fh[-1],c])
    return p

#-----------------------------------------------------
        
# Interactive entry point: prompt for the file and hashing choice, then print
# the counts gathered by texttoword() and the bag-of-words from fehas().
file_name = input('File name = ')
while True :
    fehash = input('Use feature hashing ? (y,Y,n,N) ')
    if fehash in ['y','Y','n','N'] :
        break
    print('Try again.')
if fehash in ['y','Y'] :
    M = int(input('M = '))
print('-'*19)
fs = open('stopwords.txt', 'r')
fn = open(file_name, 'r')
sw,c1,d1,f1 = texttoword(fs)      # stop words (the counts are unused)
feh,c2,d2,f2 = texttoword(fn)     # words and counts of the input file
for label, value in (('char count =', f2),
                     ('alphanumeric count =', d2),
                     ('line count =', c2),
                     ('word count =', len(feh))) :
    print(label, value)
print('BoW =', fehas(feh))

fs.close()
fn.close()
# 6330245921 (23.80) 95 (2021-03-22 20:17)
#Prog-08: Bag-of-words
#6330245921 (23.80) Teetat Karuhawanit
def somchai(c):
    """Return the text of file *c*, lower-cased, with whitespace normalised.

    All whitespace-separated tokens of the file are joined by single spaces;
    an empty file gives ``''``.
    """
    with open(c,'r') as v:
        tokens = []
        for j in v:
            # Tokenise each line once instead of re-splitting the whole
            # accumulated text on every line.
            tokens.extend(j.strip().lower().split())
    return ' '.join(tokens)
def paisan(file_name):
    """Return the number of ASCII letters and digits in *file_name* (case-insensitive)."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    with open(file_name) as u:
        return sum(
            1
            for line in u
            for n in line.strip('\n').lower()
            if n in allowed
        )
def chate(file_name):
    """Return ``len(somchai(file_name)) - thanarat(file_name) + 1``.

    That is the length of the whitespace-normalised text minus the line
    count, plus one; the script prints it as the character count.
    """
    text_length = len(somchai(file_name))
    return (text_length - thanarat(file_name)) + 1
def thanarat(file_name):
    """Return the number of lines in *file_name*."""
    with open(file_name,'r') as f:
        return len(f.readlines())
def pannarai(file_name):
    """Return the number of whitespace-separated words in *file_name*."""
    return len(somchai(file_name).split())
def sukree(file_name):
    """Return the words of *file_name* that are not stop words.

    The text comes from somchai() (lower-cased); every character that is not
    an ASCII letter or digit separates words.  Stop words are the
    whitespace-separated tokens of ``stopwords.txt``.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    text = somchai(file_name)
    # Compare whole words: testing against the raw stop-word string would
    # also reject any word that merely occurs inside a stop word.
    stop = set(somchai('stopwords.txt').split())
    cleaned = ''.join(i if i in allowed else ' ' for i in text)
    return [word for word in cleaned.split() if word not in stop]
def fhash(W,M):
    """Return ``sum(ord(W[i]) * 37**i) % M`` for the word *W*."""
    G = 37
    total = 0
    for exponent, ch in enumerate(W):
        total += ord(ch)*(G**exponent)
    return total % M
def kirati():
    """Return the bag-of-words of the module-level ``file_name``.

    The result is a list of ``[word, count]`` pairs for the words returned
    by sukree(), sorted by word.
    """
    tally = {}
    for word in sukree(file_name):
        tally[word] = tally.get(word, 0) + 1
    return sorted([word, n] for word, n in tally.items())
def parngod(M):
    """Return the feature-hashed bag-of-words of the module-level ``file_name``.

    Each word returned by sukree() is mapped to ``fhash(word, M)``; the
    result is a list of ``[bucket, count]`` pairs sorted by bucket.
    """
    tally = {}
    for word in sukree(file_name):
        bucket = fhash(word,M)   # hash each word once
        tally[bucket] = tally.get(bucket, 0) + 1
    return sorted([bucket, n] for bucket, n in tally.items())

# Interactive entry point: prompt for the file and hashing choice, then print
# the four counts and the bag-of-words.
file_name = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ')
# Exact membership: a substring test against 'yYnN' would also accept ''
# and multi-character answers such as 'yY'.
while x not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = x in ('y', 'Y')
if use_hash:
    M = int(input('M = '))
print('-------------------')
print('char count =',chate(file_name))
print('alphanumeric count =',paisan(file_name))
print('line count =',thanarat(file_name))
print('word count =',pannarai(file_name))
print('BoW =',parngod(M) if use_hash else kirati())
# 6330246521 (30.00) 96 (2021-03-21 18:26)

def main():
    """Run the interactive bag-of-words program.

    Reads the input file and ``stopwords.txt``, asks whether to use feature
    hashing, then prints the four counts and the bag-of-words (hashed with
    the entered M, or sorted by word).
    """
    file_name = input('File name = ')
    x = read_file(file_name)
    y = read_file('stopwords.txt')
    list_of_noStopwords = cut_stopwords(x[0], y[0])
    Bo = BoW(list_of_noStopwords)

    fh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while fh not in ('y', 'n'):
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    use_hash = fh == 'y'
    if use_hash:
        M = int(input('M = '))

    print('-------------------')
    # x is [words, char_count, alph_count, line_count, word_count].
    for label, index in (('char count =', 1),
                         ('alphanumeric count =', 2),
                         ('line count =', 3),
                         ('word count =', 4)):
        print(label, x[index])
    print('BoW =', BoW_to_fhash(Bo, M) if use_hash else sorted(Bo))
    
def read_file(file_name):
    """Read *file_name* and return its words together with simple counts.

    Words are lower-cased maximal runs of ASCII letters/digits.

    Returns ``[list_of_words, char_count, alph_count, line_count,
    word_count]`` where ``char_count`` excludes line terminators and
    ``alph_count`` is the summed length of the words.  An empty file gives
    ``[[], 0, 0, 0, 0]``.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    list_of_words = []
    char_count = 0
    line_count = 0
    with open(file_name, 'r') as fn:
        for raw in fn:
            # Handle the terminator per line so no state from the last
            # line is needed after the loop (which fails on empty files).
            char_count += len(raw) - 1 if raw.endswith('\n') else len(raw)
            e = raw.strip().lower()
            cleaned = ''.join(c if c in allowed else ' ' for c in e)
            list_of_words += cleaned.split()
            line_count += 1
    word_count = len(list_of_words)
    alph_count = sum(len(k) for k in list_of_words)
    return [list_of_words, char_count, alph_count, line_count, word_count]
def cut_stopwords(list_of_words, list_of_stopwords):
    """Return the words of *list_of_words*, in order, that are not in *list_of_stopwords*."""
    return [e for e in list_of_words if e not in list_of_stopwords]
def BoW(list_of_noStopwords):
    """Return ``[word, count]`` pairs in order of first appearance."""
    uniques = []
    for e in list_of_noStopwords:
        if e not in uniques:
            uniques.append(e)
    return [[word, list_of_noStopwords.count(word)] for word in uniques]
    
def fhash(w,M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the word *w*."""
    terms = (ord(ch) * 37**i for i, ch in enumerate(w))
    return sum(terms) % M
def BoW_to_fhash(BoW, M):
    """Convert a word bag into a hashed bag.

    *BoW* is a list of ``[word, count]`` pairs.  Each word is replaced by
    ``fhash(word, M)`` and the counts of words sharing a bucket are summed.
    The input list and its pairs are left unmodified.

    Returns a list of ``[bucket, count]`` pairs sorted by bucket.
    """
    totals = {}
    for word, count in BoW:
        if count <= 0:
            continue  # such entries contribute no occurrences
        bucket = fhash(word, M)
        totals[bucket] = totals.get(bucket, 0) + count
    return [[bucket, totals[bucket]] for bucket in sorted(totals)]

#-----------------------------------------------
# Script entry point: run the interactive program defined by main().
main()
# 6330247121 (30.00) 97 (2021-03-22 15:29)
# Interactive bag-of-words script: prompts, then prints the four counts and
# the (optionally hashed) bag-of-words sorted by key.
filename = input("File name = ")
yn = input("Use feature hashing ? (y,Y,n,N) ").lower()
while not(yn == "y" or yn == "n") :
    print("Try again.")
    yn = input("Use feature hashing ? (y,Y,n,N) ").lower()
if yn == "y" :
    M = int(input("M = "))
print("-------------------")

# Lines of the input file, lower-cased and without their terminators.
with open(filename) as f :
    fl = [line.strip("\n").lower() for line in f.readlines()]
charcount = sum(len(line) for line in fl)
print("char count =" ,charcount)

alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
acount = 0
wl = []   # every alphanumeric word of the file
for line in fl :
    ws = ""
    for j in line :
        if j in alnum :
            acount += 1
            ws += j
        else :
            ws += " "
    wl += ws.split()
print("alphanumeric count =",acount)
print("line count =",len(fl))
print("word count =",len(wl))

with open("stopwords.txt") as stop_file :
    stop = stop_file.read().lower().split()
wns = [w for w in wl if w not in stop]

# Count each kept word, or its hash bucket sum(ord(c)*37**i) % M.
tally = {}
for w in wns :
    if yn == "y" :
        w = sum(ord(c) * 37**i for i, c in enumerate(w)) % M
    tally[w] = tally.get(w, 0) + 1
bagofwords = sorted([key, n] for key, n in tally.items())
print("BoW =",bagofwords)
# 6330248821 (24.80) 98 (2021-03-22 20:51)
# Character classes used by the script below to recognise letters and digits.
alp = 'abcdefghijklmnopqrstuvwxyz'
num = '0123456789'
# First round of prompts; the answer is lower-cased here and validated later.
file = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ').lower()
# Placeholder; replaced by the user's input when hashing is chosen.
M = ''
def nFhash(w):
    """Return ``[word, count]`` pairs for *w* in order of first appearance."""
    seen = []
    tally = []
    for word in w:
        if word in seen:
            tally[seen.index(word)] += 1
        else:
            seen.append(word)
            tally.append(1)
    return [[word, n] for word, n in zip(seen, tally)]
def yFhase(w,m):
    """Return ``[bucket, count]`` pairs for the hashed words of *w*.

    Each word is hashed as ``sum(ord(c) * 37**i) % int(m)``; pairs appear in
    order of the first occurrence of each bucket.
    """
    buckets = []
    for word in w:
        code = sum(ord(c) * (37 ** i) for i, c in enumerate(word))
        buckets.append(code % int(m))
    seen = []
    tally = []
    for e in buckets:
        if e in seen:
            tally[seen.index(e)] += 1
        else:
            seen.append(e)
            tally.append(1)
    return [[e, n] for e, n in zip(seen, tally)]

# Main script: validates the hashing answer, then prints the four counts and
# the (optionally hashed) bag-of-words.
# Exact membership: a substring test against 'ny' would also accept '' and 'ny'.
while x not in ('n', 'y'):
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ').lower()

if x == 'y':
    M = input('M = ')   # kept as text; yFhase() converts it with int()

with open('stopwords.txt','r') as sFile:
    stop_words = sFile.read().lower().split()

charCount = 0
alpCount = 0
lineCount = 0
clearedWords = []   # lower-cased alphanumeric words of the file
with open(file,'r') as wFile:
    for line in wFile:
        lineCount += 1
        # Only the line terminator is excluded from the character count.
        charCount += len(line.rstrip('\n'))
        text = ''
        for char in line.lower():
            if char in alp or char in num:
                text += char
                alpCount += 1
            else:
                # Punctuation separates words instead of being deleted,
                # so "a,b" yields two words rather than "ab".
                text += ' '
        clearedWords += text.split()
wordCount = len(clearedWords)

print('-------------------')
print('char count =',charCount)
print('alphanumeric count =',alpCount)
print('line count =',lineCount)
print('word count =',wordCount)
deletedWord = [w for w in clearedWords if w not in stop_words]
if x == 'y':
    print('BoW =',sorted(yFhase(deletedWord,M)))
else :
    print('BoW =',sorted(nFhash(deletedWord)))


# 6330249421 (26.00) 99 (2021-03-21 22:15)
# Interactive bag-of-words script: prompts, then prints the four counts and
# the (optionally hashed) bag-of-words sorted by key.
char=0     # characters, newlines excluded
alpha=0    # ASCII letters and digits
lines=0
ms=''      # text with every non-alphanumeric character turned into a space

file_name=input('File name = ')
want=input('Use feature hashing ? (y,Y,n,N) ')
while want not in ['Y','y','N','n']:
    print('Try again.')
    want=input('Use feature hashing ? (y,Y,n,N) ')
Ball=want in ['y','Y']   # True when feature hashing was requested
if Ball :
    M=int(input('M = '))

# split() treats newlines like any other whitespace, so the whole file can be
# tokenised at once.
with open('stopwords.txt', 'r') as coke:
    stopwords=coke.read().split()

print('-------------------')
with open(file_name, 'r') as fn:
    for line in fn:
        lines+=1
        for c in line :
            if c!="\n" :
                char+=1
            if 'a'<=c<='z' or 'A'<=c<='Z' or '0'<=c<='9':
                alpha+=1
                ms+=c
            else:
                ms+=' '
print('char count = '+str(char))
print('alphanumeric count = '+str(alpha))
print('line count = '+str(lines))

a=ms.lower().split()   # all words, lower-cased
words=len(a)
print('word count = '+str(words))

# Words that survive the stop-word filter.
answer=[w for w in a if w not in stopwords]

# Count the words themselves, or their hash buckets sum(37**k*ord(c)) % M.
# A tally needs no sentinel element, so an empty word list is fine.
if Ball :
    keys=[sum(37**k*ord(c) for k,c in enumerate(w))%M for w in answer]
else :
    keys=answer
tally={}
for key in keys:
    tally[key]=tally.get(key,0)+1
answeranswer=sorted([key,n] for key,n in tally.items())
print('BoW =',answeranswer)
# 6330250021 (29.00) 100 (2021-03-20 11:58)

def fhash(w,M):
    """Return the base-37 polynomial hash of string *w*, reduced modulo *M*.

    The character at 0-based position i contributes ord(w[i]) * 37**i.
    """
    total = sum(ord(ch) * 37**pos for pos, ch in enumerate(w))
    return total % M

# Interactive driver: ask for the input file and the hashing mode, then report
# the character, alphanumeric, line and word counts followed by the bag of words.
file_name=input('File name = ')
fh=input('Use feature hashing ? (y,Y,n,N) ').lower()
# Compare against the exact answers: `fh not in 'yn'` is a substring test and
# would wrongly accept '' and 'yn'.
while fh not in ('y','n'):
    print('Try again.')
    fh=input('Use feature hashing ? (y,Y,n,N) ').lower()
if fh=='y':
    M=int(input('M = '))

# Stop words are the whitespace-separated tokens of stopwords.txt; a set gives
# constant-time lookups while filtering below.
with open('stopwords.txt','r') as stop:
    sw=set(stop.read().split())

ch=0    # characters, counted per line after stripping surrounding whitespace
al=0    # characters in 0-9 / a-z after lower-casing
li=0    # lines
pieces=[]
with open(file_name,'r') as f:
    for line in f:
        li+=1
        ch+=len(line.strip())
        for a in line.lower():
            if '0'<=a<='9' or 'a'<=a<='z':
                pieces.append(a)
                al+=1
            else:
                # every other character acts as a word separator
                pieces.append(' ')
word=''.join(pieces)
wordlist=word.split()
wd=len(wordlist)

# Count every non-stop word (or its hash bucket) in a single pass.
counts={}
for w in wordlist:
    if w in sw:
        continue
    key=fhash(w,M) if fh=='y' else w
    counts[key]=counts.get(key,0)+1
bow=sorted([key,n] for key,n in counts.items())
print('-------------------')
print('char count =',ch)
print('alphanumeric count =',al)
print('line count =',li)
print('word count =',wd)
print('BoW =',bow)
# 6330251621 (26.00) 101 (2021-03-22 10:50)
# Interactive driver (part 1): read the options and the input file, print the
# four counts, and leave the stop-word-filtered token list in `sam`.
file_name = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
while b not in ('y','Y','n','N'):
    print(('Try again.'))
    b = input('Use feature hashing ? (y,Y,n,N) ')
if b in ('y','Y'):
    # Kept as text; fhash() further down converts it with int().
    M = input('M = ')

ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'

# Read the file once (it used to be opened twice) and close it right away.
with open(file_name, 'r') as z:
    lines = z.readlines()
line_count = len(lines)
s = ''.join(lines)

# A character is alphanumeric when its lower-case form is in ALNUM.
alp = sum(1 for i in s if i.lower() in ALNUM)
# Every character except the line breaks.
char_total = len(s) - s.count('\n')

# Lower-case the text, turn every non-alphanumeric character into a space and
# split: the tokens are exactly the runs of alphanumeric characters.
sam = ''.join(i.lower() if i.lower() in ALNUM else ' ' for i in s).split()
word = len(sam)

print('-------------------')
print('char count = '+str(char_total))
print('alphanumeric count = '+str(alp))
print('line count = '+str(line_count))
print('word count = '+str(word))

# The stop-word file is tokenised the same way; the handle is now closed.
with open('stopwords.txt', 'r') as st:
    sto = st.read()
stop = ''.join(i.lower() if i.lower() in ALNUM else ' ' for i in sto).split()
blocked = set(stop)
sam = [w for w in sam if w not in blocked]

Bow = []
def cut(K):
    """Collapse runs of equal neighbouring items in list *K*.

    One item is kept per run.  The item of the final run is appended only if
    it is not already in the result, so for a sorted list the result is the
    list of distinct items.  An empty list yields an empty list (this used to
    raise IndexError).
    """
    if not K:
        return []
    finish = [cur for cur, nxt in zip(K, K[1:]) if cur != nxt]
    if K[-1] not in finish:
        finish.append(K[-1])
    return finish
    
if b in ['n','N']:
    # Plain mode: pair every remaining word with its frequency, sort, then
    # drop the repeated pairs.
    Bow.extend([token, sam.count(token)] for token in sam)
    Bow.sort()
    finish = cut(Bow)
    print('BoW = '+str(finish))

elif b in ['y','Y']:
    def fhash(w,M):
        """Return the base-37 polynomial hash of *w* modulo int(M)."""
        total = 0
        for power, ch in enumerate(w):
            total += ord(ch)*(37**power)
        return total % int(M)
    # Hashing mode: the same procedure applied to the words' hash buckets.
    Bow.extend(fhash(token,M) for token in sam)
    Bow.sort()
    Bowlast = [[bucket, Bow.count(bucket)] for bucket in Bow]
    finish = cut(Bowlast)
    print('BoW = '+str(finish))



# 6330252221 (13.30) 102 (2021-03-22 22:10)
def fhash(w,M):
    """Return the base-37 polynomial hash of *w* modulo int(M).

    *M* may be an int or a numeric string; it is converted with int().
    """
    total = sum(ord(ch) * 37**k for k, ch in enumerate(w))
    return int(total % int(M))
def remove(t):
    """Return *t* with every separator character replaced by one space.

    Separators are the space and the characters " ' / \\ ( ) < > [ ] . , : ;
    - _ & # ! *.  The backslash is written as an explicit escape; the
    previous literal relied on the invalid escape sequence "\\(".
    """
    separators = " \"'/\\()<>[].,:;-_&#!*"
    return ''.join(' ' if e in separators else e for e in t)
def word(t):
    """Return the number of words in *t* once separators are blanked out."""
    return len(remove(t).split())
def char(t):
    """Return the summed length of the items of *t* (len(t) for a string)."""
    return sum(len(e) for e in t)
def alphanumeric(t):
    """Return how many characters of *t* are a-z or 0-9 after lower-casing."""
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    return sum(1 for e in t.lower() if e in allowed)
def word1(t):
    """Return the lower-cased words of *t* as a new list."""
    return list(remove(t).lower().split())

#---------------------------
# Interactive driver: prompt for the file and the hashing mode, then print the
# four counts and the bag of words.
file=input("File name = ")
# Read the text up front so the handle is closed promptly.
with open(file,'r') as text:
    lines=text.readlines()
hashing=input('Use feature hashing ? (y,Y,n,N) ')


#--------------------------
# Stop words: every whitespace-separated token of stopwords.txt.
with open('stopwords.txt','r') as stopword:
    stopwords=stopword.read().split()
#-------------------------
line_count=len(lines)
# Leave each line's terminating newline out explicitly.  The earlier
# "total - line_count + 1" correction was only right when the last line had
# no newline, and gave 1 for an empty file.
char_count=sum(char(line.rstrip('\n')) for line in lines)
alphanumeric_count=sum(alphanumeric(line) for line in lines)
word_count=sum(word(line) for line in lines)
word2=[]
for line in lines:
    word2+=word1(line)
# Sorted words that are not stop words.
word3=sorted(e for e in word2 if e not in stopwords)
#----------------------
while hashing not in ['y','Y','n','N']:
    print('Try again.')
    hashing=input('Use feature hashing ? (y,Y,n,N) ')
if hashing.lower() == 'y':
    M=input("M = ")
    #------------ case y: count hash buckets instead of words
    keys=sorted(fhash(w,M) for w in word3)
else:
    keys=word3

# Run-length count over the sorted keys.  Starting from an empty list keeps
# this safe when no words remain (indexing [-1] used to raise IndexError).
BoW=[]
for key in keys:
    if BoW and BoW[-1][0]==key:
        BoW[-1][1]+=1
    else:
        BoW.append([key,1])

#---------------------
print('-------------------')
print('char count =',char_count)
print('alphanumeric count =',alphanumeric_count)
print('line count =',line_count)
print('word count =',word_count)
print('BoW =',BoW)
# 6330253921 (22.27) 103 (2021-03-22 23:21)

fn = input('File name = ')
# Read the whole text inside a context manager so the handle is released even
# if reading fails; `f` stays bound, so a later f.close() is a harmless no-op.
with open(fn) as f:
    k = f.read()
def chcount(k):
    """Return *k* with every newline character removed."""
    return ''.join(ch for ch in k if ch != '\n')
def alphacount(k):
    """Return the number of ASCII letters and digits in *k*."""
    # Newlines are dropped first, as before; they would not be counted anyway.
    text = ''.join(ch for ch in k if ch != '\n')
    letters_and_digits = ('abcdefghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                          '0123456789')
    return sum(1 for ch in text if ch in letters_and_digits)
    
def habuntud(k):
    """Return the number of lines in the text *k*.

    Each newline ends a line; a final line without a newline also counts.
    Previously the result was "newlines + 1", which reported one line too
    many for text ending in a newline and 1 for empty text.
    """
    c = k.count('\n')
    if k and not k.endswith('\n'):
        c += 1
    return c
def wordcount(k):
    """Split *k* into words, treating newlines and listed punctuation as blanks.

    Returns the list of words (not a count); letter case is left untouched
    and characters outside the separator set stay inside their word.
    """
    separators = frozenset('\n"\',()[].!?/:;{}+-*^#&=$~_%|')
    spaced = ''.join(' ' if ch in separators else ch for ch in k)
    return spaced.split()

# Load the stop-word list.  The file is named stopwords.txt (the earlier
# 'stopword.txt' looks like a typo).  Entries are lower-cased because every
# lookup below compares against word.lower().
with open('stopwords.txt') as s:
    ss = s.read()
stopwords = wordcount(ss.lower())
def fhash(a,m):
    """Return the base-37 polynomial hash of *a* modulo int(m)."""
    total = 0
    for power, ch in enumerate(a):
        total += ord(ch)*(37**power)
    return total % int(m)
def check(a):
    """Return *a* once it is one of n/N/y/Y, re-prompting until it is.

    For every invalid answer 'Try again.' is printed and a new answer is read.
    """
    valid = ('n', 'N', 'y', 'Y')
    while a not in valid:
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ')
    return a
def kunsum(k):
    """Return the words of *k* whose lower-case form is not a stop word."""
    return [w for w in wordcount(k) if w.lower() not in stopwords]
def bow2(a,m):
    """Return the feature-hashed bag of words of text *a* for modulus *m*.

    The result is a list of [bucket, count] pairs in ascending bucket order,
    containing only buckets hit by at least one non-stop word.  Counts are
    gathered in one pass over the words instead of scanning all words once
    for every bucket in range(m).
    """
    counts = {}
    for w in kunsum(a):
        bucket = fhash(w, m)
        counts[bucket] = counts.get(bucket, 0) + 1
    return [[bucket, counts[bucket]] for bucket in sorted(counts)]
def bow1(h):
    """Return the plain bag of words of text *h*.

    A word is skipped when its lower-case form is a stop word; the remaining
    words are counted exactly as written (case-sensitive).  The result is a
    sorted list of [word, count] pairs.

    Counting uses a dictionary.  The earlier quadratic scan overwrote
    duplicates with stopwords[0] as a marker, which failed when the stop list
    was empty and miscounted when that marker was not itself filtered out.
    """
    counts = {}
    for w in wordcount(h):
        if w.lower() in stopwords:
            continue
        counts[w] = counts.get(w, 0) + 1
    return sorted([w, n] for w, n in counts.items())
   
# Ask for the hashing mode, then print the separator, the four counts and the
# bag of words for the chosen mode.
fd = input('Use feature hashing ? (y,Y,n,N) ')
fh = check(fd)
plain = (fh == 'n' or fh == 'N')
if not plain:
    m = int(input('M = '))
print('-'*19)
print('char count = '+str(len(chcount(k))))
print('alphanumeric count = '+str(alphacount(k)))
print('line count = '+str(habuntud(k)))
print('word count = '+str(len(wordcount(k))))
if plain:
    print('BoW = '+str(bow1(k)))
else:
    print('BoW = '+str(bow2(k,m)))

f.close()
s.close()
# 6330254521 (30.00) 104 (2021-03-22 23:03)
def fhash(w,m):
    """Return the base-37 polynomial hash of *w* modulo *m*."""
    return sum(ord(ch)*(37**idx) for idx, ch in enumerate(w)) % m
def basic_count(file_name):
    """Count characters, ASCII alphanumerics and lines of a text file.

    Returns (char_count, alphanumeric_count, line_count).  Line terminators
    are excluded from the character count one line at a time.  The earlier
    "total - lines + 1" correction was only right when the last line had no
    newline, and reported 1 character for an empty file.
    """
    charcount=0
    engnum_count=0
    linecount=0
    with open(file_name,'r') as z:
        for y in z:
            linecount+=1
            charcount+=len(y.rstrip('\n'))
            for e in y:
                if "a" <= e <="z" or "A" <= e <= "Z" or "0"<= e <="9":
                    engnum_count+=1
    return charcount,engnum_count,linecount
def split_word(file_name):
    """Return (words, word_count) for the text file *file_name*.

    The text is lower-cased and every character that is not an ASCII letter
    or digit is treated as a blank before splitting.
    """
    with open(file_name,'r') as file_count:
        text = file_count.read().lower()
    cleaned = ''.join(
        ch if ("a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9") else ' '
        for ch in text
    )
    real_list = cleaned.split()
    return real_list,len(real_list)
def function_in_main(file_name):
    """Print the four counts for *file_name* and return its filtered words.

    The returned list holds the file's words that are not in stopwords.txt,
    in sorted order (duplicates kept).
    """
    charcount_true,engnum_count,linecount = basic_count(file_name)
    print("char count =",charcount_true)
    print("alphanumeric count =",engnum_count)
    print("line count =",linecount)
    list_word,num_word = split_word(file_name)
    print("word count =",num_word)
    with open("stopwords.txt","r") as stopword_string:
        stopword_list = stopword_string.read().lower().split()
    return sorted(e for e in list_word if e not in stopword_list)
def main():
    """Run the interactive program: prompts, counts, then the bag of words.

    The hashing question is repeated until the answer is y/Y/n/N.  With
    hashing the modulus M is read before the counts are printed.  The two
    modes share one counting pass (a dictionary) instead of two copies of a
    quadratic loop; the output is unchanged.
    """
    file_name =input("File name = ")
    while True:
        Use_Hashing=input("Use feature hashing ? (y,Y,n,N) ")
        if Use_Hashing in ('y','Y','n','N'):
            break
        print("Try again.")
    use_hash = Use_Hashing in ('y','Y')
    if use_hash:
        M = int(input("M = "))
    list_word_cut = function_in_main(file_name)
    counts = {}
    for word in list_word_cut:
        key = fhash(word,M) if use_hash else word
        counts[key] = counts.get(key,0)+1
    bow = sorted([key,n] for key,n in counts.items())
    # NOTE: the label keeps its trailing blank (print adds a second one) so
    # the output stays byte-for-byte what this program printed before.
    print("BoW = ",bow)
#-------------------------------------------------------------------------------------------------         
# Start the interactive program as soon as this script is executed.
main()
# 6330255121 (10.00) 105 (2021-03-22 23:47)


# Prompt for the file and the hashing mode; repeat the question until the
# answer is one of y/Y/n/N, then read M when hashing was requested.
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
if fh in ('y', 'Y'):
    M = int(input('M = '))

print('-------------------')


def tostopwords():
    """Return the distinct lower-cased words of stopwords.txt, in file order."""
    stopwordlist = []
    with open('stopwords.txt', 'r') as stopwords:
        for line in stopwords:
            for token in map(str.lower, line.split()):
                if token not in stopwordlist:
                    stopwordlist.append(token)
    return stopwordlist


# Read the file once; all statistics below are derived from `lines`.
with open(file_name, 'r') as wfile:
    lines = wfile.readlines()

# Characters, not counting the newlines.
lenght = sum(1 for w in lines for ww in w if ww != '\n')

# Words are maximal runs of ASCII letters/digits; `l` sums their lengths,
# i.e. it is the alphanumeric character count.
l = 0
words = []
word = ''
for w in lines:
    for ww in w:
        if ('a'<=ww<='z') or ('A'<=ww<='Z') or ('0'<=ww<='9'):
            word += ww
        elif word:
            l += len(word)
            words.append(word)
            word = ''
# Flush a word that ends the file with no separator after it; it used to be
# dropped from both the word list and the alphanumeric count.
if word:
    l += len(word)
    words.append(word)
    word = ''

words2 = list(words)

linecount = len(lines)


                
def fhash(word,M):
    """Return the polynomial hash of *word* in base G = 37, modulo *M*."""
    G = 37
    return sum(ord(ch)*(G**i) for i, ch in enumerate(word)) % M

#----------------------------------------------------------------
# Build the bag of words: lower-case each word, drop the stop words and count
# the word itself or, with hashing, its bucket.  The earlier loop only
# rebound its loop variable and the BoW line printed nothing after the label.
stopwordlist = tostopwords()
counts = {}
for w in words2:
    w = w.lower()
    if w in stopwordlist:
        continue
    key = fhash(w, M) if fh in ('y', 'Y') else w
    counts[key] = counts.get(key, 0) + 1
bow = sorted([key, n] for key, n in counts.items())


print('char count =', lenght)
print('alphanumeric count =', l)
print('line count =', linecount)
print('word count =', len(words2))
print('BoW =', bow)
# 6330256821 (0.00) 106 (2021-03-22 23:56)
def fhash(w,M) :
    """Return the base-37 polynomial hash of *w* modulo *M*.

    The exponent is the character's position; the earlier code raised 37 to
    the character itself, which is a TypeError.
    """
    x=0
    for i, ch in enumerate(w):
        x+=ord(ch)*(37**i)
    return x%M
def remove_punc(t):
    """Return *t* with each of ' " / \\ ( ) . , ; : replaced by a space.

    The result is returned after the whole string has been processed; the
    earlier `return` sat inside the loop, so only the first character came
    back (and None for an empty string).
    """
    out=""
    for e in t:
        if e  in "\'\"/\\().,;:":
            out+=" "
        else:
            out+=e
    return out
# Interactive driver: prompt for the file and hashing mode, then print the
# character, alphanumeric, line and word counts.
a=input("File name =")
x=input("Using feature hashing ? ")
while x not in ['y','Y','n','N']:
    print("Try again.")
    x=input("Using feature hashing ? ")
if x in ['y','Y']:
    M=int(input('M='))
# Stop words: every whitespace-separated token of stopwords.txt.  The earlier
# code skipped the first line, appended an undefined name, never called
# close() and used list.join(), which does not exist.
with open("stopwords.txt",'r') as fn:
    l=fn.readlines()
l_new=" ".join(l)
bow_word=l_new.split()
print("-"*20)
# Read the file the user named; "sample.txt" used to be hard-coded and its
# first line was dropped.
with open(a,"r") as sample:
    s=[line.rstrip("\n") for line in sample]
s_new=" ".join(s)
sample_word=s_new.split()
# Characters per line, newlines excluded (len() of an int raised TypeError).
z=sum(len(line) for line in s)
print("char count=",z)
# Count ASCII letters and digits directly from the text.
b=0
for e in s_new:
    if 'a'<=e<='z' or 'A'<=e<='Z' or '0'<=e<='9':
        b+=1
print("alphanumeric count =", b)
print("line count =",len(s))
print("word count =",len(sample_word))
# NOTE(review): no bag of words is printed, so M, fhash and bow_word are
# still unused here - confirm whether BoW output is expected.

    

    



 
    

      


# 6330257421 (22.20) 107 (2021-03-22 19:43)
def fhash(w, M):
    """Return the polynomial hash of *w* in base G = 37, modulo *M*."""
    G = 37
    weighted = (ord(ch) * (G**i) for i, ch in enumerate(w))
    return sum(weighted) % M
# Interactive driver: prompt for the file and hashing mode, then print the
# four counts and the bag of words.
tx = input('File name = ')
hashing = input('Use feature hashing ? (y,Y,n,N) ')
# Exact comparison: the substring test against 'yYnN ' also accepted ' ',
# 'yY', 'nN ' and the like.  'Try again.' is printed before asking again
# (it used to appear only after the next answer had been typed).
while hashing not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = hashing in ('y', 'Y')
if use_hash:
    m = int(input('M = '))
print('-------------------')
i = 0     # characters, newlines excluded
j = 0     # ASCII letters and digits
lc = 0    # lines
words = []
with open(tx, 'r') as file:
    for l in file:
        lc += 1
        word = ''
        for k in l:
            if k != '\n':
                i += 1
            if ('a'<=k<='z') or ('A'<=k<='Z') or ('0'<=k<='9'):
                j += 1
                word += k
            elif word:
                words.append(word)
                word = ''
        # Flush a word that reaches the end of the line with no separator
        # after it (the last line of a file without a final newline).
        if word:
            words.append(word)
# Distinct lower-cased stop words.
stopword = []
with open('stopwords.txt', 'r') as stop:
    for line in stop:
        for word in line.strip().split():
            word = word.lower()
            if word not in stopword:
                stopword.append(word)
# Count each non-stop word (or its hash bucket) and report the pairs sorted.
counts = {}
for c in words:
    c = c.lower()
    if c in stopword:
        continue
    key = fhash(c, m) if use_hash else c
    counts[key] = counts.get(key, 0) + 1
r = sorted([key, n] for key, n in counts.items())
print('char count =', i)
print('alphanumeric count =', j)
print('line count =', lc)
print('word count =', len(words))
print('BoW =', r)
# 6330258021 (30.00) 108 (2021-03-22 20:09)

def readlines(fn) :
    """Return the lines of text file *fn* without their newline characters.

    Every line is returned, including blank ones.  The earlier loop stopped
    at an empty first line and returned [] for a file starting with a blank
    line.
    """
    with open(fn,'r') as fin :
        return [line.strip('\n') for line in fin]
def fhash(w,m) :
    """Return the base-37 polynomial hash of *w* modulo *m*."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(w)) % m

# Interactive driver: prompt for the file and hashing mode, then print the
# four counts and the sorted bag of words.
file_name = input('File name = ').strip()
mode = input('Use feature hashing ? (y,Y,n,N) ').strip().lower()
while mode not in ('n','y') :
    print('Try again.')
    mode = input('Use feature hashing ? (y,Y,n,N) ').strip().lower()
if mode == 'y' :
    m = int(input('M = ').strip())
print('-------------------')
lines = readlines(file_name)
cc = sum(len(line) for line in lines)
ca = sum(1 for line in lines for c in line if c.isalnum())
# One blank before each line keeps words on neighbouring lines apart; every
# non-alphanumeric character becomes a blank as well.
words = ''.join(
    ' ' + ''.join(c if c.isalnum() else ' ' for c in line)
    for line in lines
)
print('char count = %d' % cc)
print('alphanumeric count = %d' % ca)
print('line count = %d' % len(lines))
words = words.split()
print('word count = %d' % len(words))
stopwords = []
for row in readlines('stopwords.txt') :
    stopwords.extend(row.lower().split())
# Tally each non-stop word, or its hash bucket in hashing mode.
tally = {}
for word in words :
    word = word.lower()
    if word in stopwords :
        continue
    if mode == 'y' :
        word = fhash(word,m)
    tally[word] = tally.get(word,0) + 1
bow = [[key,count] for key,count in tally.items()]
print('BoW = %s' % sorted(bow))
# 6330259721 (15.25) 109 (2021-03-21 11:33)

# Interactive driver (part 1): read the options and files, compute the four
# counts and the per-word frequencies (`preBoW`) used by the output section.
file_name = input("File name = " )
# Loop until the answer is exactly y/Y/n/N; the substring test `not in "yn"`
# also let '' and 'yn' through.
while True:
    is_feature_hashing = input("Use feature hashing ? (y,Y,n,N)").lower()
    if is_feature_hashing in ("y","n"):
        break
    print("Try again.")


#read stopwords.txt -> convert to list
with open("stopwords.txt","r") as fn_stopwords:
    li_of_stopwords = fn_stopwords.read().split()

#read file_name: line_count, char_count and the separated words
alp = "abcdefghijklmnopqrstuvwxyz"
number = "0123456789"
line_count = 0
char_count = 0
seperated_word = []
with open(file_name,"r") as fn_file_name:
    for line in fn_file_name:
        line_count += 1
        # characters of the line after stripping surrounding whitespace
        char_count += len(line.strip())
        # Every character that is not a-z/0-9 separates words.  The earlier
        # scan compared each character with the token's last character by
        # value, which split tokens such as "noon" in the wrong places, and
        # it removed empty strings from the list while iterating over it.
        spaced = "".join(
            e if (e in alp or e in number) else " " for e in line.lower()
        )
        seperated_word += spaced.split()

#alphanumeric count: the words contain nothing but alphanumerics
alphanumeric_count = sum(len(word) for word in seperated_word)
#word count
word_count = len(seperated_word)

#remove stopwords, then count each remaining word in first-seen order
cut_stopwords = [word for word in seperated_word if word not in li_of_stopwords]
counts = {}
for word in cut_stopwords:
    counts[word] = counts.get(word,0) + 1
preBoW = [[word,n] for word,n in counts.items()]
        
       
def print_other(char_count, alphanumeric_count,line_count,word_count):
    """Print the four statistics, one 'label = value' line each.

    The first label now has a single space before '=' like the other three
    (it used to print 'char count  = ').
    """
    print("char count =", char_count)
    print("alphanumeric count =", alphanumeric_count)
    print("line count =", line_count)
    print("word count =", word_count)
def fhash(w,m):
    """Return the base-37 polynomial hash of *w* modulo *m*."""
    ans = 0
    for i, ch in enumerate(w):
        ans += ord(ch)*(37**i)
    return ans % m
        
    
if is_feature_hashing == "n":
    # Plain mode: the per-word counts are the bag of words once sorted.
    preBoW.sort()
    print("-------------------")
    print_other(char_count, alphanumeric_count,line_count,word_count)
    print("BoW = "+str(preBoW))
elif is_feature_hashing == "y":
    m = input("M = ")
    print("-------------------")
    # Hashing mode: add each word's count to the bucket its hash selects.
    tally = {}
    for word, times in preBoW:
        bucket = fhash(word,int(m))
        tally[bucket] = tally.get(bucket,0) + times
    BoW = sorted([bucket,total] for bucket,total in tally.items())
    print_other(char_count, alphanumeric_count,line_count,word_count)
    print("BoW = "+ str(BoW))
# 6330260221 (25.50) 110 (2021-03-22 21:24)
# Module-level state used by the script below.
stopwords = []   # tokens read from stopwords.txt
words = []       # character buffer for the line currently being scanned
words2 = []      # every word of the input file
words3 = []      # words2 without the stop words
b=[]             # hash value of each word in words3 (hashing mode)
bow = []         # resulting [key, count] pairs
n = 0            # index of the current character within the current line
count = 0        # summed length of all lines, newlines included
line = 0         # number of lines read
alphacount = 0   # number of letters and digits seen
alphabet = 'abcdefghijklmnopqrstuvwxyz'
num='0123456789'
def fhash(w,M):
    """Return the base-37 polynomial hash of *w* modulo *M*."""
    terms = [ord(ch)*(37**i) for i, ch in enumerate(w)]
    return sum(terms) % M

# Interactive driver: prompt for the file and hashing mode, then print the
# four counts and the bag of words.
a = input('File name = ')
while True:
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    if feature in ('y','Y'):
        M = int(input('M = '))
        break
    if feature in ('n','N'):
        M = 'No'   # sentinel meaning "no hashing"
        break
    print('Try again.')

count = 0
line = 0
alphacount = 0
words2 = []
with open(a,'r') as word:
    for i in word:
        line+=1
        text = i.rstrip('\n')
        # Count characters without the newline, so the total is right
        # whether or not the file ends with one.
        count+=len(text)
        cleaned = []
        for i2 in text.lower():
            if i2 in alphabet or i2 in num:
                cleaned.append(i2)
                alphacount+=1
            else:
                # Every other character separates words.  Previously a
                # punctuation mark was dropped unless a lowercase letter
                # followed it, which glued tokens such as "a-1" together.
                cleaned.append(' ')
        words2+=''.join(cleaned).split()

with open('stopwords.txt','r') as stop_word:
    stopwords = stop_word.read().split()
words3 = [k for k in words2 if k not in stopwords]

# Count the words, or their hash buckets when M holds a modulus.
counts = {}
for j in words3:
    key = fhash(j,M) if M != 'No' else j
    counts[key] = counts.get(key,0)+1
bow = [[key,counts[key]] for key in sorted(counts)]
print('-------------------')
print('char count =',count)
print('alphanumeric count =',alphacount)
print('line count =',line)
print('word count =',len(words2))
print('BoW =',bow)
# 6330261921 (30.00) 111 (2021-03-21 23:39)
# Interactive driver (part 1): prompts, counts and the plain bag of words.
file_n=input('File name = ')
while True:
    ufh=input('Use feature hashing ? (y,Y,n,N) ')
    # Exact match: the substring test `ufh in 'YyNn'` also accepted '' (and
    # then asked for M) as well as 'Yy', 'Nn', ...
    if ufh in ('Y','y','N','n'):break
    print('Try again.')
if ufh in ('Y','y'):
    M=int(input('M = '))
print('-------------------')
# read the stop words: distinct lower-cased tokens, in file order
with open('stopwords.txt','r') as filestw:
    sw=filestw.read().lower().split()
lsw=list(dict.fromkeys(sw))


# read the input file
with open(file_n,'r') as file_name:
    s1=[i.strip('\n') for i in file_name]
line=len(s1)
char=len(''.join(s1))
s1=' '.join(s1).lower()
s2=''.join(i if i.isalnum() else ' ' for i in s1).split()
s2.sort()# words without special characters, sorted
word=len(s2)
alphanumeric=len(''.join(s2))
#-------------------------

s4=[i for i in s2 if not i in lsw]
s5=s4 # sorted words without stop words, duplicates kept
#-------------------------
print('char count =',char)
print('alphanumeric count =',alphanumeric)
print('line count =',line)
print('word count =',word)
# Run-length count over the sorted words (replaces a loop that re-sliced the
# list on every step).
bow=[]
for i in s4:
    if bow and bow[-1][0]==i:
        bow[-1][1]+=1
    else:
        bow.append([i,1])
#-----------------------        
def fhash(w,M):
    """Return the base-37 polynomial hash of *w* modulo *M*."""
    total = 0
    for power, ch in enumerate(w):
        total += ord(ch) * 37**power
    return total % M

# Output section: print the hashed bag of words when hashing was requested,
# otherwise the plain one built earlier.
bow1=[]
if ufh in 'Yy':
    bow1=sorted(fhash(i,M) for i in s5)
    # Run-length count over the sorted buckets.
    bow3=[]
    run=0
    for pos,bucket in enumerate(bow1):
        run+=1
        if pos+1==len(bow1) or bow1[pos+1]!=bucket:
            bow3.append([bucket,run])
            run=0
    print('BoW =',bow3)
else :
    print('BoW =',bow)
# 6330262521 (28.00) 112 (2021-03-21 21:25)
# Prompt for the input file and the hashing choice; the answer stored in `a`
# is validated by the loop further down.
file_name = input('File name = ')
a = input('Use feature hashing ? (y,Y,n,N) ')
def fhash(w,M):
    """Return the polynomial hash of *w* in base G = 37, modulo *M*."""
    G = 37
    return sum(ord(ch)*G**i for i, ch in enumerate(w)) % M
def type(a):
    """Classify *a*: 'a' if it occurs in the a-z/0-9 string, otherwise 'b'.

    Note that this name shadows the built-in type() in this module.
    """
    return 'a' if a in 'abcdefghijklmnopqrstuvwxyz0123456789' else 'b'

# Validate the hashing answer; k records the mode (1 = hashing, 2 = plain).
while a.upper() not in ('Y','N'):
    print('Try again.')
    a = input('Use feature hashing ? (y,Y,n,N) ')
if a.upper() == 'Y':
    M = int(input('M = '))
    k = 1
else:
    k = 2
print("-------------------")

# Stop words: every whitespace-separated token of stopwords.txt.
with open("stopwords.txt", "r") as stop:
    p = stop.read().split()

with open(file_name, "r") as fin:
    nn = fin.readlines()

l = len(nn)                 # line count
b = ''.join(nn).lower()     # whole text, lower-cased
# endswith() is safe on an empty file, where b[-1] raised IndexError.
if b.endswith('\n'):
    b = b[:-1]
# Characters excluding the newlines that are left between the lines.
c = len(b) - b.count('\n')
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'
cc = sum(1 for i in b if i in ALNUM)
# Words: runs of a-z/0-9; every other character is a separator.
f = ''.join(i if i in ALNUM else ' ' for i in b).split()
ccc = len(f)
def cut(f):
    """Return the items of *f* that are not in `p`, each followed by a space."""
    return ''.join(i + ' ' for i in f if i not in p)
def BoW(b):
    """Print the bag of words for the module-level word list `f`.

    The mode comes from the global `k`: 2 counts the words themselves, 1
    counts their fhash(word, M) buckets.  The parameter *b* is accepted for
    compatibility with existing calls but is not used.

    Both modes now print [key, count] pairs in sorted key order (plain mode
    used to print in first-seen order), and an input with no remaining words
    prints an empty list instead of raising IndexError.
    """
    da = cut(f).split()
    if k == 2:
        keys = da
    elif k == 1:
        keys = [fhash(w,M) for w in da]
    else:
        return
    counts = {}
    for key in keys:
        counts[key] = counts.get(key,0) + 1
    print('BoW = ' + str([[key,counts[key]] for key in sorted(counts)]))
        
# Both modes print the same four statistics and then the bag of words.
if k in (1, 2):
    print('char count = '+ str(c))
    print('alphanumeric count = ' + str(cc))
    print('line count = ' + str(l))
    print('word count = ' + str(ccc))
    BoW(b)
# 6330263121 (16.03) 113 (2021-03-22 20:01)
def line(file):
    """Return the number of items (lines) in *file*."""
    return len(file)
def charcount(file):
    """Return the total number of characters over all strings in *file*."""
    return sum(map(len, file))
def alphanumericcount(file):
    """Return the number of ASCII letters and digits in the lines of *file*.

    Previously every non-blank character other than ',', '.' and '"' was
    counted, so punctuation such as '!' or '-' inflated the result.
    """
    return sum(
        1
        for row in file
        for ch in row
        if ch.isascii() and ch.isalnum()
    )
def wordcourt(file):
    """Return the number of words in the lines of *file*.

    A word is a run of ASCII letters/digits; every other character separates
    words.  Lines are handled one at a time.  Joining them with '' used to
    fuse the last word of a line with the first word of the next one.
    """
    total = 0
    for row in file:
        spaced = ''.join(
            ch if (ch.isascii() and ch.isalnum()) else ' ' for ch in row
        )
        total += len(spaced.split())
    return total
def read(x):
    """Return the lines of file *x*, each stripped of surrounding whitespace."""
    with open(x) as f:
        return [row.strip() for row in f]
def stopword(x):
    """Return every whitespace-separated token of the strings in *x*."""
    return [token for row in x for token in row.split()]
def test(x,y):
    """Return the items of *x* that do not occur in *y*, in original order."""
    return [item for item in x if item not in y]
def cut1(file):
    """Return the lower-cased words of the lines in *file*.

    A word is a run of ASCII letters/digits; every other character separates
    words.  Lines are handled one at a time: joining them with '' used to
    fuse words across line breaks, and only ',', '.', '"' and ';' were
    treated as punctuation.
    """
    words = []
    for row in file:
        spaced = ''.join(
            ch if (ch.isascii() and ch.isalnum()) else ' '
            for ch in row.lower()
        )
        words.extend(spaced.split())
    return words
def samecut(x):
    """Return *x* without duplicates, keeping each item's last occurrence."""
    return [item for pos, item in enumerate(x) if item not in x[pos+1:]]
def bow(x,y):
    """Return [item, count] for each item of *x*, counting occurrences in *y*."""
    return [[item, sum(1 for other in y if item == other)] for item in x]
def fhash(w,m):
    """Return the base-37 polynomial hash of *w* modulo *m*."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch)*(37**(i))
    return total % m
def mixfhash(x,m):
    """Return the fhash(word, m) value of every word in *x*, in order."""
    return [fhash(word,m) for word in x]

# Interactive driver: load the stop words and the input file, ask for the
# hashing mode, then print the four counts and the bag of words.
# The stop-word file is named stopwords.txt ('stopword.txt' looks like a typo).
c = read('stopwords.txt')
e = stopword(c)
a = str(input('File name = '))
file = read(a)
file6 = cut1(file)
e1 = stopword(file6)
e2 = test(e1,e)          # words of the file that are not stop words

# Ask until the answer is valid (this replaces a fixed 10000-attempt loop).
while True:
    y = str(input('Use feature hashing ? (y,Y,n,N) '))
    if y in ('n','N','y','Y'):
        break
    print('Try again.')
if y in ('y','Y'):
    g1 = int(input('M = '))
    keys = mixfhash(e2,g1)
else:
    keys = e2

print('-------------------')
print(f'char count = {charcount(file)}')
print(f'alphanumeric count = {alphanumericcount(file)}')
# The line and word count lines no longer end with a stray blank.
print(f'line count = {line(file)}')
print(f'word count = {wordcourt(file)}')
# Sort in both modes; plain mode used to print in last-occurrence order.
result = sorted(bow(samecut(keys),keys))
print(f'BoW = {result}')

    
    

    


            
# 6330264821 (30.00) 114 (2021-03-22 19:45)

def char_c(x) :
    """Count the characters of x that are not a newline."""
    return sum(1 for ch in x if ch != '\n')
def alpha_c(alpha) :
    """Count the ASCII letters and digits contained in alpha."""
    total = 0
    for ch in alpha :
        is_digit = '0' <= ch <= '9'
        is_letter = 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'
        if is_digit or is_letter :
            total += 1
    return total
        
def words_c(word) :
    """Count the words of `word`, a word being a maximal run of ASCII
    letters and digits; every other character acts as a separator."""
    pieces = []
    for ch in word :
        keep = '0' <= ch <= '9' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'
        pieces.append(ch if keep else ' ')
    return len(''.join(pieces).split())
def create_w(word) :
    """Return `word` lower-cased, with every character that is not an ASCII
    letter or digit replaced by a single space."""
    pieces = []
    for ch in word :
        keep = '0' <= ch <= '9' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'
        pieces.append(ch if keep else ' ')
    return ''.join(pieces).lower()
def create_s(stop):
    """Return `stop` with every newline replaced by a space."""
    return ''.join(' ' if ch == '\n' else ch for ch in stop)
def cutto(word) :
    """Return the sorted, lower-cased words of `word` that are not stop words.

    `word` is split into words by create_w (runs of ASCII letters/digits).
    The stop words are every whitespace-separated token of 'stopwords.txt'.
    Repeated words are kept, so the result may contain duplicates.
    """
    tokens = create_w(word).split()
    # `with` closes the file even if reading fails part-way through.
    with open('stopwords.txt', 'r') as stop_file :
        stop_words = set()
        for line in stop_file :
            stop_words.update(create_s(line).split())
    # A set makes each membership test O(1) instead of scanning a list.
    kept = [token for token in tokens if token not in stop_words]
    kept.sort()
    return kept
def flash(w,M) :
    """Base-37 polynomial hash of w reduced modulo M.

    The character at 0-based position i is weighted by 37**i.
    """
    G = 37
    weight = 1      # G**position, updated incrementally
    total = 0
    for ch in w :
        total += ord(ch) * weight
        weight *= G
    return total % M
    
# Driver script: a first pass over the file gathers the counts, then the
# user chooses between a hashed and a plain bag of words, which is built in
# a second pass over the same file.
file_name = input('File name = ')
read_file = open(file_name , 'r')

sumz = 0    # characters, newlines excluded (char_c)
line_n = 0  # lines
sumx = 0    # ASCII letters and digits (alpha_c)
sumw = 0    # words (words_c)
word = []   # stop-word-filtered words; not read again below
for line in read_file :
    sumz += char_c(line)
    line_n += 1
    sumx += alpha_c(line)
    sumw += words_c(line)
    word += cutto(line)
read_file.close()

feature_hash = input('Use feature hashing ? (y,Y,n,N) ')
read_file = open(file_name , 'r')  # reopened for the pass that builds BoW
loop = True
while loop == True :
    BoW = ''
    # NOTE(review): `in 'yY'` is a substring test, so an empty answer (and
    # the answer 'yY') is treated as yes - confirm whether that is intended.
    if feature_hash in 'yY' :
        M = int(input('M = '))
        wordy = []
        x = []
        y = []
        c = 1
        for line in read_file :
            wordy += cutto(line)
        for e in wordy :
            x.append(flash(e, M))
        x.sort()
        # Sentinel so the last run of equal hashes is flushed by the loop.
        # NOTE(review): a final hash equal to 100000 (possible only when
        # M > 100000) would compare equal to the sentinel and be dropped.
        x.append(100000)
        # Run-length encode the sorted hashes into [hash, count] pairs.
        for i in range(len(x)-1) :
            if x[i] != x[i+1] :
                y.append([x[i], c])
                c = 1
            else :
                c += 1
        BoW = y
        loop = False
    elif feature_hash in 'nN' :
        wordn = []
        nono = []
        k = 1
        for line in read_file :
            wordn += cutto(line)
        wordn.sort()
        # Sentinel word that flushes the last run; it contains upper-case
        # letters while cutto() only yields lower-cased words.
        wordn.append('TIDapORnChaWFerN28')
        # Run-length encode the sorted words into [word, count] pairs.
        for i in range(len(wordn)-1) :
            if wordn[i] != wordn[i+1] :
                nono.append([wordn[i], k])
                k = 1
            else :
                k += 1
        BoW = nono
        loop = False
    else :
        print('Try again.')
        feature_hash = input('Use feature hashing ? (y,Y,n,N) ')
read_file.close()

print('-------------------')
print('char count = '+str(sumz))
print('alphanumeric count = '+str(sumx))
print('line count = '+str(line_n))
print('word count = '+str(sumw))
print('BoW = '+str(BoW))



        
    
    
# 6330265421 (25.00) 115 (2021-03-22 22:29)

def stopwords():
    """Return the distinct, lower-cased words listed in 'stopwords.txt'.

    Words are the whitespace-separated tokens of the file; the result keeps
    the order of first appearance.
    """
    unique = []
    # `with` guarantees the file is closed even if reading fails.
    with open('stopwords.txt','r') as stop_word:
        for line in stop_word:
            for token in line.split():
                # Lower-case BEFORE the duplicate check, so that 'The'
                # following 'the' is not added a second time.
                token = token.lower()
                if token not in unique:
                    unique.append(token)
    return unique

# Prompt for the path of the text file to analyse (runs at module level).
file_name = input('File name = ')
def count(file_name):
    """Collect the basic statistics of the text file `file_name`.

    Every line is stripped of surrounding whitespace first. Returns the
    tuple (char count, alphanumeric count, line count, word count, words)
    where words are maximal runs of ASCII letters/digits in original case.
    """
    alnum_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    total_chars = 0
    total_alnum = 0
    total_lines = 0
    tokens = []
    with open(file_name) as file:
        for raw in file:
            stripped = raw.strip()
            total_lines += 1
            total_chars += len(stripped)
            flags = [ch in alnum_chars for ch in stripped]
            total_alnum += sum(flags)
            masked = ''.join(ch if ok else ' ' for ch, ok in zip(stripped, flags))
            tokens.extend(masked.split())
    return total_chars,total_alnum,total_lines,len(tokens),tokens
# a: char count, b: alphanumeric count, c: line count, d: word count,
# e: list of words (see count()).
a,b,c,d,e = count(file_name)

# Keep asking until the answer is exactly 'y' or 'n' (case-insensitive).
# A tuple is used instead of the string 'yn' because `in` on a string is a
# substring test, which would accept an empty answer (and 'yn') as valid.
f_hash = input('Use feature hashing ? (y,Y,n,N) ').lower()
while f_hash not in ('y', 'n'):
    print('Try again.')
    f_hash = input('Use feature hashing ? (y,Y,n,N) ').lower()
if f_hash == 'y':
    M = int(input('M = '))  # number of hash buckets

print('-------------------')
print('char count =',a)
print('alphanumeric count =',b)
print('line count =',c)
print('word count =',d)
    
def fhash(word,M):
    """Base-37 polynomial hash of `word` modulo M.

    Evaluated with Horner's scheme from the last character to the first,
    which equals sum(ord(word[i]) * 37**i).
    """
    acc = 0
    for ch in reversed(word):
        acc = acc * 37 + ord(ch)
    return acc % M
def BoW(words):
    """Build the bag of words for `words`.

    Words are lower-cased and stop words (from stopwords()) are dropped.
    When the module-level answer `f_hash` selects hashing, each word is
    replaced by fhash(word, M) with the module-level `M`. Returns a list of
    [key, count] pairs sorted by key; an empty list when nothing is left.
    """
    # Read the stop-word file once instead of once per input word.
    stop_words = set(stopwords())
    kept = [w.lower() for w in words if w.lower() not in stop_words]
    if f_hash in 'y':
        keys = sorted(fhash(w, M) for w in kept)
    else:
        keys = sorted(kept)
    # Run-length encode the sorted keys. Extending the last pair avoids
    # indexing keys[0], so empty and single-element inputs are handled.
    result = []
    for key in keys:
        if result and result[-1][0] == key:
            result[-1][1] += 1
        else:
            result.append([key, 1])
    return result
# Print the bag of words built from `e`, the word list returned by count().
print('BoW =',BoW(e))
# 6330266021 (16.34) 116 (2021-03-22 23:55)

def fhash(w,M):
    """Base-37 polynomial hash of the string w modulo M.

    The character at 0-based position i contributes ord(w[i]) * 37**i.
    (The previous version used exponent i-1 for i >= 1, which gave the
    first two characters the same weight.)
    """
    total = 0
    for index, ch in enumerate(w):
        total += ord(ch) * (37 ** index)
    return total % M

# Driver script: read the file name and the hashing choice, then compute
# the counts and the bag of words in a single pass over the file.
file_name = input("File name = ")
# The name is rebound to the open file object; it is not closed below.
file_name = open(file_name, "r")

while True:
    feature = input("Use feature hashing ? (y,Y,n,N) ")
    if feature == 'Y' or feature == 'y' or feature == 'N' or feature == 'n':
        break;
    else:
        print("Try again.")

if feature == 'Y' or feature == 'y':
    M = int(input("M = "))

print("-----------------------")

# Stop words: every whitespace-separated token of stopwords.txt.
with open('stopwords.txt', 'r') as stopwords:
    ban = stopwords.read().replace('\n', ' ')
ban = ban.split();

charCount = 0
alphanumbericCount = 0
lineCount = 0
wordCount = 0

temp = []   # words kept for the bag of words, in file order
BoW = []
res = []    # de-duplicated (key, count) tuples that get printed

for text in file_name:
    # len(text) includes the trailing newline of the line, if any.
    charCount += len(text)

    for t in text:
        if t.isalnum():
            alphanumbericCount += 1

    text = text.lower()
    # Words here are whitespace-separated tokens, punctuation included.
    textArray = text.split()
    wordCount += len(textArray)

    # Stop words are removed before punctuation is stripped, so a token
    # such as 'the,' does not match the stop word 'the'.
    resultwords  = [word for word in textArray if word not in ban]
    result = ' '.join(resultwords)
    # Non-alphanumeric characters other than spaces are deleted, not
    # replaced by a space, so 'a-b' becomes 'ab'.
    result =  ''.join([i for i in result if i.isalnum() or ' ' in i])

    for word in result.split():
        temp.append(word)

    lineCount += 1

print("char count = ", charCount)
print("alphanumberic count = ",alphanumbericCount)
print("line count = " , lineCount)
print("word count = ",wordCount)
if feature == 'Y' or feature == 'y':
    fhashArray = []
    for x in temp:
        fhashArray.append(fhash(x,M))

    # Count of each hash, aligned position by position with fhashArray.
    BoW = [fhashArray.count(w) for w in fhashArray]

    # Keep the first occurrence of every (hash, count) pair; no sorting.
    for i in zip(fhashArray, BoW):
        if i not in res:
            res.append(i)
    print("BoW = ", res)

else:
    BoW = [temp.count(w) for w in temp]

    # Keep the first occurrence of every (word, count) pair; no sorting.
    for i in zip(temp, BoW):
        if i not in res:
            res.append(i)
    print("BoW = ", res)

# 6330267721 (23.20) 117 (2021-03-22 14:30)
# Prompt for the input file and for the first answer to the hashing
# question; `b` is validated further down in the script.
file_name = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
def fhash(w,M) :
    """Base-37 polynomial hash of w modulo M: each character at 0-based
    position i is weighted by 37**i before the terms are summed."""
    weighted = [ord(ch) * (37 ** pos) for pos, ch in enumerate(w)]
    return sum(weighted) % M
def cut_special_char(a) :
    """Split `a` into words made only of ASCII letters and digits.

    Every other character is a separator. Returns the words in order of
    appearance, including a final word that is not followed by a separator
    (for example the last word of a file without a trailing newline).
    """
    list_word = []
    current = ''
    for ch in a :
        if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9' :
            current += ch
        elif current :
            list_word.append(current)
            current = ''
    # Flush the word still being built when the text ends.
    if current :
        list_word.append(current)
    return list_word
def number_and_alphabet(a) :
    """Count the ASCII letters and digits in `a`."""
    return sum(
        1
        for ch in a
        if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'
    )

# Driver script. `c` stays '' when hashing is declined and becomes the
# integer M when it is accepted; the output section below tests c == ''.
c = ''
while True :
    if b == 'y' or b == 'Y' :
        c = int(input('M = '))
        break
    elif b == 'n' or b == 'N' :
        break
    else :
        print('Try again.')
        b = input('Use feature hashing ? (y,Y,n,N) ')

fn1 = open(file_name, 'r')
fn2 = open('stopwords.txt', 'r')
count_line = 0
char_count = 0
word_count = 0
num_and_alpha = 0
BoW_1 = []      # lower-cased words of the file that are not stop words
t = []          # [word, count] pairs for the plain bag of words
flash = []      # hash of every entry of BoW_1
show = []       # [hash, count] pairs for the hashed bag of words
stopwords_string = ''
file_name_string = ''
line_3 = ''
line_4 = ''

# Whole input file: replace everything except ASCII letters/digits with a
# space, split into words and lower-case them.
line_1 = fn1.readlines()
line_1 = ''.join(line_1)
for i in line_1 :
    if '0' <= i <= '9' or 'a' <= i <= 'z' or 'A' <= i <= 'Z' :
        line_3 += i
    else :
        line_3 += ' '

line_3 = line_3.split()
for i in range(len(line_3)) :
    file_name_string += line_3[i].lower() + ' '
list_of_file_name = file_name_string.split()



# The stop-word file goes through the same normalisation.
line_2 = fn2.readlines()
line_2 = ''.join(line_2)
for i in line_2 :
    if '0' <= i <= '9' or 'a' <= i <= 'z' or 'A' <= i <= 'Z' :
        line_4 += i
    else :
        line_4 += ' '

line_4 = line_4.split()
for i in range(len(line_4)) :
    stopwords_string += line_4[i].lower() + ' '
list_of_stopwords = stopwords_string.split()

for i in list_of_file_name :
    if i not in list_of_stopwords :
        BoW_1.append(i)


# One [word, count] pair per distinct word, in order of first appearance
# (the list is not sorted).
for i in BoW_1 :
    if [i,BoW_1.count(i)] not in t :
        t.append([i,BoW_1.count(i)])


# Second pass over the file, line by line, for the four counters.
fn3 = open(file_name, 'r')
for i in fn3 :
    char_count += len(i.strip())
    list_word = i.strip().split()  # not read again below
    count_line += 1
    word_count += len(cut_special_char(i))
    num_and_alpha += number_and_alphabet(i)

print('-------------------')
print('char count =',char_count)
print('alphanumeric count =',num_and_alpha)
print('line count =',count_line)
print('word count =',word_count)
if c == '' :
     print('BoW =',t)
else :
    for i in BoW_1 :
        flash.append(fhash(i,c))
    # Walk the hash values in ascending order so `show` comes out sorted.
    # NOTE(review): min()/max() raise ValueError when BoW_1 is empty.
    for i in range(min(flash),max(flash)+1)  :
        if flash.count(i) != 0:
            show.append([i,flash.count(i)])

    print('BoW =',show)


fn1.close()
fn2.close()
fn3.close()


# 6330268321 (19.25) 118 (2021-03-22 17:41)
def bow(s):
    """Return the bag of words of the text file `s`.

    A fixed set of punctuation characters is replaced by spaces, the text
    is lower-cased and split on whitespace, and every word listed in
    'stopwords.txt' is dropped. The result is a list of [word, count]
    pairs in order of first appearance.
    """
    separators = ["(", ")", "-", "_", "[", "]", '"', "'", ';', ':', '<', '>', '.', ',']
    # Both files are closed on exit; the input file used to stay open.
    with open(s) as text_file, open("stopwords.txt") as stop_file:
        stop_words = set(stop_file.read().split())
        text = text_file.read()

    cleaned = "".join(" " if ch in separators else ch for ch in text).lower()

    # dict preserves insertion order, so keys stay in first-seen order and
    # counting takes one pass instead of one scan per word.
    counts = {}
    for word in cleaned.split():
        if word not in stop_words:
            counts[word] = counts.get(word, 0) + 1
    return [[word, n] for word, n in counts.items()]
    
#-------------------------------
def fhash(w,M):
    """Base-37 polynomial hash of w modulo int(M).

    M may be an int or a string of digits; it is converted with int().
    """
    weight = 1      # 37**position, updated incrementally
    total = 0
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % int(M)



#--------------------------------
# Driver script: ask for the file and the hashing choice, then print the
# statistics and the bag of words (plain or hashed).
s=input("File name = " )
file_name=open(s)
a=["y","Y","n","N"]
usef=input("Use feature hashing ? (y,Y,n,N) ")
while usef not in a:
    print("Try again.")
    usef=input("Use feature hashing ? (y,Y,n,N) ") 

if  usef =="N" or usef=="n":
    u=""
    x=["(", ")", "-", "_", "[", "]", '"', "'", ';', ':', '<', '>', '.', ',']
    linecount=0
    alphacount=0
    char=""
    for line in file_name:
        linecount+=1 # count the lines
        for e in line:
            if e in x:
                u+=" " # punctuation becomes a separator for the word count
            else:
                u+=e



        line=line.lower()
        for e in line:

            if e!="\n":   # collect every character except newlines
                char+=e

            if "a"<=e<="z" or "0"<=e<="9": # count only English letters and digits
                 alphacount+=1     

        line=line.split()  # result is not used


    u=u.split()
    print("-------------------")
    print("char count =",str(len(char))) # number of characters
    print("alphanumeric count =", str(alphacount)) # number of letters and digits
    print("line count =",str(linecount))# number of lines
    print("word count =",str(len(u)))# number of words
    print("BoW =",bow(s))
#-----------------------------------------------------------------------------
elif usef =="Y" or usef=="y":
    M=input("M = ")  # kept as a string; fhash() converts it with int()
    u1=""
    u2=""
    l=[]
    x=["(", ")", "-", "_", "[", "]", '"', "'", ';', ':', '<', '>', '.', ',']
    linecount=0
    alphacount=0
    char=""
    a=[]
    stopwords=open("stopwords.txt")
    for line in stopwords:
        line=line.split()
        for e in line:
            if e!="\n":
                a.append(e)
    for line in file_name:
        linecount+=1 # count the lines
        for e in line:
            if e in x:
                u1+=" " # punctuation becomes a separator for the word count
            else:
                u1+=e



        line=line.lower()
        for e in line:

            if e!="\n":   # collect every character except newlines
                char+=e

            if "a"<=e<="z" or "0"<=e<="9": # count only English letters and digits
                 alphacount+=1     




        # Same punctuation replacement on the lower-cased line, for the BoW.
        for e in line:
            if e in x:
                u2+=" "
            else:
                u2+=e
    u2=u2.lower()
    u2=u2.split()
    for e in u2:
        if e not in a:
            l.append(e) # list of lower-case words with the stop words removed

    n1=[]       # hash of every kept word
    countfh=0
    n2=[]       # n2[i] = number of words whose hash is i
    n3=[]       # [hash, count] pairs with a non-zero count, ascending
    for e in l:
        n1.append(fhash(e,M))
    # NOTE(review): max(n1) raises ValueError when no word is left.
    for i in range(max(n1)+1):
        for j in range(len(n1)):
            if i==n1[j]:
                countfh+=1
        n2.append(countfh)
        countfh=0
        if n2[i]!=0:
            n3.append([i,n2[i]])


    u1=u1.split()
    print("-------------------")
    print("char count =",str(len(char))) # number of characters
    print("alphanumeric count =", str(alphacount)) # number of letters and digits
    print("line count =",str(linecount))# number of lines
    print("word count =",str(len(u1)))# number of words
    print("BoW =",n3)
    stopwords.close()
file_name.close()

# 6330269021 (22.54) 119 (2021-03-21 21:55)

def fhash(w, m):
    """Base-37 polynomial hash of w modulo m, i.e.
    sum(ord(w[i]) * 37**i) % m, evaluated with Horner's scheme."""
    acc = 0
    for ch in reversed(w):
        acc = acc * 37 + ord(ch)
    return acc % m


# Driver script: read the options, load the stop words, scan the input
# file character by character, then print the counts and the bag of words.
fileName = input("File name = ").strip()
while True:
    fhashMode = input("Use feature hashing ? (y,Y,n,N) ").strip().lower()
    if fhashMode == 'y' or fhashMode == 'n':
        break
    else:
        print("Try again.")
if fhashMode == 'y':
    m = int(input("M = "))
print("-------------------")


# Stop words: every whitespace-separated token of stopwords.txt.
stopWordsFile = open("stopwords.txt")
stopWords = []
for line in stopWordsFile:
    if line != "":
        for e in line.split():
            stopWords.append(e)
stopWordsFile.close()


inputFile = open(fileName)
chCount = 0
alnumCount = 0
lineCount = 0
wordCount = 0
BoW = []
wordTemp = ""   # word currently being assembled
words = []
for line in inputFile:
    lineCount += 1
    for ch in line:
        chCount += 1  # every character is counted, the newline included
        if ch.isalnum():
            alnumCount += 1
            wordTemp += ch
        elif wordTemp != "":
            words.append(wordTemp)
            wordTemp = ""
    # NOTE(review): wordTemp is appended at the end of every line even when
    # it is "", and wordCount below is len(words), so these empty entries
    # are counted as words - confirm whether that is intended.
    words.append(wordTemp)
inputFile.close()
wordCount = len(words)
print("char count =", chCount)
print("alphanumeric count =", alnumCount)
print("line count =", lineCount)
print("word count =", wordCount)


# Lower-cased words that are neither stop words nor empty strings.
wordsLower = [e.lower() for e in words if e.lower() not in stopWords and e != ""]
wordsLowerNoDuplicate = []
for e in wordsLower:
    if e not in wordsLowerNoDuplicate:
        wordsLowerNoDuplicate.append(e)
# One entry per distinct word, in order of first appearance.
# NOTE(review): in hashing mode two different words with the same hash
# yield two separate entries with the same key.
for e in wordsLowerNoDuplicate:
    if fhashMode == 'y':
        BoW.append([fhash(e, m), wordsLower.count(e)])
    else:
        BoW.append([e, wordsLower.count(e)])
print("BoW =", BoW)

# 6330270521 (23.00) 120 (2021-03-22 18:49)

def fhash(w,M) :
    """Base-37 polynomial hash of w modulo int(M).

    M may be an int or a string of digits; it is converted with int().
    """
    total = 0
    for position, ch in enumerate(w) :
        total += ord(ch) * (37 ** position)
    return total % int(M)
def BoW(file) :
    """Return the bag of words of the text file `file`.

    Every character that is not an ASCII letter or digit is treated as a
    separator, the text is lower-cased, and words listed in
    'stopwords.txt' are dropped. The result is a list of [word, count]
    pairs, one per distinct word, in order of first appearance.
    """
    alpnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    # Both files are closed on exit; the input file used to stay open.
    with open(file) as text_file, open('stopwords.txt') as stop_file :
        stop = set(stop_file.read().split())
        text = text_file.read()

    sentence = ''.join(ch if ch in alpnum else ' ' for ch in text).lower()

    # Count by exact word equality. The earlier `i in word[j]` was a
    # substring test, so 'a' was also counted inside 'cat', and every
    # occurrence of a word produced its own (duplicate) entry.
    counts = {}
    for token in sentence.split() :
        if token not in stop :
            counts[token] = counts.get(token, 0) + 1
    return [[token, n] for token, n in counts.items()]

# Driver script: ask for the file and the hashing choice, then print the
# statistics followed by the hashed or the plain bag of words.
file = input('File name = ')
file_name = open(file)
usefh = input('Use featur hashing ? (y,Y,n,N) ')
# NOTE(review): `not in 'yYnN'` is a substring test, so an empty answer or
# e.g. 'yY' is accepted and then falls into the else (plain) branch below.
while usefh not in 'yYnN' :
    print('Try again')
    usefh = input('Use featur hashing ? (y,Y,n,N) ')
if usefh == 'Y' or usefh == 'y' :
    M = input('M = ')  # kept as a string; fhash() converts it with int()
    alpnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    stopwords = open('stopwords.txt')
    stopword = []
    linecnt = 0
    for i in stopwords :
        i = i.split()
        for j in i :
            stopword.append(j)
    sentence = ''
    alpcnt = 0
    charcnt = ''  # all characters except newlines; only its length is printed
    for i in file_name :
        linecnt += 1
        for j in i :
            if j != '\n' :
                charcnt += j
            if j in alpnum :
                sentence += j
                alpcnt += 1
            else :
                sentence += ' '
    sentence = sentence.lower()
    sentence = sentence.split()
    l = []  # words that are not stop words
    for i in sentence :
        if i not in stopword :
            l.append(i)
    fh = []     # hash of every kept word
    cnt = 0
    fhcnt = []  # fhcnt[i] = number of words whose hash is i
    bow = []    # [hash, count] pairs with a non-zero count, ascending
    for i in l :
        fh.append(fhash(i,M))
    # NOTE(review): max(fh) raises ValueError when no word is left.
    for i in range(max(fh)+1) :
        for j in range(len(fh)) :
            if i == fh[j] :
                cnt += 1
        fhcnt.append(cnt)
        cnt = 0
        if fhcnt[i] != 0 :
            bow.append([i,fhcnt[i]])
    print("-------------------")
    print("char count =",str(len(charcnt))) 
    print("alphanumeric count =", str(alpcnt))
    print("line count =",str(linecnt))
    print("word count =",str(len(sentence)))
    print("BoW =",bow)
    stopwords.close()
else :
    alpnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    linecnt = 0
    alpcnt = 0
    charcnt = ''
    sentence = ''
    for i in file_name :
        linecnt += 1
        for j in i :
            if j != '\n' :
                charcnt += j
            if j in alpnum :
                sentence += j
                alpcnt += 1
            else :
                sentence += ' '
    print("-------------------")
    print("char count =",str(len(charcnt))) 
    print("alphanumeric count =", str(alpcnt))
    print("line count =",str(linecnt))
    print("word count =",str(len(sentence.split())))
    # BoW() opens and reads the file again by name.
    print("BoW =",BoW(file))
file_name.close()

# 6330271121 (30.00) 121 (2021-03-21 21:27)
# Driver script, part 1: read the options and the stop words, then handle
# the plain (non-hashing) bag of words.
file_name = input('File name = ')
ufh = input('Use feature hashing ? (y,Y,n,N) ').lower()

while ufh not in ['y','n']:
    print('Try again.')
    ufh = input('Use feature hashing ? (y,Y,n,N) ').lower()

# Stop words: lower-cased whitespace-separated tokens of stopwords.txt.
stopword = open('stopwords.txt')
stw = []
for line in stopword:
    sword = line.strip().lower().split()
    stw += sword
stopword.close()

if ufh == 'n':
    fin = open(file_name)
    charc = 0
    alphc = 0
    linec = 0
    wordc = 0
    all_words = []  # every word of the file, stop words included
    cutwords = []   # distinct words that are not stop words
    for line in fin:
        linec += 1
        line = line.lower()
        # Drop only the trailing newline; other whitespace is counted.
        if line[-1] == '\n':
            line = line[0:-1:1]
        charc += len(line) 
        newlin = '' 
        # Keep lower-case letters and digits, turn the rest into spaces.
        for e in line:
            if e in 'abcdefghijklmnopqrstuvwxyz' or e in '1234567890':
                newlin += e
            else:
                newlin += ' '
        words = newlin.split()
        all_words +=  words
        for a in words:
            if a not in cutwords and a not in stw:
                cutwords.append(a)
        # The words contain only letters and digits, so their joined
        # length is this line's alphanumeric count.
        al = ''.join(words)
        alphc += len(al)
        wordc += len(words)
    cutwords.sort()
    c = [0]*len(cutwords)  # c[j] = occurrences of cutwords[j]
    for i in all_words:
        if i in cutwords:
            j = cutwords.index(i)
            c[j] += 1
    bow = []
    for i in range(len(c)):
        bow.append([cutwords[i],c[i]])        
    fin.close()
    print('-------------------')
    print('char count =',charc)
    print('alphanumeric count =',alphc)
    print('line count =',linec)
    print('word count =',wordc)
    print('BoW =',bow)
def fhash(w,M):
    """Base-37 polynomial hash of w modulo M: the character at 0-based
    position i contributes ord(w[i]) * 37**i."""
    total = 0
    position = 0
    while position < len(w):
        total += ord(w[position]) * (37 ** position)
        position += 1
    return total % M
    
# Driver script, part 2: the hashed bag of words.
if ufh == 'y':
    M = int(input('M = '))
    fin = open(file_name)
    charc = 0
    alphc = 0
    linec = 0
    wordc = 0
    cutwords = []   # every non-stop word, repetitions kept
    for line in fin:
        linec += 1
        line = line.lower()
        # Drop only the trailing newline; other whitespace is counted.
        if line[-1] == '\n':
            line = line[0:-1:1]
        charc += len(line) 
        newlin = ''
        # Keep lower-case letters and digits, turn the rest into spaces.
        for e in line:
            if e in 'abcdefghijklmnopqrstuvwxyz' or e in '1234567890':
                newlin += e
            else:
                newlin += ' '
        words = newlin.split()
        for a in words:
            if a not in stw:
                cutwords.append(a)
        al = ''.join(words)
        alphc += len(al)
        wordc += len(words)
    fh = []  # distinct hash values, sorted below
    for o in cutwords:
        if fhash(o,M) not in fh:
            fh.append(fhash(o,M))
    fh.sort()
    c = [0]*len(fh)  # c[j] = number of words hashing to fh[j]
    for u in cutwords:
        j = fh.index(fhash(u,M))
        c[j] += 1
    bow = []
    for i in range(len(c)):
        bow.append([fh[i],c[i]])        
    fin.close()
    print('-------------------')
    print('char count =',charc)
    print('alphanumeric count =',alphc)
    print('line count =',linec)
    print('word count =',wordc)
    print('BoW =',bow)
# 6330272821 (10.10) 122 (2021-03-22 00:08)

def fhash(w, m):
    """Base-37 polynomial hash of w modulo m: the character at 0-based
    position i contributes ord(w[i]) * 37**i."""
    weight = 1      # 37**position, updated incrementally
    total = 0
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % m
def f(file_name):
    """Scan the text file `file_name` and return its statistics.

    Returns (chr_cnt, alnum_cnt, line_cnt, word_cnt, words) where
    chr_cnt counts every character read (newlines included), alnum_cnt
    counts characters for which str.isalnum() is true, and words is the
    list of maximal alphanumeric runs in original case.
    NOTE(review): other programs in this file exclude newlines from the
    char count - confirm which definition is required.
    """
    chr_cnt, alnum_cnt, line_cnt = 0, 0, 0
    words = []

    with open(file_name) as file:
        for line in file:
            line_cnt += 1
            buffer = ""
            for ch in line:
                chr_cnt += 1
                if ch.isalnum():
                    alnum_cnt += 1
                    buffer += ch
                elif buffer != "":
                    words.append(buffer)
                    buffer = ""
            # Flush a word that reaches the end of the line, but never
            # append an empty string: that inflated the word count by one
            # for every line.
            if buffer != "":
                words.append(buffer)

    word_cnt = len(words)

    return chr_cnt,alnum_cnt, line_cnt, word_cnt, words
def BoW_process(words, stopwords, is_hashing, M):
    """Build the bag of words from `words`.

    Empty strings and words whose lower-cased form is in `stopwords` are
    skipped. The key of an entry is the lower-cased word, or
    fhash(word, M) when `is_hashing` is true. Returns a list of
    [key, count] pairs in order of first appearance of each key.
    """
    # Counting by key (not by word) merges different words that share a
    # hash into a single entry instead of emitting the same key twice.
    counts = {}
    for raw in words:
        token = raw.lower()
        if raw == "" or token in stopwords:
            continue
        key = fhash(token, M) if is_hashing else token
        counts[key] = counts.get(key, 0) + 1

    return [[key, n] for key, n in counts.items()]


if __name__ == '__main__':
    # Entry point: read the options, load the stop words, scan the file and
    # print the statistics followed by the bag of words.
    file_name  = input("File name = ").strip()

    while True:
        user_input = input("Use feature hashing ? (y,Y,n,N) ").strip().lower()
        if user_input in ("y", "n"):
            break
        print("Try again.")

    is_hashing = user_input == "y"
    # M only matters when hashing, but it is bound in both cases so that
    # the BoW_process call below cannot raise NameError for the answer 'n'.
    M = int(input("M = ")) if is_hashing else None

    print("-------------------")

    # Stop words: every whitespace-separated token of stopwords.txt.
    stopwords = []
    with open("stopwords.txt") as stopword_file:
        for line in stopword_file:
            stopwords.extend(line.split())

    # Statistics and raw word list of the input file.
    chr_cnt,alnum_cnt, line_cnt, word_cnt, words = f(file_name)

    BoW = BoW_process(words, stopwords, is_hashing, M)

    print("char count =", chr_cnt)
    print("alphanumeric count =", alnum_cnt)
    print("line count =", line_cnt)
    print("word count =", word_cnt)
    print("BoW =", BoW)

# 6330273421 (30.00) 123 (2021-03-18 22:00)
def pure_apb(line): # takes a str, returns a str
    """Return `line` lower-cased, with every character that is not an ASCII
    letter or digit replaced by a space."""
    pieces = []
    for ch in line:
        keep = 'a'<=ch<='z' or 'A'<=ch<='Z' or '0'<=ch<='9'
        pieces.append(ch if keep else ' ')
    return ''.join(pieces).lower()
def use_fh(c): # returns one of y, Y, n, N
    """Return `c` once it is one of 'y', 'Y', 'n', 'N'.

    While it is not, print 'Try again.' and read a new answer from input.
    """
    accepted = ['y','Y','n','N']
    while True:
        if c in accepted:
            return c
        print('Try again.')
        c=input('Use feature hashing ? (y,Y,n,N) ')
def bow_w(apb): # takes the cleaned text as a str
    """Return the sorted [word, count] pairs of the whitespace-separated
    words in `apb`."""
    tokens = sorted(apb.strip().split())
    # Sentinel so the final run of equal words is flushed by the loop.
    tokens.append('[]')
    pairs = []
    run = 1
    for current, following in zip(tokens, tokens[1:]):
        if current != following:
            pairs.append([current, run])
            run = 1
        else:
            run += 1
    pairs.sort()
    return pairs
def fhash(w,M): # takes a word (str) and an int
    """Base-37 polynomial hash of w modulo M, i.e.
    sum(ord(w[i]) * 37**i) % M, evaluated with Horner's scheme."""
    acc = 0
    for ch in reversed(w):
        acc = acc * 37 + ord(ch)
    return acc % M
def bow_n(apb):
    """Return the hashed bag of words of `apb`.

    `apb` is split on whitespace and every word is mapped to
    fhash(word, M), with M read from module scope. Returns [hash, count]
    pairs sorted by hash.
    """
    hashes = sorted(fhash(token, M) for token in apb.strip().split())
    # Run-length encode the sorted hashes. The output is built in
    # ascending order, so it needs no sort (the old loop re-sorted it on
    # every iteration), and extending the last pair needs no sentinel.
    pairs = []
    for value in hashes:
        if pairs and pairs[-1][0] == value:
            pairs[-1][1] += 1
        else:
            pairs.append([value, 1])
    return pairs
# Driver script: read the options, gather the counts, filter the stop
# words and print the statistics with the bag of words.
filename=input('File name = ')
c=use_fh(input('Use feature hashing ? (y,Y,n,N) '))


file = open(filename,'r')
char=0      # characters of every line after strip()
apb=0       # ASCII letters and digits
ms=''       # cleaned text of the whole file, lines separated by a space
rms=''      # cleaned text without the stop words
line_c=0
for line in file:
    line_c+=1
    for e in line.strip():
           char+=1
    ms+=pure_apb(line)+' '
for e in ms:
    if 'a'<=e<='z' or 'A'<=e<='Z' or '0'<=e<='9':
        apb+=1
file.close()

stop = open('stopwords.txt','r')
stop_word=''
for line in stop:
    stop_word+= line+' '
stop.close()
# Split once after reading; splitting the growing text inside the loop
# appended the earlier words again on every line.
stopword=stop_word.split()
for e in ms.strip().split():
    # Test against the word list. Testing against the raw text was a
    # substring test that also removed any word contained in a stop word.
    if e not in stopword:
        rms+=e+' '
if c in ['y','Y']:
    M=int(input('M = '))
    BoW=bow_n(rms)
else:
    BoW=bow_w(rms)
word_c=len(ms.strip().split())

print('-------------------')
print('char count = '+str(char))
print('alphanumeric count = '+str(apb))
print('line count = '+str(line_c))
print('word count = '+str(word_c))
print('BoW =',BoW)
# 6330274021 (21.40) 124 (2021-03-22 02:02)
# Read the file name, then keep asking the hashing question until the
# answer is one of y, Y, n, N; M is read only for a yes.
file_name = input('File name = ')
fhash_use = ''
while True:
    fhash_use = input('Use feature hashing ? (y,Y,n,N) ')
    if fhash_use in ('y', 'Y'):
        m = int(input('M = '))
        break
    if fhash_use in ('n', 'N'):
        break
    print('Try again.')
#----------------------------------------------------
def fhash(w,m):
    """Base-37 polynomial hash of w modulo m: the character code at
    0-based position i is multiplied by 37**i before summing."""
    codes = map(ord, w)
    total = sum(code * 37 ** power for power, code in enumerate(codes))
    return total % m
#----------------------------------------------------
# Stop words: lower-cased whitespace-separated tokens of stopwords.txt.
stopword = open('stopwords.txt', 'r')
sw = []
for line in stopword:
    w = line.strip().lower().split()
    sw += w
stopword.close()
#====================================================
file = open(file_name, 'r')
cc = 0      # characters of every stripped line
ac = 0      # ASCII letters and digits
lc = 0      # lines
wc = 0      # words
linef = ''  # cleaned text: letters/digits kept, the rest turned into spaces
word = []   # every word of the file
wordf = []  # words that are not stop words
for line in file:
    line = line.strip().lower()
    lc += 1
    for e in line:
        cc += 1
        if 'a'<=e<='z' or 'A'<=e<='Z' or '0'<=e<='9':
            ac += 1
            linef += e
        else:
            linef += ' '
    # strip() removed the newline, so add a separator here; without it the
    # last word of a line fused with the first word of the next line.
    linef += ' '
word += linef.split()
for f in word:
    if f not in sw:
        wordf.append(f)
wc = len(word)
file.close()

#====================================================
if fhash_use == 'y' or fhash_use == 'Y':
    hash_data = []  # hash of every kept word
    bow = []
    w = []          # distinct hash values in order of first appearance
    for i in range(len(wordf)):
        hash_data.append(fhash(wordf[i],m))
        if hash_data[i] not in w:
            w.append(hash_data[i])
    f = [0]*len(w)  # f[i] = number of words hashing to w[i]
    for i in range(len(w)):
        for j in range(len(hash_data)):
            if w[i] == hash_data[j]:
                f[i] += 1
        ele = [w[i],f[i]]
        bow.append(ele)
    bow.sort()

elif fhash_use == 'n' or fhash_use == 'N':
    bow = []
    w = []          # distinct words in order of first appearance
    for i in range(len(wordf)):
        if wordf[i] not in w:
            w.append(wordf[i])
    f = [0]*len(w)  # f[i] = occurrences of w[i]
    for i in range(len(w)):
        for j in range(len(wordf)):
            if w[i] == wordf[j]:
                f[i] += 1
        ele = [w[i],f[i]]
        bow.append(ele)
    bow.sort()
#----------------------------------------------------
print('-------------------')
print('char count = '+str(cc))
print('alphanumeric count = '+str(ac))
print('line count = '+str(lc))
print('word count = '+str(wc))
print('BoW = '+str(bow))
# 6330275721 (30.00) 125 (2021-03-22 21:44)
#--------------------------------------------------------
# Module-level constants shared by the helper functions below.
alphabet = 'abcdefghijklmnopqrstuvwxyz'  # lower-case ASCII letters
number = '0123456789'  # ASCII digits
# Punctuation characters that the helpers replace with a space.
special_char = '!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
stopword_file = 'stopwords.txt'  # path of the stop-word list
#--------------------------------------------------------
def char_count(x):
    """Return the total length of the lines of file `x`, each line being
    stripped of leading and trailing whitespace first."""
    with open(x, 'r') as file:
        return sum(len(line.strip()) for line in file)
def alphanumeric_count(x):
    """Return how many characters of file `x` are letters (compared
    lower-cased against `alphabet`) or digits from `number`."""
    total = 0
    with open(x, 'r') as file:
        for line in file:
            for char in line.strip():
                if char.lower() in alphabet or char in number:
                    total += 1
    return total
def line_count(x):
    """Return the number of lines in file `x`."""
    with open(x, 'r') as file:
        return sum(1 for _ in file)
def word_count(x):
    """Return the number of words in file `x`; characters listed in
    `special_char` are treated as separators, like whitespace."""
    total = 0
    with open(x, 'r') as file:
        for line in file:
            spaced = ''.join(
                ' ' if char in special_char else char for char in line.strip()
            )
            total += len(spaced.split())
    return total
def fhash(w, M):
    """Base-37 polynomial hash of w modulo M: the character at 0-based
    position i contributes ord(w[i]) * 37**i."""
    weight = 1      # 37**position, updated incrementally
    total = 0
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % M
def list_stopwords(x):
    """Return the words of file `x` in order; characters from
    `special_char` count as separators, like whitespace."""
    collected = []
    with open(x, 'r') as file:
        for line in file:
            spaced = ''.join(
                ' ' if char in special_char else char for char in line.strip()
            )
            collected.extend(spaced.strip().split())
    return collected
def BoW(x, ufh, M):
    """Return the bag of words of file `x` as sorted [key, count] pairs.

    Characters from `special_char` are treated as separators, words are
    lower-cased and those found in the stop-word file are dropped. For
    ufh 'n'/'N' the key is the word; for 'y'/'Y' it is fhash(word, M).
    Any other ufh value returns None.
    """
    tokens = []
    with open(x, 'r') as file:
        for line in file:
            spaced = ''.join(
                ' ' if char in special_char else char for char in line.strip()
            )
            tokens.extend(piece.lower() for piece in spaced.split())

    # Load the stop words once. They used to be re-read from disk for
    # every word. Skipped when there are no words, as before.
    stop_words = set(list_stopwords(stopword_file)) if tokens else set()
    words = sorted(token for token in tokens if token not in stop_words)

    if ufh in ['n', 'N']:
        keys = words
    elif ufh in ['y', 'Y']:
        keys = [fhash(word, M) for word in words]
    else:
        return None

    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())

#--------------------------------------------------------
# Driver script: read the options, then print the four counts and the bag
# of words; each helper opens the file by name on its own.
file_name = input('File name = ')
ufh = input('Use feature hashing ? (y,Y,n,N) ')
while ufh not in ['y','Y','n','N']:
    print('Try again.')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
if ufh in ['y', 'Y']:
    M = int(input('M = '))
elif ufh in ['n', 'N']:
    M = 0  # placeholder; BoW() does not hash in this mode
else:
    # Not reachable: the loop above only exits on y, Y, n or N.
    M = 0
print('-'*19)
print('char count =', char_count(file_name))
print('alphanumeric count =', alphanumeric_count(file_name))
print('line count =', line_count(file_name))
print('word count =', word_count(file_name))
print('BoW =', BoW(file_name, ufh, M))
# 6330276321 (30.00) 126 (2021-03-21 02:17)

#-------------------------------------------------------
def fhash(w, M) :  # w is the word as a string and M is an int
    """Return the polynomial hash of *w* (base 37, first character has weight 37**0) modulo *M*."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M

#-------------------------------------------------------
def import_words_from(filename) :
    """Return every space-separated piece of *filename*, stripped of surrounding whitespace.

    Lines are split on single spaces, so consecutive spaces and blank lines
    contribute empty strings to the result.
    """
    handle = open(filename, "r")
    pieces = [chunk.strip() for row in handle for chunk in row.split(" ")]
    handle.close()
    return pieces

#-------------------------------------------------------
def count_all_char(file_name) :
    """Print the number of characters in *file_name*, with each line stripped of surrounding whitespace."""
    handle = open(file_name, "r")
    total = sum(len(row.strip()) for row in handle)
    handle.close()
    print("char count =", total)

#-------------------------------------------------------
def count_alphanumeric(file_name) :
    """Print how many characters of *file_name* are English letters (either case) or digits."""
    digits = "0123456789"
    letters = "abcdefghijklmnopqrstuvwxyz"
    total = 0
    handle = open(file_name, "r")
    for row in handle :
        for ch in row.strip() :
            if ch == " " :
                continue
            if ch.lower() in letters or ch in digits :
                total += 1
    handle.close()
    print("alphanumeric count =", total)

#-------------------------------------------------------
def count_line(file_name) :
    """Print the number of lines in *file_name*."""
    handle = open(file_name, "r")
    total = sum(1 for _ in handle)
    handle.close()
    print("line count =", total)

#-------------------------------------------------------
def count_words(file_name) :
    """Print the number of words in *file_name*.

    A word is a maximal run of English letters (either case) and digits;
    every other character, and every line break, separates words.
    """
    num = "0123456789"
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    pieces = []
    # ``with`` guarantees the file is closed; it used to be left open.
    with open(file_name, "r") as x :
        for line in x :
            for e in line.strip() :
                pieces.append(e if (e.lower() in alphabet or e in num) else " ")
            pieces.append(" ")  # keep words on adjacent lines apart
    count = len("".join(pieces).split())
    print("word count =", count)

#-------------------------------------------------------
            
def get_alphanumeric(file_name) :
    """Return the words of *file_name* in file order, original case preserved.

    A word is a maximal run of English letters and digits; any other
    character, and every line break, acts as a separator.
    """
    digits = "0123456789"
    letters = "abcdefghijklmnopqrstuvwxyz"
    found = []
    handle = open(file_name, "r")
    for row in handle :
        spaced = "".join(
            ch if (ch.lower() in letters or ch in digits) else " "
            for ch in row.strip()
        )
        found.extend(spaced.split())
    handle.close()
    return found

#-------------------------------------------------------
def BoW(file_name) :
    """Print the bag of words of *file_name* as ``BoW = [[word, count], ...]``.

    Words come from ``get_alphanumeric``, are lower-cased, and are skipped
    when they appear in ``stopwords.txt`` (tokenised the same way).  The
    pairs are printed sorted by word.
    """
    texts = get_alphanumeric(file_name)
    stoptexts = set(get_alphanumeric("stopwords.txt"))
    # One pass with a dict replaces the list-membership dedupe and the
    # per-word ``list.count`` scan, which together were quadratic.
    counts = {}
    for e in texts :
        word = e.lower()
        if word not in stoptexts :
            counts[word] = counts.get(word, 0) + 1
    result = sorted([word, n] for word, n in counts.items())
    print("BoW =", result)

#-------------------------------------------------------
def BoW_fhash(file_name, M) :
    """Print the feature-hashed bag of words of *file_name*.

    Words come from ``get_alphanumeric``, are lower-cased, filtered against
    ``stopwords.txt`` and mapped to ``fhash(word, M)``.  The output is
    ``BoW = [[hash, count], ...]`` sorted by hash value.
    """
    texts = get_alphanumeric(file_name)
    stoptexts = set(get_alphanumeric("stopwords.txt"))
    # Count each hash bucket in a single pass instead of building an
    # intermediate list and calling ``list.count`` for every distinct value.
    counts = {}
    for e in texts :
        word = e.lower()
        if word not in stoptexts :
            bucket = fhash(word, M)
            counts[bucket] = counts.get(bucket, 0) + 1
    result = sorted([bucket, n] for bucket, n in counts.items())
    print("BoW =", result)



#-------------------------------------------------------------------



# Interactive driver: read the file name and hashing choice, then print the report.
file_name = input("File name = ")
using_fhash = None
while using_fhash is None :
    condition = input("Use feature hashing ? (y,Y,n,N) ")
    if condition in ("y", "Y") :
        using_fhash = True
    elif condition in ("n", "N") :
        using_fhash = False
    else :
        print("Try again.")

# The modulus is requested before the separator line, and only when hashing.
if using_fhash :
    M = int(input("M = "))

print("-------------------")
count_all_char(file_name)
count_alphanumeric(file_name)
count_line(file_name)
count_words(file_name)
if using_fhash :
    BoW_fhash(file_name, M)
else :
    BoW(file_name)
# 6330277021 (23.55) 127 (2021-03-22 23:54)

# Text-statistics script: prompts for a file and a hashing choice, then prints
# the character / alphanumeric / line / word counts and the bag of words (BoW).
file_name = input("File name = ")
fh = input("Use feature hashing ? (y,Y,n,N) ")
while fh not in ['Y', 'y', 'N', 'n']:
    print("Try again.")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
use_hashing = fh in ('Y', 'y')
if use_hashing:
    # Converted to int because M is the hash modulus below (it used to stay a
    # string and was never used).
    M = int(input("M = "))

# Stop words are compared in lower case; a set gives constant-time lookups.
with open("stopwords.txt", "r") as f1:
    stop_words = {e.lower() for line in f1 for e in line.split()}

print("-------------------")

# Read the input once instead of reopening it for every statistic.
with open(file_name, "r") as f2:
    lines = f2.readlines()

char_count = 0
an_count = 0
line_count = len(lines)
word_count = []  # every maximal run of ASCII letters/digits, in file order
for line in lines:
    # Only the line terminator is excluded from the character count.
    char_count += len(line[:-1] if line.endswith("\n") else line)
    x = ''
    for ch in line:
        if 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9':
            an_count += 1
            x += ch
        elif x != '':
            word_count.append(x)
            x = ''
    if x != '':
        word_count.append(x)

print("char count =", char_count)
print("alphanumeric count =", an_count)
print("line count =", line_count)
print("word count =", len(word_count))

# The BoW reuses the word list above, so punctuation splits words here exactly
# as it does for the word count (it used to be deleted, turning "it's" into
# "its").  Keys are the lower-cased words, or their hash bucket when hashing.
counts = {}
for word in word_count:
    word = word.lower()
    if word in stop_words:
        continue
    if use_hashing:
        key = sum(ord(c) * 37 ** i for i, c in enumerate(word)) % M
    else:
        key = word
    counts[key] = counts.get(key, 0) + 1
BoW = sorted([key, n] for key, n in counts.items())

# Printed in both modes; the hashing mode used to print no BoW at all.
print("BoW =", BoW)
# 6330278621 (24.40) 128 (2021-03-21 02:45)
# Prompt for the input file and the hashing choice; M is only read for 'y'/'Y'.
file_name = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ')
while x not in ['y','Y','n','N']:
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
if x in ['y','Y']:
    M = int(input('M = '))

#------------------------------------------------------
engalpha = 'abcdefghijklmnopqrstuvwxyz'
num = '0123456789'
#------------------------------------------------------
# Single pass over the file: counts plus the lower-cased word list for the BoW.
char_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
all_words = []
with open(file_name,'r') as File_name:
    for line in File_name:
        line = line.strip()
        char_count += len(line)
        line_count += 1
        w = ''
        for e in line:
            if e in engalpha or e in engalpha.upper() or e in num:
                alphanumeric_count += 1
                w += e
            else:
                w += ' '
        w = w.split()
        word_count += len(w)
        # Collected per line, so the last word of one line can no longer be
        # glued to the first word of the next.
        all_words.extend(word.lower() for word in w)
# Labels end without a trailing space: print() already inserts one separator.
print('-'*19)
print('char count =',char_count)
print('alphanumeric count =',alphanumeric_count)
print('line count =',line_count)
print('word count =',word_count)

#------------------------------------------------------

# Stop words, one whitespace-separated token at a time; the handle is closed
# here instead of being dropped when the name is rebound further down.
list_of_stpw = []
with open('stopwords.txt','r') as stop_words:
    for line in stop_words:
        list_of_stpw.extend(line.strip().split())

if x in ['n','N']:
    # Sort once, then merge equal neighbours; also copes with an empty list.
    new_line = sorted(word for word in all_words if word not in list_of_stpw)
    w_n = []
    for word in new_line:
        if w_n and w_n[-1][0] == word:
            w_n[-1][1] += 1
        else:
            w_n.append([word,1])
    print('BoW =',w_n)

#------------------------------------------------------
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo M, evaluated with Horner's rule."""
    total = 0
    for ch in reversed(w):
        total = total * 37 + ord(ch)
    return total % M

# Feature-hashed bag of words, only produced when the user answered 'y'/'Y'.
if x in ['y','Y']:
    Line = []
    with open(file_name,'r') as File_name:
        for line in File_name:
            cleaned = ''.join(e if (e in engalpha or e in num) else ' '
                              for e in line.strip().lower())
            # Split per line so words on adjacent lines are never joined.
            Line.extend(cleaned.split())
    new_line = [word for word in Line if word not in list_of_stpw]
    # Sort the hash values, then merge equal neighbours into [hash, count];
    # an empty word list simply yields an empty BoW instead of an IndexError.
    val = sorted(fhash(word,M) for word in new_line)
    BoW = []
    for h in val:
        if BoW and BoW[-1][0] == h:
            BoW[-1][1] += 1
        else:
            BoW.append([h,1])
    # No trailing space in the label: print() already inserts one separator.
    print('BoW =',BoW)
# 6330279221 (30.00) 129 (2021-03-22 22:27)

# Ask for the input file and whether words should be hashed into buckets.
file_name = input('File name = ')
u = input('Use feature hashing ? (y,Y,n,N) ')
#--------------------------------------------------------
def fhash(w, M):
    """Hash the word *w* into the range of ``% M`` using base-37 positional weights."""
    G = 37
    weight = 1  # G ** position, updated incrementally
    total = 0
    for ch in w:
        total += ord(ch) * weight
        weight *= G
    return total % M
#---------------------------------------------------------
# Re-ask until the answer is valid, using the same prompt and the same
# 'Try again.' wording as the first question.
while u not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    u = input('Use feature hashing ? (y,Y,n,N) ')

#-----------------------------------------------------------
# Stop words, lower-cased, one whitespace-separated token at a time.
with open('stopwords.txt', 'r') as stop:
    sw = []
    for i in stop:
        sw += i.lower().split()
#-----------------------------------------------------------
with open(file_name, 'r') as fn:
    f = fn.readlines()
linecount = len(f)
chacount = 0
wordcount = 0
alnumcount = 0
wds = []
for raw in f:
    text = raw.strip('\n').lower()
    chacount += len(text)
    # Anything that is not alphanumeric separates words.
    tokens = ''.join(e if e.isalnum() else ' ' for e in text).split()
    wordcount += len(tokens)
    alnumcount += sum(len(t) for t in tokens)
    wds += tokens

stop_lookup = set(sw)
words = sorted(j for j in wds if j not in stop_lookup)

# Keys are hash buckets when hashing, otherwise the words themselves.
if u == 'y' or u == 'Y':
    M = int(input('M = '))
    keys = [fhash(i, M) for i in words]
else:
    keys = words
counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
BoW = sorted([key, n] for key, n in counts.items())

print('-------------------')
print('char count =', chacount)
print('alphanumeric count =', alnumcount)
print('line count =', linecount)
print('word count =', wordcount)
print('BoW =', BoW)
            
# 6330280821 (0.00) 130 (2021-03-22 22:05)

def char_count(fn):
    """Return the number of characters in the file *fn*, not counting newline characters."""
    # ``with`` closes the file; the previous ``file_name.close`` lacked the
    # call parentheses and therefore never closed it.
    with open(fn) as file_name:
        return sum(len(e) - e.count("\n") for e in file_name)
def count_line(fn):
    """Return the number of lines in the file *fn*."""
    file_name = open(fn)
    total = len(file_name.readlines())
    file_name.close()
    return total
def alphanumeric(fn):
    """Return how many characters of the file *fn* are English letters or digits."""
    allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    file_name = open(fn)
    total = sum(1 for row in file_name for ch in row if ch in allowed)
    file_name.close()
    return total
def word_count(fn):
    """Return the list of words in the file *fn* (callers take ``len`` for the count).

    A word is a maximal run of English letters and digits; every other
    character, including the newline, is a separator.  Case is preserved.
    """
    allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    # ``with`` closes the file; the previous ``file_name.close`` lacked the
    # call parentheses and therefore never closed it.
    with open(fn) as file_name:
        c = "".join(a if a in allowed else " " for e in file_name for a in e)
    return c.split()

# Load the stop words once at import time: the whole file, lower-cased and
# split on whitespace.
s = open("stopwords.txt")
stop = s.read()
stop2 = stop.lower().split()
s.close()
def bow_no_hashing(fn):
    """Return the bag of words of the file *fn* as ``[[word, count], ...]`` sorted by word.

    Words come from ``word_count(fn)``; stop words (``stop2``) are skipped.
    Words are lower-cased before counting, so "The" and "the" share one
    entry; previously only the stop-word test ignored case.
    """
    counts = {}
    for e in word_count(fn):
        word = e.lower()
        if word not in stop2:
            counts[word] = counts.get(word, 0) + 1
    # A dict replaces the "!!" sentinel that was appended to flush the last group.
    return sorted([word, n] for word, n in counts.items())
def fhash(w,M):
    """Return the base-37 polynomial hash of the word *w*, reduced modulo *M*."""
    G = 37
    return sum(ord(ch) * G ** pos for pos, ch in enumerate(w)) % M
def bow_hashing(fn,M):
    """Return the feature-hashed bag of words of the file *fn*.

    Parameters:
        fn: path of the text file.
        M:  hash modulus; an int or a numeric string (converted with ``int``).

    Returns:
        ``[[hash, count], ...]`` sorted by hash value.  Words come from
        ``word_count(fn)``, stop words (``stop2``) are skipped, and each word
        is lower-cased before hashing so that case does not change its bucket.
    """
    modulus = int(M)
    counts = {}
    for e in word_count(fn):
        word = e.lower()
        if word not in stop2:
            v = fhash(word, modulus)
            counts[v] = counts.get(v, 0) + 1
    # A dict replaces the "!!" string sentinels that used to be mixed into the
    # list of integer hashes.
    return sorted([v, n] for v, n in counts.items())
# Interactive driver.  The leftover debug call on a hard-coded "sample.txt"
# was removed: it printed an extra line and failed when that file was absent.
x = input("File name = ")
b = input("Use feature hashing ? (y,Y,n,N) ")
# Compare against the four exact answers; a substring test on "yYnN" also
# accepted "" and multi-character answers such as "yY".
while b not in ("y", "Y", "n", "N"):
    print("Try again.")
    b = input("Use feature hashing ? (y,Y,n,N) ")
use_hashing = b in ("y", "Y")
if use_hashing:
    M = int(input("M = "))
print("-------------------")
print("char count =",char_count(x))
print("alphanumeric count =",alphanumeric(x))
print("line count =",count_line(x))
print("word count =",len(word_count(x)))
# Label without a trailing space: print() already inserts one separator.
if use_hashing:
    print("BoW =",bow_hashing(x,M))
else:
    print("BoW =",bow_no_hashing(x))

# 6330281421 (27.80) 131 (2021-03-21 12:19)
#--------------------------------------
# Editable configuration
# Path of the stop-word file; BoW passes it to list_of_stopwords.
stopword_file='stopwords.txt'
# Characters replaced by a space when word_count, list_of_stopwords and BoW split text.
sp_char='!@#$%^&*()_+{}[]:\";\',./<>?\\=-`'
# Characters counted by alpha_count (input characters are lower-cased before the test).
al_and_nume='abcdefghijklmnopqrstuvwxyz0123456789'
#--------------------------------------
# Function definitions
def c_count(filename):
    """Return the total length of all lines in *filename* after stripping each line."""
    with open(filename,'r') as handle:
        return sum(len(row.strip()) for row in handle)
def alpha_count(filename):
    """Return how many characters of *filename* are in ``al_and_nume`` once lower-cased."""
    total=0
    with open(filename,'r') as handle:
        for row in handle:
            total+=sum(1 for ch in row.strip() if ch.lower() in al_and_nume)
    return total
def line_count(filename):
    """Return the number of lines in *filename*."""
    with open(filename,'r') as handle:
        return sum(1 for _ in handle)
def word_count(filename):
    """Return the number of words in *filename*; characters in ``sp_char`` and whitespace separate words."""
    total=0
    with open(filename,'r') as handle:
        for row in handle:
            spaced=''.join(' ' if ch in sp_char else ch for ch in row.strip())
            total+=len(spaced.split())
    return total
def list_of_stopwords(filename):
    """Return the lower-cased stop words of *filename*; characters in ``sp_char`` and whitespace separate words."""
    collected=[]
    with open(filename,'r') as handle:
        for row in handle:
            spaced=''.join(' ' if ch in sp_char else ch for ch in row.strip())
            collected.extend(token.lower() for token in spaced.split())
    return collected
def fhash(word,m):
    """Return sum(ord(word[i]) * 37**i) modulo *m*."""
    total=0
    for position,ch in enumerate(word):
        total+=ord(ch)*(37**position)
    return total%m
def BoW(filename,condition,m):
    """Return the bag of words of *filename* as a sorted list of ``[key, count]``.

    Characters in ``sp_char`` are treated as separators, words are
    lower-cased, and stop words from ``stopword_file`` are dropped.

    Parameters:
        filename:  path of the text file.
        condition: 'n'/'N' counts the words themselves; any other value
                   counts their hash ``fhash(word, m)``.
        m:         hash modulus, only used when hashing.
    """
    words=[]
    with open(filename,'r') as file:
        # Read the stop words once; they used to be re-read from disk for
        # every word of the input.
        stopwords=set(list_of_stopwords(stopword_file))
        for line in file:
            text=''.join(' ' if char in sp_char else char for char in line.strip())
            for i in text.split():
                lowered=i.lower()
                if lowered not in stopwords:
                    words.append(lowered)
    if condition.lower()=='n':
        keys=words
    else:
        keys=[fhash(i,m) for i in words]
    counts={}
    for key in keys:
        counts[key]=counts.get(key,0)+1
    return sorted([key,n] for key,n in counts.items())
#--------------------------------------
# Main program
file=input('File name = ')
feature=input('Use feature hashing ? (y,Y,n,N) ')
while feature not in ('Y','n','N','y'):
    print('Try again.')
    feature=input('Use feature hashing ? (y,Y,n,N) ')
# The modulus is only requested when hashing; 0 is passed along otherwise.
m=int(input('M = ')) if feature.lower() == 'y' else 0
print('-------------------')
print('char count =',c_count(file))
print('alphanumeric count =',alpha_count(file))
print('line count =',line_count(file))
print('word count =',word_count(file))
print('BoW =',BoW(file,feature,m))
# 6330282021 (30.00) 132 (2021-03-21 18:23)

def fhash(w,M):
    """Return the base-37 polynomial hash of *w* modulo *M* (first character has weight 1)."""
    weighted = (ord(ch) * (37 ** idx) for idx, ch in enumerate(w))
    return sum(weighted) % M
    
def words_in_line(line):
    """Split *line* into words: maximal runs of ASCII letters and digits."""
    def keep(ch):
        return 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9'
    spaced = ''.join(ch if keep(ch) else ' ' for ch in line)
    return spaced.split()

def remove_stopwords(line,stop_words):
    """Return the lower-cased words of *line* that are not in *stop_words*, in order."""
    tokens = words_in_line(line.lower().strip())
    return [token for token in tokens if token not in stop_words]

def count(line,word):
    """Return how often *word* occurs in the list *line*, removing every occurrence from it.

    The list is modified in place; the other elements keep their order.
    """
    occurrences = line.count(word)
    for _ in range(occurrences):
        line.remove(word)
    return occurrences

#------------------------------------
# Interactive driver: prompts, statistics, then the (optionally hashed) bag of words.
file_name = input('File name = ')
ans = input('Use feature hashing ? (y,Y,n,N) ').lower()
while ans not in ('y', 'n'):
    print('Try again.')
    ans = input('Use feature hashing ? (y,Y,n,N) ').lower()
if ans == 'y':
    m = int(input('M = '))

print('-------------------')

#------------------------------------
# load the stop words (lower-cased, whitespace separated)
sw_file = open('stopwords.txt','r')
stop_words = []
for l in sw_file:
    stop_words.extend(l.lower().split())
sw_file.close()

#------------------------------------
# scan the input file once, line by line
f = open(file_name,'r')
char_c = al_c = line_c = word_c = 0
words = []        # per line: its words without stop words
line_list = []    # every word of the file, lower-cased
for line in f:
    line = line.strip()
    line_c += 1
    char_c += len(line)
    w = words_in_line(line)
    word_c += len(w)
    al_c += sum(len(ww) for ww in w)
    line_list.extend(ww.lower() for ww in w)
    words.append(remove_stopwords(line,stop_words))

print('char count =',char_c)
print('alphanumeric count =',al_c)
print('line count =',line_c)
print('word count =',word_c)

# flatten the per-line lists and collect the distinct words in first-seen order
line_list_no_stopword = [w2 for w1 in words for w2 in w1]
words_list = []
for w2 in line_list_no_stopword:
    if w2 not in words_list:
        words_list.append(w2)

if ans == 'y':
    # count occurrences per hash bucket
    line_list_fhash = [fhash(k,m) for k in line_list_no_stopword]
    word_list_fhash = []
    for kk in words_list:
        word_code2 = fhash(kk,m)
        if word_code2 not in word_list_fhash:
            word_list_fhash.append(word_code2)
    BoW = [[d, count(line_list_fhash,d)] for d in word_list_fhash]
else:
    # count occurrences per word
    BoW = [[d, count(line_list,d.lower())] for d in words_list]
BoW.sort()

print('BoW =', BoW)

f.close()
# 6330283721 (22.95) 133 (2021-03-22 18:39)
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    acc=0
    power=1  # 37**i for the current position
    for ch in w:
        acc+=ord(ch)*power
        power*=37
    return acc%M
#--------------------------------------------------------------------------------------------
file_name = input('File name = ')
# Read both files up front; ``with`` closes the handles, which used to be
# dropped when the names were rebound to the file contents.
with open(file_name,'r') as src:
    a=src.read()
with open('stopwords.txt','r') as src:
    b=src.read()
#--------------------------------------------------------------------------------------------
# The listed punctuation marks and line breaks act as word separators.
# NOTE(review): other punctuation (e.g. '!' or '?') stays attached to words;
# confirm against the exercise statement whether it should split as well.
separators="\"\'/\\,.:;()[]{}\n"
to_space=str.maketrans(separators," "*len(separators))
t=a.translate(to_space).split()
s=b.translate(to_space).split()

newline_total=a.count("\n")
# Characters other than line breaks.
char_total=len(a)-newline_total
# A last line without a terminating newline still counts; an empty file has
# zero lines (indexing a[-1] used to raise IndexError for it).
line_total=newline_total+(1 if a and not a.endswith("\n") else 0)
# Count real ASCII letters/digits; the length of the joined tokens also
# counted punctuation that is not in the separator list.
alnum_total=sum(1 for ch in a if ch.isascii() and ch.isalnum())

b2={w.lower() for w in s}
b1=[w.lower() for w in t if w.lower() not in b2]
#--------------------------------------------------------------------------------------------
# Only the four exact answers are accepted; a substring test against 'nN'
# also accepted an empty answer.
while True:
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    if fh in ('y','Y','n','N'):
        break
    print('Try again.')

if fh in ('y','Y'):
    M = int(input('M = '))
    keys=[fhash(w,M) for w in b1]
else:
    keys=b1
counts={}
for key in keys:
    counts[key]=counts.get(key,0)+1

print('-------------------')
print('char count =',char_total)
print('alphanumeric count =',alnum_total)
print('line count =',line_total)
print('word count =',len(t))
# Sorted in both modes; the un-hashed BoW used to be printed in first-seen order.
print('BoW =',sorted([key,n] for key,n in counts.items()))

    
# 6330284321 (30.00) 134 (2021-03-21 20:15)
# Interactive driver: prompts, statistics, then the (optionally hashed) bag of words.
fileName = input("File name = ")
M = -1
useHash = False  # explicit flag, so that entering -1 as M no longer disables hashing
while True:
  tp = input("Use feature hashing ? (y,Y,n,N) ")
  if tp == 'Y' or tp == 'y':
    M = int(input("M = "))
    useHash = True
    break
  elif tp == 'N' or tp == 'n':
    break
  else:
    print("Try again.")

print("-------------------")

ualpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
lalpha = "abcdefghijklmnopqrstuvwxyz"
num = "0123456789"

stringList = []  # every word of the file, lower-cased, in file order
charCount, alphaCount, lineCount, wordCount = 0, 0, 0, 0
with open(fileName, "r") as f:
  for line in f:
    lineCount += 1
    # Count the line without its terminator.  The previous "start at 1 and
    # subtract 1 per line" scheme was only right when the last line had no
    # trailing newline, and reported 1 for an empty file.
    charCount += len(line.rstrip("\n"))
    cur = ""
    # The extra space flushes a word that ends exactly at the end of the line.
    for c in line + " ":
      if c in ualpha:
        cur += lalpha[ualpha.index(c)]
      elif c in lalpha or c in num:
        cur += c
      elif cur != "":
        stringList.append(cur)
        alphaCount += len(cur)
        cur = ""
wordCount = len(stringList)

print("char count =", charCount)
print("alphanumeric count =", alphaCount)
print("line count =", lineCount)
print("word count =", wordCount)

stopwords = []
with open("stopwords.txt", "r") as f:
  for line in f:
    stopwords.extend(line.split())

counts = {}
for word in stringList:
  if word in stopwords:
    continue
  if useHash:
    # Horner evaluation of sum(ord(word[i]) * 37**i), reduced modulo M at each step.
    key = 0
    for c in word[::-1]:
      key = (key * 37 + ord(c)) % M
  else:
    key = word
  counts[key] = counts.get(key, 0) + 1

BoW = sorted([key, n] for key, n in counts.items())
print("BoW =", BoW)
# 6330285021 (28.00) 135 (2021-03-22 16:37)

def charcount(file) :
    """Return the number of characters in *file*, not counting newline characters."""
    # ``with`` closes the file; the old ``data.close()`` sat after ``return``
    # and was never executed.
    with open(file,"r") as data :
        return sum(1 for line in data for d in line if d != "\n")
    
def alphacount(file) :
    """Return how many characters of *file* are letters a-z (after lower-casing) or digits 0-9."""
    num = 0
    # ``with`` closes the file; the old ``data.close()`` sat after ``return``
    # and was never executed.
    with open(file,"r") as data :
        for line in data :
            for d in line :
                d = d.lower()
                if "a" <= d <= "z" or "0" <= d <= "9":
                    num += 1
    return num
def linecount(file) :
    """Return the number of lines in *file*."""
    # ``with`` closes the file; the old ``data.close()`` sat after ``return``
    # and was never executed.
    with open(file,"r") as data :
        return sum(1 for line in data)
    
def wordcount(file) :
    """Return the lower-cased words of *file* as a list (callers take ``len`` for the count).

    A word is a maximal run of ASCII letters and digits; every other
    character, including the newline, separates words.
    """
    pieces = []
    # ``with`` closes the file; the old ``data.close()`` sat after ``return``
    # and was never executed.
    with open(file,"r") as data :
        for line in data :
            for d in line :
                if "a" <= d <= "z" or "A" <= d <= "Z" or "0" <= d <= "9" :
                    pieces.append(d.lower())
                else :
                    pieces.append(" ")
    return "".join(pieces).split()
def BoWnohash(file) :
    """Return the bag of words of *file* as ``[[word, count], ...]`` sorted by word.

    Words come from ``wordcount(file)``; words listed in ``stopwords.txt``
    (lower-cased, whitespace separated) are skipped.
    """
    data = wordcount(file)
    s = open("stopwords.txt","r")
    sf = []
    for line in s :
        sf.extend(line.lower().split())
    s.close()
    tally = {}
    for d in data :
        if d not in sf :
            tally[d] = tally.get(d, 0) + 1
    return [[word, tally[word]] for word in sorted(tally)]
def BoWhash(file,M) :
    """Return the feature-hashed bag of words of *file*.

    Parameters:
        file: path of the text file.
        M:    hash modulus; an int or a numeric string (converted with ``int``).

    Returns:
        ``[[hash, count], ...]`` sorted by hash, where the hash of a word is
        ``sum(ord(word[i]) * 37**i) % M``.  Words come from ``wordcount(file)``
        and words listed in ``stopwords.txt`` are skipped.  An input without
        any countable word yields ``[]``; the previous sentinel based on
        ``max(final)`` raised ValueError in that case.
    """
    M = int(M)
    data = wordcount(file)
    with open("stopwords.txt","r") as s :
        sf = set(s.read().lower().split())
    tally = {}
    for c in data :
        if c in sf :
            continue
        code = sum(ord(d)*(37**a) for a, d in enumerate(c)) % M
        tally[code] = tally.get(code, 0) + 1
    return [[code, tally[code]] for code in sorted(tally)]
    
# Interactive driver: re-ask until the hashing answer is valid, then print the report.
file = input("File name = ")
a = True
while a :
    feature = input("Use feature hashing ? (y,Y,n,N) ")
    if feature not in ("y", "Y", "n", "N") :
        print("Try again.")
        continue
    hashing = feature in ("y", "Y")
    if hashing :
        # Kept as text here; BoWhash converts it to int.
        M = input("M = ")
    print("-------------------")
    print("char count =",charcount(file))
    print("alphanumeric count =",alphacount(file))
    print("line count =",linecount(file))
    print("word count =",len(wordcount(file)))
    if hashing :
        print("BoW =",BoWhash(file,M))
    else :
        print("BoW =",BoWnohash(file))
    a = False
# 6330286621 (30.00) 136 (2021-03-21 06:33)

def fhash(w,M):
    """Return the base-37 polynomial hash of *w* modulo *M*, evaluated with Horner's rule."""
    G=37
    f=0
    for ch in w[::-1]:
        f=f*G+ord(ch)
    return f%M


# Interactive driver: prompts, single pass over the file, then the report.
file_name=input('File name = ')
while True :
    Fh=input('Use feature hashing ? (y,Y,n,N) ')
    answer=Fh.upper()
    if answer=='Y' :
        M=int(input('M = '))
        break
    if answer=='N' :
        break
    print('Try again.')


# stop words: whitespace-separated tokens, used exactly as written in the file
fs = open('stopwords.txt', 'r')
stopw=[]
for line in fs:
    stopw.extend(line.strip().split())
fs.close()


f = open(file_name, 'r')
char_count,alphanumeric_count=0,0
line_count,word_count=0,0
BoW=[]        # distinct non-stop words in first-seen order
BoWc=[]       # BoWc[i] is the number of occurrences of BoW[i]
allwords=[]   # every non-stop word, lower-cased, in file order
for line in f:
    line=line.strip()
    char_count+=len(line)
    alphanumeric_count+=sum(1 for i in line if i.isalnum())
    nline=''.join(i if i.isalnum() else ' ' for i in line).split()
    word_count+=len(nline)
    line_count+=1
    for j in nline:
        low=j.lower()
        if low in stopw:
            continue
        allwords.append(low)
        if low in BoW:
            BoWc[BoW.index(low)]+=1
        else:
            BoW.append(low)
            BoWc.append(1)

f.close()

BoW2list=[[word,times] for word,times in zip(BoW,BoWc)]

if Fh.upper()=='Y' :
    # replace the word-based list by one keyed on hash buckets
    p=[]
    BoW2list=[]
    for w in allwords:
        code=fhash(w,M)
        if code in p:
            BoW2list[p.index(code)][1]+=1
        else:
            p.append(code)
            BoW2list.append([code,1])
BoW2list.sort()

print('char count =',char_count)
print('alphanumeric count =',alphanumeric_count)
print('line count =',line_count)
print('word count =',word_count)
print('BoW =',BoW2list)


# 6330288921 (15.00) 137 (2021-03-22 15:57)
#-------------------------------------------------

# Interactive driver: prompts, statistics, then the (optionally hashed) bag of words.
file_name = input('File name = ')
fileee = input("Use feature hashing ? (y,Y,n,N) ")
while fileee not in ('n', 'N', 'y', 'Y') :
    print('Try again.')
    fileee = input("Use feature hashing ? (y,Y,n,N) ")
use_hashing = fileee in ('y', 'Y')
if use_hashing :
    # As before, ``fileee`` holds the modulus once hashing is chosen.
    fileee = int(input('M = '))
    M = fileee
#-------------------------------------------------
# ``with`` closes both files; the first handle on the input file used to be
# lost when the name was rebound.
with open('stopwords.txt', 'r') as stw :
    stop_words = set(stw.read().lower().split())
with open(file_name, 'r') as file :
    lines = file.readlines()
#-------------------------------------------------

# Character count: the whole text stripped of leading/trailing whitespace,
# not counting line breaks.
file1 = ''.join(lines).strip()
f1 = len(file1) - file1.count('\n')

# One scan gives the alphanumeric count and the words.  A word is a maximal
# run of ASCII letters/digits; the earlier whole-token comparisons such as
# "a" <= j <= "z" dropped tokens like "zoo" or "95" and never split "a,b".
c1 = 0
words = []
for line in lines :
    current = ''
    for j in line :
        if "0" <= j <= "9" or "A" <= j <= "Z" or "a" <= j <= "z" :
            c1 += 1
            current += j
        elif current :
            words.append(current.lower())
            current = ''
    if current :
        words.append(current.lower())

count2 = len(lines)
d = len(words)

# Bag of words (previously never produced): stop words removed, keys are the
# words or, when hashing, sum(ord(word[i]) * 37**i) % M.
counts = {}
for word in words :
    if word in stop_words :
        continue
    if use_hashing :
        key = sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M
    else :
        key = word
    counts[key] = counts.get(key, 0) + 1
BoW = sorted([key, n] for key, n in counts.items())

print('-------------------')
print("char count =",f1)
print("alphanumeric count =",c1)
print("line count =",count2)
print("word count =",d)
print("BoW =",BoW)
# 6330289521 (27.75) 138 (2021-03-22 22:41)
#---------------------------------------------
def fhash(w,M):
    """Return the base-37 polynomial hash of w reduced modulo M.

    The character at index i contributes ord(char) * 37**i.
    """
    total=0
    for position,char in enumerate(w):
        total+=ord(char)*(37**position)
    return total%M
#---------------------------------------------
def remove_expression(x):
    """Return x with every listed punctuation character replaced by a space.

    Membership is tested per character against the entries of `ex`, so a
    multi-character entry can never match.
    """
    ex=[':',';','.',',','[',']','(',')','{','}',"'",'"','<','>',"?",'+','-','*','#','$','=','-','%','^','&','/','\\','_','$','%','@','!','^','à¸¿']
    return ''.join(" " if c in ex else c for c in x)
#---------------------------------------------
def bow(l,s):
    """Return the lower-cased words of line l that are not contained in s.

    Punctuation is blanked by remove_expression before the line is split on
    whitespace; the original word order and duplicates are kept.
    """
    tokens=remove_expression(l.lower()).split()
    return [token for token in tokens if token not in s]
#---------------------------------------------
def freq(x):
    """Sort x in place and return [[item, count], ...] in sorted order.

    Parameters:
        x: list of mutually comparable items (it is sorted in place).
    Returns:
        One [item, count] pair per distinct item; [] when x is empty
        (the previous version raised IndexError on an empty list).
    """
    x.sort()
    out=[]
    for item in x:
        # After sorting, equal items are adjacent: extend the current run
        # or start a new one.
        if out and out[-1][0]==item:
            out[-1][1]+=1
        else:
            out.append([item,1])
    return out
#---------------------------------------------
def line_count():
    """Print the number of lines in the file whose path is the global k."""
    # `with` closes the file even if reading raises.
    with open(k,'r') as file_name:
        linecount=sum(1 for _ in file_name)
    print('line count =',linecount)
#---------------------------------------------
def word_count():
    """Print the number of words in the file whose path is the global k.

    Each line has its punctuation blanked by remove_expression and is then
    split on whitespace.
    """
    wordcount=0
    # `with` closes the file even if reading raises.
    with open(k,'r') as file_name:
        for line in file_name:
            wordcount+=len(remove_expression(line.rstrip("\n")).split())
    print('word count =',wordcount)
#---------------------------------------------
def char_count():
    """Print the number of characters, excluding line-ending newlines, in the file named by the global k."""
    # `with` closes the file even if reading raises.
    with open(k,'r') as file_name:
        charcount=sum(len(line.rstrip("\n")) for line in file_name)
    print('char count =',charcount)
#---------------------------------------------
def alpha_count():
    """Print how many characters of the file named by the global k are a-z or 0-9 after lower-casing."""
    alphacount=0
    # `with` closes the file even if reading raises.
    with open(k,'r') as file_name:
        for line in file_name:
            for c in line.rstrip("\n").lower():
                if 'a'<=c<='z' or '0'<=c<='9':
                    alphacount+=1
    print('alphanumeric count =',alphacount)
#---------------------------------------------
def frequency():
    """Print the bag of words of the file named by the global k.

    Reads the globals k (file path), x (stop words), Feature (the y/Y/n/N
    answer) and, when hashing is requested, M. Words come from bow() and are
    counted with freq(). With hashing, words are replaced by fhash(word, M)
    and the counts of words sharing a hash are added together.
    Returns None.
    """
    p=[]
    # `with` closes the file on every exit path, including the early return.
    with open(k,'r') as file_name:
        for line in file_name:
            p+=bow(line.rstrip("\n"),x)
    if len(p)==0:
        return print("BoW = []")
    o=freq(p)
    if Feature =="Y" or Feature =="y":
        n=sorted([fhash(a,M),b] for a,b in o)
        r=[]
        # Merge adjacent entries with the same hash. This also handles a
        # single entry, which the previous n[-2] lookup crashed on.
        for hashed,count in n:
            if r and r[-1][0]==hashed:
                r[-1][1]+=count
            else:
                r.append([hashed,count])
        print("BoW =",r)
    else:
        print("BoW =",o)
#---------------------------------------------
# Read the file name; the reporting functions read it through the global k.
file_name=input("File name = ")
k=file_name
a=['y','Y','n','N']
# Ask until the answer is one of the accepted choices.
while True:
    Feature=input("Use feature hashing ? (y,Y,n,N) ")
    if Feature in a:
        break
    print("Try again.")
if Feature =="Y" or Feature =="y":
    M=int(input('M = '))
# Load the stop words into x (read by frequency()); the file is closed by `with`.
x=[]
with open("stopwords.txt","r") as stopwords:
    for line in stopwords:
        x+=remove_expression(line).split()
# The report is the same for both answers; frequency() itself checks Feature.
print('-------------------')
char_count()
alpha_count()
line_count()
word_count()
frequency()

# 6330290021 (28.00) 139 (2021-03-22 21:18)
# Open the input file right away; despite its name, file_name is the open
# file object, not the path. `order` holds the raw feature-hashing answer.
file_name = open(input('File name = '),'r')
order = input('Use feature hashing ? (y,Y,n,N) ')
def word(line):
    """Lower-case line and replace every character that is not a-z or 0-9 with a space."""
    digits = '0123456789'
    return ''.join(
        ch if ('a' <= ch <= 'z' or ch in digits) else ' '
        for ch in line.lower()
    )
#-------------------------------------------------
def stop_words ():
    """Return the list of words in stopwords.txt, normalised with word()."""
    stop = []
    # `with` closes the file even if reading raises.
    with open('stopwords.txt','r') as stop_file:
        for line in stop_file:
            stop += word(line).split()
    return stop
#-------------------------------------------------
def fhash(w,M):
    """Polynomial string hash: sum of ord(w[i]) * 37**i over all i, modulo M."""
    G = 37
    return sum(ord(ch)*G**i for i, ch in enumerate(w)) % M
#-------------------------------------------------
def BoW(n):
    """Return the sorted bag of words [[word, count], ...] of text n.

    n is normalised with word() and split on whitespace; words returned by
    stop_words() are left out.
    """
    # Read the stop words once, not once per token as before.
    stop = set(stop_words())
    counts = {}
    for token in word(n).split():
        if token not in stop:
            counts[token] = counts.get(token, 0) + 1
    return sorted([token, total] for token, total in counts.items())
#-------------------------------------------------
def fhash_bow(n,m):
    """Return the sorted hashed bag of words [[hash, count], ...] of text n.

    n is normalised with word() and split on whitespace; words returned by
    stop_words() are left out and every remaining word is replaced by
    fhash(word, m) before counting.
    """
    # Read the stop words once, not once per token as before.
    stop = set(stop_words())
    counts = {}
    for token in word(n).split():
        if token not in stop:
            hashed = fhash(token, m)
            counts[hashed] = counts.get(hashed, 0) + 1
    return sorted([hashed, total] for hashed, total in counts.items())
#-------------------------------------------------        
def word_count(line):
    """Return how many whitespace-separated tokens word(line) contains."""
    tokens = word(line).split()
    return len(tokens)
#-------------------------------------------------
# Re-prompt until the answer is exactly one of y, Y, N, n.
while order not in ['y','Y','N','n']:
    print('Try again.')
    order = input('Use feature hashing ? (y,Y,n,N) ')
try:
    use_hash = order == 'Y' or order == 'y'
    if use_hash:
        m = int(input('M = '))
    print('-'*19)
    s = ''            # normalised text of the whole file, fed to the BoW builder
    char = 0
    alphanumeric= 0
    line_c = 0
    w_count = 0
    for line in file_name:
        # word() turns the trailing newline into a space, keeping lines separated in s.
        s += word(line)
        if line[-1:] == '\n':
            line = line[:-1]
        line_c += 1
        char += len(line)
        tokens = word(line).split()
        w_count += len(tokens)
        alphanumeric += len(''.join(tokens))
    # The bag is built before anything is printed, as the hashing branch
    # already did; the two branches now share one code path.
    bag = fhash_bow(s,m) if use_hash else BoW(s)
    # Fixed: the non-hashing branch printed the builtin `chr` instead of `char`.
    print('char count =',char)
    print('alphanumeric count =',alphanumeric)
    print('line count =',line_c)
    print('word count =',w_count)
    print('BoW =',bag)
except Exception:
    # Best-effort behaviour kept: any failure prints an empty line. Narrowed
    # from a bare except so KeyboardInterrupt/SystemExit are not swallowed.
    print('')
finally:
    file_name.close()
# 6330291721 (30.00) 140 (2021-03-21 16:29)
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M); M may be an int or a numeric string."""
    G = 37
    total = 0
    for index, char in enumerate(w):
        total += ord(char)*(G**index)
    return total % int(M)
def get_unique(words):
    """Return the items of words in first-seen order, dropping later items equal to an earlier one."""
    seen = []
    for item in words:
        if item in seen:
            continue
        seen.append(item)
    return seen
# Prompt for the file and the feature-hashing choice (re-asked until valid).
file_name = input('File name = ')
way = input('Use feature hashing ? (y,Y,n,N) ')
while way not in ['y','Y','n','N']:
    print('Try again.')
    way = input('Use feature hashing ? (y,Y,n,N) ')
char_count = 0
alphanumeric_count = 0
line_count = 0
word = []   # every lower-cased word of the file, in reading order
BoW = []
infile = open(file_name, "r")
for line in infile:
    # Statistics are taken on the whitespace-stripped line.
    char_count += len(line.strip())
    line_count += 1
    w = line.lower().strip()
    for i in range(len(line.strip())):
        if line.strip()[i].isalnum() == True:
            alphanumeric_count += 1
        else:
            # Blank out the non-alphanumeric character at the same index in
            # the lower-cased copy so that split() separates the words.
            w = w[:i:] + ' ' + w[i+1::]
    for i in w.split():
        word.append(i)                
infile.close()
word_count = len(word)
if way == 'y' or way == 'Y':
    # M stays a string here; fhash converts it with int().
    M = input('M = ')
    print('-------------------')
    print('char count =',char_count)
    print('alphanumeric count =',alphanumeric_count)
    print('line count =',line_count)
    print('word count =',word_count)
    stopword = []
    stopwords = open('stopwords.txt','r')
    for line in stopwords:
        for i in line.strip().split():
            stopword.append(i)
    stopwords.close()
    # One [word, count] entry per occurrence of each non-stop word.
    for k in range(len(word)):
        c = 0
        if word[k] not in stopword:
            for i in range(len(word)):
                if word[k] == word[i]:
                    c += 1
            BoW.append([word[k],c])
    # Hash every occurrence, then count how often each hash value appears.
    BoWfh = []
    for i in range(len(BoW)):
        BoWfh.append(fhash(BoW[i][0],M))
    BoW = []
    for k in range(len(BoWfh)):
        c = 0
        for i in range(len(BoWfh)):
            if BoWfh[k] == BoWfh[i]:
                c += 1
        BoW.append([BoWfh[k],c])        
    # Drop the repeated entries; the result keeps first-occurrence order.
    BoW = get_unique(BoW)
    print('BoW =',BoW)
    
elif way == 'n' or way == 'N':
    print('-------------------')
    print('char count =',char_count)
    print('alphanumeric count =',alphanumeric_count)
    print('line count =',line_count)
    print('word count =',word_count)
    stopword = []
    stopwords = open('stopwords.txt','r')
    for line in stopwords:
        for i in line.strip().split():
            stopword.append(i)
    stopwords.close()
    # One [word, count] entry per occurrence of each non-stop word.
    for k in range(len(word)):
        c = 0
        if word[k] not in stopword:
            for i in range(len(word)):
                if word[k] == word[i]:
                    c += 1
            BoW.append([word[k],c])
    # Drop the repeated entries; the result keeps first-occurrence order.
    BoW = get_unique(BoW)
    print('BoW =',BoW)


# 6330292321 (24.80) 141 (2021-03-22 17:02)
def fhash(w,m):
    """Hash w as sum(ord(w[i]) * 37**i) and reduce the result modulo m."""
    weight = 1
    ans = 0
    for ch in w:
        ans += ord(ch)*weight
        weight *= 37
    return ans%m
def d_word(line):
    """Return the words of clean(line) that are not listed in stopwords.txt.

    The stop-word file is read on every call; `with` now closes it each time
    (the previous version never closed the handle).
    """
    stopword = []
    with open('stopwords.txt','r') as stop_file:
        for row in stop_file:
            stopword.extend(row.split())
    return [token for token in clean(line).split() if token not in stopword]
def clean(line):
    """Normalise line for word splitting.

    Apostrophes become spaces; spaces, ASCII digits and ASCII letters are
    kept (letters lower-cased); every other character is removed.
    """
    kept = []
    for char in line:
        if char=="'":
            kept.append(' ')
            continue
        code = ord(char)
        if code==32 or 48<=code<=57 or 65<=code<=90 or 97<=code<=122:
            kept.append(char.lower())
    return ''.join(kept)
def r_count(l):
    """Return the sorted list of [item, occurrences] for the items of l.

    Parameters:
        l: sequence of hashable, mutually comparable items (words or hashes).
    Returns:
        [] when l is empty or None, otherwise one [item, count] pair per
        distinct item, sorted.
    """
    # Explicit emptiness check instead of the former bare `except` around l[0].
    if not l:
        return []
    counts = {}
    for item in l:
        counts[item] = counts.get(item, 0) + 1
    return sorted([item, total] for item, total in counts.items())
file_name = input('File name = ')
c = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against a list of answers: the former `c not in 'YyNn'` was a
# substring test and accepted '' or 'Yy' as valid input.
while c not in ['Y', 'y', 'N', 'n']:
    print('Try again.')
    c = input('Use feature hashing ? (y,Y,n,N) ')
sc = []        # non-stop words of the whole file, in reading order
c_count = 0
l_count = 0
w_count = 0
a_count = 0
# `with` closes the input file, which was previously left open.
with open(file_name,'r') as file:
    for line in file:
        cleaned = clean(line)
        # Characters are counted without the line's trailing newline.
        c_count += len(line.rstrip('\n'))
        l_count += 1
        # clean() keeps only spaces, digits and letters, so every non-space
        # character left is alphanumeric.
        a_count += sum(1 for ch in cleaned if ch != ' ')
        w_count += len(cleaned.split())
        sc.extend(d_word(cleaned))
if c == 'Y' or c =='y':
    m = int(input('M = '))
    h = [fhash(i,m) for i in sc]
else:
    h = list(sc)
bow = r_count(h)
print('-------------------')
print('char count =', c_count)
print('alphanumeric count =', a_count)
print('line count =', l_count)
print('word count =', w_count)
print('BoW =', bow)
# 6330293021 (30.00) 142 (2021-03-21 23:46)
def flash(word , M):
    """Return the base-37 polynomial hash of word modulo M (index i weighs 37**i)."""
    return sum(ord(ch) * (37**i) for i, ch in enumerate(word)) % M

#----------------------------------------------------------------------
def words(line):
    """Return line with every character that is not an ASCII letter or digit replaced by a space."""
    return ''.join(
        e if ('0' <= e <= '9' or 'a' <= e <= 'z' or 'A' <= e <= 'Z') else ' '
        for e in line
    )

#----------------------------------------------------------------------
def readFilename(Filename):
    """Print the four text statistics of an open file and return its non-stop words.

    Filename is iterated line by line. The returned words are lower-cased,
    in reading order, and exclude entries of the global `stopwords`.
    """
    char_total = 0
    alnum_total = 0
    line_count = 0
    pieces = []
    for line in Filename:
        char_total += len(line.strip('\n'))
        alnum_total += sum(
            1 for e in line
            if '0'<=  e <= '9' or 'a'<= e <= 'z' or 'A'<= e <= 'Z'
        )
        pieces.append(words(line))
        line_count += 1
    # Joining with spaces keeps words from different lines apart.
    tokens = ' '.join(pieces).split()
    print('char count =' , char_total )
    print('alphanumeric count =' , alnum_total)
    print('line count =' ,line_count)
    print('word count =', len(tokens))
    new_words = []
    for token in tokens:
        normal = token.lower().strip()
        if normal not in stopwords:
            new_words.append(normal)
    return new_words

#----------------------------------------------------------------------
# Load the stop words (read by readFilename through the global `stopwords`).
stopwords = []
with open('stopwords.txt' , 'r') as stoptext:
    for line in stoptext:
        stopwords.extend(line.split())

Filename = open(input('Filename = '), 'r' )

cmd = input('Use feature hashing ? (y,Y,n,N) ')

while cmd.lower() != 'n' and cmd.lower() != 'y':
    print('Try again.')
    cmd = input('Use feature hashing ? (y,Y,n,N) ')

use_hash = cmd.lower() == 'y'
if use_hash:
    M = int(input('M = '))

# The separator is now printed for both answers; previously only the hashing
# branch printed it, so the two modes produced differently shaped reports.
print('-------------------')
try:
    new_words = readFilename(Filename)
finally:
    Filename.close()

# Count either the hashed words or the words themselves.
keys = [flash(word , M) for word in new_words] if use_hash else new_words
counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
bow_list = [[key, total] for key, total in counts.items()]
print('BoW =', sorted(bow_list))

# 6330294621 (10.66) 143 (2021-03-22 02:09)

def fhash(w,M):
    """Return sum(ord(char) * 37**position) over w, modulo M."""
    G = 37
    sume = 0
    for c, e in enumerate(w):
        sume += ord(e)*(G**c)
    return sume%M
def countword(a,b):
    """Return how many elements of sequence b are equal to a."""
    return sum(1 for i in range(len(b)) if a == b[i])

# sample.txt

file_name = input('File Name = ').strip()
# sw holds the whole text with each line's newline replaced by one space.
sw = ''
linecount = 0
with open(file_name,'r') as f:
    for line in f:
        # rstrip('\n') instead of line[:-1]: a last line without a trailing
        # newline used to lose its final character.
        sw += line.rstrip('\n') + ' '
        linecount += 1
# One space was appended per line, so subtract them again.
char_count = len(sw) - linecount
aln = ''
for i in range(len(sw)):
    if 'a' <= sw[i].lower() <= 'z' or '0' <= sw[i] <= '9':
       aln += sw[i]
alphanumericcount = len(aln)
# Words: drop , . " ' and split on whitespace.
swnop = ''
for i in range (len(sw)):
    if sw[i] not in [',','.','"',"'"]:
       swnop += sw[i]
swnop = swnop.strip()
word = swnop.split()
word_count = len(word)

# Stop words, read the same way (the file is now closed after reading).
st = ''
with open('stopwords.txt','r') as stopwordsop:
    for line in stopwordsop:
        st += line.rstrip('\n') + ' '
stopwords = st.split()

# wnst: lower-cased non-stop words; norpwnst: the same without repeats.
wnst = []
for i in range(len(word)):
    if word[i].lower() not in stopwords :
       wnst.append(word[i].lower())
       
norpwnst = []
for i in range(len(wnst)):
    if wnst[i] not in norpwnst:
       norpwnst.append(wnst[i])
wnstst = ''.join(wnst)


hon = input('Use feature hashing ? (y,Y,n,N) ')
while hon not in  ['y','Y','n','N']:
   print('Try again.')
   hon = input('Use feature hashing ? (y,Y,n,N) ')
if hon in ['y','Y']:
    M = int(input('M = '))
    c  = []
    for i in range(len(wnst)):
        c.append(fhash(wnst[i],M))
    d = []
    for i in range(len(c)):
        if c[i] not in d:
           d.append(c[i])
    BoW = []
    for i in range (len(d)):
        BoW.append([d[i],countword(d[i],c)])
    BoW.sort()  
else:
    BoW = []
    for i in range (len(norpwnst)):
        BoW.append([norpwnst[i],countword(norpwnst[i],wnst)])
print('char count = ' + str(char_count))
print('alphanumeric count = ' + str(alphanumericcount))
print('line count = ' + str(linecount))
print('word count = ' + str(word_count))
# 'BoW =' without the trailing space: print() already inserts a separator,
# so the old label produced two spaces, unlike the four lines above.
print('BoW =' , BoW)
# 6330295221 (18.90) 144 (2021-03-22 15:10)
def fhash(w,M):
    """Return the polynomial hash of w (base 37, index i weighs 37**i) modulo M."""
    G = 37
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch)*G**(i)
    return total % M
def count( data, element ):
    """Return the number of items in data that are equal to element."""
    return sum(1 for e in data if e == element)

Alphabet = 'abcdefghijklmnopqrstuvwxyz'
alphabet = Alphabet.lower()
ALPHABET = Alphabet.upper()
# Fixed: '0' was missing, so zeros were treated as separators and not
# counted as alphanumeric.
numbers = '0123456789'
everyt = alphabet+ALPHABET+numbers
y = input('File name = ')

# Stop words, one list entry per whitespace-separated token.
stop_wordsl = []
with open('stopwords.txt', 'r') as stop_words:
    for line1 in stop_words:
        stop_wordsl.extend(line1.split())

char = 0
numeng = 0
linec = 0
olen = ''   # the text with every non-alphanumeric character blanked
with open(y , 'r') as file_name:
    for line2 in file_name:
        for e in line2:
            olen += e if e in everyt else ' '
        linec += 1
        # Character statistics are taken on the whitespace-stripped line.
        for e in line2.strip():
            char += 1
            if e in everyt:
                numeng += 1
word_count = len(olen.split())

# bow: lower-cased non-stop words in order; Bow: the same without repeats.
bow = [e for e in olen.lower().split() if e not in stop_wordsl]
Bow = []
for e in bow:
    if e not in Bow:
        Bow.append(e)

x = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against a list of answers: the former `x not in 'yYnN'` was a
# substring test and accepted '' or 'yY' as valid input.
while x not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = x == 'y' or x == 'Y'
if use_hash:
    z = int(input('M = '))
print('-------------------')
print('char count =', char)
print('alphanumeric count =', numeng)
print('line count =', linec)
print('word count =', word_count)
if use_hash:
    # Hashed bag: sorted distinct hash values with their occurrence counts.
    bowc = [fhash(e, z) for e in bow]
    BoW = [[e, count(bowc, e)] for e in sorted(set(bowc))]
else:
    # Plain bag: distinct words in first-occurrence order.
    BoW = [[e, count(bow, e)] for e in Bow]
print('BoW =', BoW)


# 6330296921 (30.00) 145 (2021-03-22 21:54)
def fharsh(w,M):
    """Return, as a string, the base-37 polynomial hash of w modulo int(M)."""
    modulus = int(M)
    total = sum(ord(c)*(37**n) for n, c in enumerate(w))
    return str(total%modulus)
#--------------------------------------------------------
# Read the file name and the (not yet validated) feature-hashing answer.
file_name = input('File name = ' )
fh_para = input('Use feature hashing ? (y,Y,n,N) ')
#--------------------------------------------------------
# Load the lower-cased stop words.
fs = open('stopwords.txt', 'r')
list_sw = []
for line in fs:
    line = line.lower()
    list_sw += line.split()
    
fs.close()
#---------------------------------------------------------
fn = open(file_name, 'r')
list_text = []          # every lower-cased word of the file
char_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
no_stop_word =[]        # list_text without stop words (repeats kept)
checkBeforeBag =[]      # distinct non-stop words in first-occurrence order
BoW = []

for line in fn:
    line_count += 1
    # Statistics are taken on the stripped, lower-cased line.
    line = line.strip()
    line = line.lower()
    
    for x in line :
        char_count += 1
    
    for c in range(len(line)):
        if line[c] in '0123456789abcdefghijklmnopqrstuvwxyz':
            alphanumeric_count += 1
        else:
            # Blank the separator in place so that split() isolates the words.
            line = line[0:c]+' '+line[c+1:]
            
    list_text += line.split()
    # Recomputed on every line; stays 0 when the file has no lines.
    word_count = len(list_text)
    
for i in list_text:
    if i in list_sw:
        pass
    else :
        no_stop_word.append(i)

for i in no_stop_word:
    if i not in checkBeforeBag:
        checkBeforeBag.append(i)
# Plain (un-hashed) bag of words: [word, occurrences in the whole text].
for i in checkBeforeBag:
    BoW.append([i,list_text.count(i)])

fn.close()
# Loop until a valid answer has been handled. The former condition
# `fh_para != 'n'or'N'or'y'or'Y'` was always true (the bare strings are
# truthy), so the intended meaning is spelled out as `while True`.
while True:
    if fh_para == 'n' or fh_para =='N':
        print('-------------------')
        print('char count =',char_count)
        print('alphanumeric count =',alphanumeric_count)
        print('line count =',line_count)
        print('word count =',word_count)
        BoW.sort()
        print('BoW =',BoW)
        break
    if fh_para == 'y' or fh_para =='Y':
        # M is passed on as typed; fharsh converts it with int().
        M = input('M = ')
        # Replace every non-stop word by its hash (fharsh returns a string).
        hashed_words = [fharsh(i,M) for i in no_stop_word]
        no_stop_word[:] = hashed_words
        fharshed =[]
        for i in hashed_words:
            if i not in fharshed:
                fharshed.append(i)
        # Hashed bag: numeric hash value with its occurrence count, sorted.
        BoW = sorted([int(i),hashed_words.count(i)] for i in fharshed)

        print('-------------------')
        print('char count =',char_count)
        print('alphanumeric count =',alphanumeric_count)
        print('line count =',line_count)
        print('word count =',word_count)
        print('BoW =',BoW)
        break
    print('Try again.')
    fh_para = input('Use feature hashing ? (y,Y,n,N) ')
    
# 6330298121 (19.85) 146 (2021-03-22 23:25)

# Input data: file name and the feature-hashing choice.
file_name = input("file name = ")
b = input("Use feature hashing ? (y,Y,n,N) ")
# Re-ask until b is one of y, Y, n, N; fh records whether hashing was chosen.
while True:
    if b in ['y','Y','n','N']:
        if b == 'y' or b == 'Y':
            fh = True
        elif b == 'n' or b == 'N':
            fh = False
        break
    else :
        print("Try again")
        b = input("Use feature hashing ? (y,Y,n,N) ")
if fh:
    M = int(input("M = "))
print('-------------------')
stw = open("stopwords.txt","r")
fn = open(file_name)

# Counters and work lists used by the statements below.
charc = 0     # characters
alpc = 0      # alphanumeric characters
linec = 0     # lines
wcl = []      # words found in the file
wc = ''       # word currently being assembled
bowcl = []    # lower-cased words that are not stop words
bow1 = []     # work list for the hashed bag
bow2 = []     # work list for the plain bag
stwl = []     # stop words
               
#count               
for line in fn:
    for e in line.strip():
        charc += 1
        if 'a'<=e<='z' or 'A'<=e<='Z' or '0'<=e<='9':
            alpc += 1
            wc += e
        else :
            # A separator ends the word being assembled.
            if len(wc) != 0:
                wcl.append(wc)
            wc = ''
    # Fixed: the end of a line ends the current word too. Without this flush
    # a word at the end of a line was glued to the first word of the next
    # line, and the file's last word was dropped.
    if len(wc) != 0:
        wcl.append(wc)
    wc = ''
    linec += 1
fn.close()
wordc = len(wcl)

# Load the stop words, then keep the lower-cased words that are not among them.
for line in stw:
    stwl += (line.strip().split())
stw.close()
for e in wcl:
    d = e.lower()
    if d not in stwl:
        bowcl.append(d)
        
# func about BoW
def fhash(a,b):
    """Return sum(ord(a[i]) * 37**i) over the string a, modulo b."""
    total = 0
    for i, ch in enumerate(a):
        total += ord(ch)*37**i
    return total%b

# Build the bag of words as a sorted list of [key, count] pairs. A dict
# tally replaces the former M-element bucket list (memory proportional to M)
# and the quadratic list.count() scan; the printed result is the same.
if fh:
    tally = {}
    for e in bowcl:
        h = fhash(e,M)
        tally[h] = tally.get(h, 0) + 1
    bowsp1 = [[h, tally[h]] for h in sorted(tally)]
else:
    tally = {}
    for e in bowcl:
        tally[e] = tally.get(e, 0) + 1
    bowsp2 = [[w, tally[w]] for w in sorted(tally)]

# Print the statistics and the bag of words.
print("char count =",charc)
print("alphanumeric count =",alpc)
print("line count =",linec)
print("word count =",wordc)
if fh:
    print("BoW =",bowsp1)
else:
    print("BoW =",bowsp2)
# 6330299821 (30.00) 147 (2021-03-21 20:46)

#====================================================
# Characters treated as alphanumeric by alphanumericcount() and wordcount().
alphanumeric = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
def op_check():
    """Ask for the feature-hashing choice until it is y, Y, n or N.

    Returns (answer, M): M is the integer read from the 'M = ' prompt when
    hashing was chosen, and '' otherwise.
    """
    a = input('Use feature hashing ? (y,Y,n,N) ', )
    while a not in ('y', 'Y', 'n', 'N'):
        print('Try again.')
        a = input('Use feature hashing ? (y,Y,n,N) ', )
    if a in ('y', 'Y'):
        return a, int(input('M = ', ))
    return a, ''
def charcount(line):
    """Return the length of line after stripping leading and trailing whitespace."""
    return len(line.strip())
def alphanumericcount(line):
    """Return how many characters of line occur in the global `alphanumeric` string."""
    return sum(1 for ch in line if ch in alphanumeric)
def wordcount(line):
    """Split line into lower-cased alphanumeric words.

    Characters outside the global `alphanumeric` string act as separators.
    Returns (number of words, list of words).
    """
    cleaned = ''.join(ch.lower() if ch in alphanumeric else ' ' for ch in line)
    tokens = cleaned.strip().split()
    return len(tokens), tokens
def BagofWords(words,M):
    """Return the sorted bag [[key, count], ...] of the non-stop words in words.

    Entries of the global `stopwords` are skipped. When M is an int, each
    remaining word is replaced by fhash(word, M) before counting; otherwise
    the words themselves are the keys.
    """
    kept = [item for item in words if item not in stopwords]
    if type(M) == int:
        kept = [fhash(item, M) for item in kept]
    kept.sort()
    bag = []
    previous = None
    for position, item in enumerate(kept):
        # Equal items are adjacent after sorting: extend the run or open a new one.
        if position > 0 and item == previous:
            bag[-1][1] += 1
        else:
            bag.append([item, 1])
        previous = item
    return bag
def fhash(w,M):
    """Return the base-37 polynomial hash of w (index i weighs 37**i), modulo M."""
    G = 37
    return sum(ord(ch)*(G**i) for i, ch in enumerate(w)) % M

#----------------------------------------------------
# Read the file name and the hashing choice (M is '' when hashing is off).
file_name = input('File name = ', )
a, M = op_check()
x = open('stopwords.txt')
y = open(file_name)
stopwords = []   # read by BagofWords()
c = 0            # character count
d = 0            # alphanumeric count
f = 0            # line count
g = 0            # word count
words = []       # every lower-cased word of the file

for line in x:
    stopwords += line.strip().split()

for line in y:
    c += charcount(line)
    d += alphanumericcount(line)
    f += 1
    # wordcount() is evaluated twice per line: once for the count, once for the words.
    g += wordcount(line)[0]
    words += wordcount(line)[1]


print('-------------------')
print('char count =',c)
print('alphanumeric count =',d)
print('line count =',f)
print('word count =',g)
print('BoW =', BagofWords(words,M))

x.close()
y.close()
# 6330300721 (17.30) 148 (2021-03-20 23:58)
# Read the file name and the feature-hashing answer (validated further down).
file_name = input('File name = ')
q = input('Use feature hashing ? (y,Y,n,N) ')
# NOTE(review): this handle and the list f below do not appear to be used by
# the functions of this script; BoW_y/BoW_n open stopwords.txt themselves.
stopwords = open( 'stopwords.txt', 'r')
f = [ '(', ')', '-', '_', '[', ']' ,'"' ,"'" ,';', ':', '>', '<','.','/','\\']
# Characters counted as alphanumeric by the functions below.
alpha_up = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
alpha_low = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
number = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']


# Re-ask until the answer is one of Y, y, N, n.
while q not in ['Y', 'y','N','n'] :
    print('Try again.')
    q = input('Use feature hashing ? (y,Y,n,N) ')

if q == 'y' or q == 'Y' :
    M = int(input('M = '))
def l2_non(x) :
    """Return the elements of x in first-seen order without repeats."""
    unique = []
    for e in x :
        if e in unique :
            continue
        unique.append(e)
    return unique
def sortt(data) :
    """Return every integer from min(data) to max(data) inclusive, ascending."""
    lowest = min(data)
    highest = max(data)
    return list(range(lowest, highest+1))
def count( data, t ) :
    """Return how many elements of data are equal to t."""
    return sum(1 for e in data if e == t)
def ceep(data) :
    """Return [[v, occurrences of v in data], ...] for each integer v in min(data)..max(data).

    Values in that range that do not occur in data get a count of 0.
    Returns [] for empty data (the previous version raised ValueError from
    min()). Occurrences are tallied in one pass instead of rescanning data
    for every value in the range.
    """
    if not data :
        return []
    tally = {}
    for e in data :
        tally[e] = tally.get(e, 0) + 1
    return [[v, tally.get(v, 0)] for v in range(min(data), max(data)+1)]
def fhash(w,M) :
    """Return sum(ord(char) * 37**index) over w, modulo M (index starts at 0)."""
    total = 0
    for index, e in enumerate(w) :
        total += ord(e)*37**index
    return total%M
def char_count() :
    """Return the total length of the whitespace-stripped lines of the file named by the global file_name."""
    # `with` closes the file even if reading raises.
    with open( file_name , 'r') as file :
        return sum(len(line.strip()) for line in file)
    
def alphanumeric_count() :
    """Return how many characters of the file named by the global file_name are in alpha_up, alpha_low or number."""
    alcount = 0
    # `with` closes the file even if reading raises.
    with open( file_name , 'r') as file :
        for line in file :
            for e in line.strip() :
                if e in alpha_up or e in alpha_low or e in number :
                    alcount += 1
    return alcount
    
def word_count() :
    """Return the number of alphanumeric words in the file named by the global file_name.

    Characters outside alpha_up/alpha_low/number are separators. Words are
    counted line by line: the previous version concatenated the stripped
    lines without a separator, so the last word of a line merged with the
    first word of the next one.
    """
    total = 0
    with open( file_name , 'r') as file :
        for line in file :
            cleaned = ''.join(
                e if (e in alpha_up or e in alpha_low or e in number) else ' '
                for e in line.strip()
            )
            total += len(cleaned.split())
    return total
def BoW_y(q,M) :
    """Return the hashed bag of words [[hash, count], ...] sorted by hash.

    Parameters:
        q: the feature-hashing answer (not used by the computation).
        M: modulus passed to fhash.
    Words are lower-cased runs of characters from alpha_up/alpha_low/number
    read from the global file_name; words listed in stopwords.txt are skipped.
    Returns [] when no word remains (the previous version crashed in min()),
    and no longer walks every integer between the smallest and largest hash.
    """
    words = []
    with open( file_name , 'r') as file :
        for line in file :
            # The newline is blanked like any other separator, which keeps
            # words on different lines apart.
            cleaned = ''.join(
                e.lower() if (e in alpha_up or e in alpha_low or e in number) else ' '
                for e in line
            )
            words.extend(cleaned.split())

    s = set()
    with open( 'stopwords.txt' , 'r' ) as stop :
        for line in stop :
            s.update(line.split())

    tally = {}
    for e in words :
        if e not in s :
            h = fhash(e,M)
            tally[h] = tally.get(h, 0) + 1
    return [[h, tally[h]] for h in sorted(tally)]
def BoW_n(q) :
    """Return the bag of words [[word, count], ...] in first-occurrence order.

    Parameters:
        q: the feature-hashing answer (not used by the computation).
    Words are lower-cased runs of characters from alpha_up/alpha_low/number
    read from the global file_name; words listed in stopwords.txt are skipped.
    Lines are split separately: the previous version concatenated stripped
    lines without a separator, merging words across line breaks.
    """
    words = []
    with open( file_name , 'r') as file :
        for line in file :
            cleaned = ''.join(
                e.lower() if (e in alpha_up or e in alpha_low or e in number) else ' '
                for e in line.strip()
            )
            words.extend(cleaned.split())

    s = set()
    with open( 'stopwords.txt' , 'r' ) as stop :
        for line in stop :
            s.update(line.split())

    # dict keeps insertion order, so the bag lists words as first seen.
    tally = {}
    for e in words :
        if e not in s :
            tally[e] = tally.get(e, 0) + 1
    return [[e, total] for e, total in tally.items()]
    
    
# Final report. The labels no longer end in a space (print() already inserts
# one, so the old labels produced two), and the line count, which was missing
# from the report, is printed between the alphanumeric and word counts.
print('-------------------')
print('char count =' ,char_count())
print('alphanumeric count =',alphanumeric_count())
with open( file_name , 'r') as counted_file :
    print('line count =',sum(1 for _ in counted_file))
print('word count =',word_count())
if q == 'y' or q == 'Y' :
    print('BoW =',BoW_y(q,M))
else :
    print('BoW =',BoW_n(q))
# 6330301321 (30.00) 149 (2021-03-22 22:57)

# Base of the polynomial hash computed by fHash().
G=37

#-------------
def main():
    """Run the interactive report: read the file name, ask for the hashing mode, print statistics and BoW."""
    file_name = input('File name = ')

    stopwords = getWordsList('stopwords.txt')
    myWords = normalized(getWordsList(file_name),stopwords)

    # Ask until one of Y, y, N, n is entered.
    isfHash = input('Use feature hashing ? (y,Y,n,N) ')
    while isfHash not in ['Y','y','N','n'] :
        print('Try again.')
        isfHash = input('Use feature hashing ? (y,Y,n,N) ')

    hashing = isfHash in ['Y','y']
    if hashing :
        M = int(input('M = '))
    print('-------------------')
    report(file_name)
    if hashing :
        # Replace every word by its hash before building the bag.
        myWords = [fHash(w,M) for w in myWords]
    print('BoW =', bagWords(myWords))
        
#-------------
def getRidPunc(mystring) :
    """Return mystring with every character that is neither alphanumeric nor a space replaced by a space."""
    return ''.join(
        ch if (ch.isalnum() or ch == ' ') else ' '
        for ch in mystring
    )
def getWordsList(file_name) :
    """Return all words of the file at file_name, in reading order.

    Each line is passed through getRidPunc and split on whitespace. The file
    is closed by `with` even if reading raises.
    """
    wArr = []
    with open(file_name,'r') as f :
        for line in f :
            wArr.extend(getRidPunc(line).split())
    return wArr
def report(file_name) :
    """Print char, alphanumeric, line and word counts of the file at file_name.

    Newline characters are not counted as characters; words are the
    whitespace-separated tokens of getRidPunc(line). The file is closed by
    `with` even if reading raises.
    """
    chCount = 0
    alnumCount = 0
    wCount = 0
    lCount = 0
    with open(file_name,'r') as f :
        for line in f :
            visible = [ch for ch in line if ch != '\n']
            chCount += len(visible)
            alnumCount += sum(1 for ch in visible if ch.isalnum())
            wCount += len(getRidPunc(line).split())
            lCount += 1
    print('char count =',chCount)
    print('alphanumeric count =',alnumCount)
    print('line count =',lCount)
    print('word count =',wCount)
def normalized(wArr,stopwords) :
    """Return the lower-cased alphanumeric words of wArr whose lower-cased form is not in stopwords."""
    return [
        w.lower()
        for w in wArr
        if w.isalnum() and w.lower() not in stopwords
    ]
    
def bagWords(wArr) :
    """Sort wArr in place and return [[item, count], ...] for each distinct item in sorted order."""
    wArr.sort()
    BoW = []
    for item in wArr :
        # Equal items are adjacent after sorting: extend the current run or
        # open a new one.
        if BoW and BoW[-1][0] == item :
            BoW[-1][1] += 1
        else :
            BoW.append([item,1])
    return BoW
def fHash(w,M) :
    """Return the polynomial hash of w in base G (module-level constant), modulo M."""
    global G
    total = 0
    for i, ch in enumerate(w) :
        total += ord(ch)*(G**i)
    return total%M

#-------------
# Start the interactive program.
main()
# 6330302021 (20.00) 150 (2021-03-22 23:41)

# Open the input file; it is scanned here and read again by BoW() after seek(0).
file = open(input('File name = ', ),'r')
# typeinput = input('Use feature hashing ? (y,Y,n,N) ', )
listtypeinput = ['y','Y','n','N']   # accepted feature-hashing answers
char_count = 0
line_count = 0
alphanumeric_count = 0

for line in file:
    for ch in line:
        # Every character except the newline counts (len(ch) is always 1).
        if ch != '\n':
            char_count += len(ch)
        # ASCII A-Z, a-z or 0-9.
        if ((ord(ch) >= 65 and ord(ch) <= 90) or (ord(ch) >= 97 and ord(ch) <= 122)) or (ord(ch) >= 48 and ord(ch) <= 57):
            alphanumeric_count += 1      
    # NOTE(review): lines consisting only of a newline are not counted as lines.
    if line != "\n":
        line_count += 1
# Rewind so the file can be read again.
file.seek(0)
def BoW(file):
    """Build the bag of words of an open text file.

    Every character that is not an ASCII letter or digit acts as a word
    separator and words are lower-cased.

    Parameters:
        file: an open, seekable text file (or any object that iterates over
            lines and supports ``seek``); it is rewound to the start before
            returning.

    Returns:
        (BoWn, word_count) where BoWn is a list of [word, count] pairs sorted
        by word with stop words removed, and word_count is the total number
        of words in the file, stop words included.  An empty file gives
        ([], 0).
    """
    stopword = ['it','they','the','a','an','of','on','in','at','is','am','are','was','were']
    listword = []
    for line in file:
        cleaned = []
        for p in line:
            code = ord(p)
            if (65 <= code <= 90) or (97 <= code <= 122) or (48 <= code <= 57):
                cleaned.append(p)
            else:
                cleaned.append(' ')
        listword.extend(''.join(cleaned).lower().split())
    # Count in a single pass instead of rescanning the word list per word.
    counts = {}
    for w in listword:
        counts[w] = counts.get(w, 0) + 1
    # Stop words are filtered for every entry, including the first sorted one.
    BoWn = [[w, counts[w]] for w in sorted(counts) if w not in stopword]
    word_count = len(listword)
    file.seek(0)
    return BoWn , word_count

# Build the stop-word-free bag of words and the total word count from the opened file.
BoWn , word_count = BoW(file)
def fhash(w,M):
    """Return the base-37 polynomial hash of w modulo M.

    M may be an int or a numeric string; it is converted with int().
    """
    modulus = int(M)
    total = sum(ord(ch) * 37**pos for pos, ch in enumerate(w))
    return total % modulus
 
# Keep asking until the answer is one of y, Y, n, N.
while True:
    typeinput = input('Use feature hashing ? (y,Y,n,N) ', )
    if typeinput in listtypeinput:
        break
    else:
        print('Try again.')


if typeinput == 'y' or typeinput == 'Y' :
    M = input('M = ', )
    BoWy = BoWn
    # Replace every word by its hash bucket, then merge the counts of words
    # that fall in the same bucket (adjacent after sorting).
    listfhash = sorted([fhash(entry[0], M), entry[1]] for entry in BoWy)
    listfhash2 = []
    for bucket, amount in listfhash:
        if listfhash2 and listfhash2[-1][0] == bucket:
            listfhash2[-1][1] += amount
        else:
            listfhash2.append([bucket, amount])
    BoW = listfhash2
elif typeinput == 'n' or typeinput == 'N' :
    BoW = BoWn

print('-------------------')
print('char count =',char_count)
print('alphanumeric count =',alphanumeric_count) 
print('line count =',line_count)
print('word count =',word_count) 
print('BoW =',BoW)

# The original `file.close` lacked the call parentheses and never closed the file.
file.close()



# 6330303621 (21.90) 151 (2021-03-21 22:13)

# Read the file name and a validated y/Y/n/N answer; M is only asked for hashing.
file_name = input('File name = ',)
a = input("Use feature hashing ? (y,Y,n,N) ",)
while a not in ('n', 'N', 'y', 'Y') :
    print('Try again.')
    a = input("Use feature hashing ? (y,Y,n,N) ",)
if a in ('y', 'Y') :
    b = int(input('M = ',))
print('-------------------')
def fhash(w, M) :
    """Return the base-37 polynomial hash of the string w modulo M."""
    return sum(ord(ch)*37**pos for pos, ch in enumerate(w)) % M
def count_words(w) :
    """Return how many entries of the module-level word list g equal w."""
    return sum(1 for entry in g if entry == w)

# Load the stop words and the whole document.
sw = open('stopwords.txt', 'r')
f = open(file_name, 'r')
ff1 = f.read().strip()
f.close()  # this handle used to leak when `f` was rebound for the line count
ff = ff1.split()
fff = " ".join(ff)
sww = sw.read().strip().split()

# Character count: every character of the stripped text except newlines.
count1 = 0
for line in ff1 :
    if line != '\n' :
        count1 += len(line)
print('char count =', count1)

# Alphanumeric count; the alphanumeric characters of each token form a word.
count2 = 0
z=[]
for line in ff :
    x = [ch for ch in line if ch.isalnum()]
    count2 += len(x)
    # A token without alphanumerics yields no word (it used to re-append the
    # previous word, or raise NameError when it was the first token).
    if x :
        z.append(''.join(x))
print('alphanumeric count =', count2)

count3 = 0
f = open(file_name, 'r')
for line in f :
    count3 += 1
print('line count =', count3)

# Word count: number of maximal alphanumeric runs.  Counting run starts also
# counts a word that ends the text, and no longer skips characters that happen
# to equal the first character.
count4 = 0
for i in range(len(fff)) :
    if fff[i].isalnum() and (i == 0 or not fff[i-1].isalnum()) :
        count4 += 1
print('word count =', count4)

BoW = []
BoW0 = []
BoW1 = []
BoW2 = []
g = " ".join(z).lower().split()
if a == 'y' or a == 'Y' :
    for i in range(len(g)) :
        if g[i] not in sww and g[i] not in BoW0 :
            BoW0.append(g[i])
            BoW1.append([fhash(g[i],b), count_words(g[i])])
    k = sorted(BoW1)
    # Merge the counts of words hashed to the same bucket (adjacent once sorted).
    for entry in k :
        if BoW and BoW[-1][0] == entry[0] :
            BoW[-1][1] += entry[1]
        else :
            BoW.append(entry)
    print('BoW =', BoW)
elif a == 'n' or a == 'N' :
    for i in range(len(g)) :
        if g[i] not in sww :
            BoW2.append([g[i], count_words(g[i])])
    k = sorted(BoW2)
    # Keep one entry per word; comparing with the previous kept entry avoids
    # the k[-1] wrap-around that dropped a bag holding a single word.
    for entry in k :
        if not BoW or BoW[-1][0] != entry[0] :
            BoW.append(entry)
    print('BoW =', BoW)
f.close()
sw.close()
# 6330304221 (30.00) 152 (2021-03-21 17:42)

# [Done] file_name --> Get the words from filename.txt
# [Done] stopwords --> Get stop words from stopwords.txt
# [Done] while feature hashing not in ('y', 'n') --> lowercase
# [Done] if 'y': Use fhash --> M = ?

# Output: 1. [Done] character count 
#         2. [Done] alphanumeric count --> isalnum
#         3. [Done] line count
#         4. [Done] word count
#         5. [Done] Bow --> (Use fhash or not)

# ---------------------------------------------------------------
# Collect every lower-cased stop word from stopwords.txt; test() closes the handle.
stopwords_file = open('stopwords.txt', 'r')
stopwords = [word for line in stopwords_file for word in line.strip().lower().split()]
def test():
    """Run the interactive program: prompt, analyse the file, print the report.

    Lines are stripped before counting; every non-alphanumeric character is
    treated as a word separator and words are lower-cased.
    """
    file_name = input('File name = ') # somename.txt
    hashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
    while hashing not in ('y', 'n'):
        print('Try again.')
        hashing  = input('Use feature hashing ? (y,Y,n,N) ').lower()
    useFeatureHash = hashing == 'y'
    m_value = int(input('M = ')) if useFeatureHash else None
    print('-------------------')
    char_count = alnum_count = line_count = word_count = 0
    text = []
    testfile = open(file_name, 'r')
    for line in testfile:
        stripped = line.strip()
        # 1. character count
        char_count += len(stripped)
        # 2. alphanumeric count
        alnum_count += sum(1 for char in stripped if char.isalnum())
        # 3. line count
        line_count += 1
        spaced = ''.join(char if char.isalnum() else ' ' for char in stripped)
        words = spaced.lower().split()
        # 4. word count
        word_count += len(words)
        text.extend(words)
    stopwords_file.close()
    testfile.close()
    # Output
    print('char count =', char_count)
    print('alphanumeric count =', alnum_count)
    print('line count =', line_count)
    print('word count =', word_count)
    print('BoW =', bag_of_word(text, useFeatureHash, m_value))

# ---------------------------------------------------------------
def unique_of(words):
    """Return the distinct items of words in order of first appearance."""
    unique_words = []
    for word in words:
        if word in unique_words:
            continue
        unique_words.append(word)
    return unique_words
def remove_stopwords(words):
    """Return the items of words that are not in the module-level stopwords list."""
    return [word for word in words if word not in stopwords]
def bag_of_word(text: list, useFeatureHash: bool, m_value):
    """Return [key, count] pairs in order of first appearance, stop words removed.

    With useFeatureHash the key is fhash(word, m_value) of each non-stop word;
    otherwise the key is the word itself.
    """
    if useFeatureHash:
        # Stop words are removed before hashing; from here on text holds buckets.
        text = [fhash(word, m_value) for word in remove_stopwords(text)]
    unique_words = remove_stopwords(unique_of(text))
    return [[unique, text.count(unique)] for unique in unique_words]
def fhash(word, m_value):
    """Return the base-37 polynomial hash of word modulo m_value as an int."""
    fhash_value = sum(ord(char) * 37 ** i for i, char in enumerate(word))
    return int(fhash_value % m_value)

# ---------------------------------------------------------------
# Script entry point: run the interactive program.
test()
# 6330305921 (26.00) 153 (2021-03-21 22:57)
#Prog-08: Bag-of-words
#6330305921 (26.00) Pras Pitasawad
# Ask for the file name, then loop until a valid y/Y/n/N answer is given.
# m records whether feature hashing is used; M is the number of buckets.
file_name = input('File name = ')
k = True
while k :
    x = input('Use feature hashing ? (y,Y,n,N) ')
    if x in ('y', 'Y') :
        m = True
        M = int(input('M = '))
        break
    if x in ('n', 'N') :
        m = False
        break
    print('Try again.')
def char(x) :
    """Return, as a string, the number of characters in file x without line-ending newlines."""
    f = open(x,'r')
    k = sum(len(line) - (1 if line[-1] == '\n' else 0) for line in f)
    f.close()
    return str(k)
def alphacount(x) :
    """Return, as a string, the number of ASCII letters and digits in file x."""
    f = open(x,'r')
    k = 0
    for line in f :
        k += sum(1 for e in line
                 if '0'<= e <= '9' or 'A'<= e <= 'Z' or 'a' <= e <= 'z')
    f.close()
    return str(k)
def linecount(x) :
    """Return, as a string, the number of lines in file x."""
    f = open(x,'r')
    k = sum(1 for line in f)
    f.close()
    return str(k)
def word(x) :
    """Return the words of file x; every non ASCII-alphanumeric character separates words."""
    f = open(x,'r')
    pieces = []
    for line in f :
        for e in line :
            keep = '0'<= e <= '9' or 'A'<= e <= 'Z' or 'a' <= e <= 'z'
            pieces.append(e if keep else ' ')
    f.close()
    return ''.join(pieces).strip().split()
def flash(w,M) :
    """Return the base-37 polynomial hash of w modulo M."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(w))%M
def BoW(words,stop) :
    """Return the sorted bag of words as a list of [key, count] pairs.

    Both argument lists are lower-cased in place and the stop words are
    dropped.  The module-level flag m selects the key: the word itself when
    m is False, otherwise flash(word, M) with the module-level M.

    Parameters:
        words: list of words of the document (modified in place).
        stop: list of stop words (modified in place).

    Returns:
        [key, count] pairs sorted by key; [] when no word remains (the
        original raised IndexError in that case).
    """
    for i in range(len(words)) :
        words[i] = words[i].lower()
    for i in range(len(stop)) :
        stop[i] = stop[i].lower()
    x = [e for e in words if e not in stop]
    if m == False :
        keys = x
    else :
        keys = [flash(e,M) for e in x]
    keys.sort()
    result = []
    for key in keys :
        # Equal keys are adjacent after sorting.
        if result and result[-1][0] == key :
            result[-1][1] += 1
        else :
            result.append([key,1])
    return result

# Print the report: four counts followed by the bag of words.
print('-------------------')
print(f'char count = {char(file_name)}')
print(f'alphanumeric count = {alphacount(file_name)}')
print(f'line count = {linecount(file_name)}')
print(f'word count = {len(word(file_name))}')
print(f"BoW = {BoW(word(file_name),word('stopwords.txt'))}")
# 6330306521 (24.07) 154 (2021-03-22 21:28)
def fhash(a,b) :
    """Return the base-37 polynomial hash of the lower-cased string a modulo int(b)."""
    total = sum(ord(ch)*(37**pos) for pos, ch in enumerate(a.lower()))
    return total%int(b)
def count(a) :
    """Return the words of text a; every non ASCII-alphanumeric character separates words."""
    spaced = ''.join(
        i if ('a'<=i<='z' or 'A'<=i<='Z' or '0'<=i<='9') else ' '
        for i in a
    )
    return spaced.split()
def countalpha(a) :
    """Return the number of ASCII letters and digits in text a."""
    return sum(1 for i in a if 'a'<=i<='z' or 'A'<=i<='Z' or '0'<=i<='9')

file_name=open(input('File name = '))
b=input('Use feature hashing ? (y,Y,n,N) ')
# Compare against the four valid answers; the substring test `in 'yYnN'`
# also accepted '' and 'yY'.
while b not in ('y','Y','n','N') :
    print('Try again.')
    b=input('Use feature hashing ? (y,Y,n,N) ')
if b=='y' or b=='Y' :
    m=input('M = ')
print('-------------------')

# NOTE(review): the sibling solutions read 'stopwords.txt'; confirm that
# 'stopword.txt' is the intended file name.
stop=open('stopword.txt')
stopword=stop.read().split()
stop.close()

# Read the whole document, counting its lines.
aa=''
nuba=0
for a in file_name :
    nuba+=1
    aa+=a
file_name.close()

# Characters exclude newlines whether or not the file ends with one.
print('char count = '+str(len(aa)-aa.count('\n')))
print('alphanumeric count = '+str(countalpha(aa)))
print('line count = '+str(nuba))
print('word count = '+str(len(count(aa))))

# Lower-case the words so both modes count case-insensitively, drop the stop
# words, then optionally replace each word by its hash bucket.
keys=[i.lower() for i in count(aa) if i.lower() not in stopword]
if b=='y' or b=='Y' :
    keys=[fhash(i,m) for i in keys]
keys.sort()
bow=[]
for key in keys :
    # Equal keys are adjacent after sorting; every group is counted in full.
    if bow and bow[-1][0]==key :
        bow[-1][1]+=1
    else :
        bow.append([key,1])
print('BoW =',bow)
# 6330308821 (30.00) 155 (2021-03-20 16:16)

def fhash(w,M):
    """Return the base-37 polynomial hash of w modulo M."""
    return sum(ord(ch)*(37)**i for i, ch in enumerate(w))%M
def cut_stopwords(text):
    """Return the lower-cased words of text that are not listed in stopwords.txt.

    Every character other than a-z and 0-9 (after lower-casing) separates words.
    """
    f = open("stopwords.txt", "r")
    l_of_stopwords = f.read().split()
    f.close()
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    s = "".join(e if e in allowed else " " for e in text.lower())
    return [e for e in s.split() if e not in l_of_stopwords]
def chr_count_lst(l_text):
    """Return the total length of all strings in l_text."""
    return sum(len(e) for e in l_text)
def alp_count(l_text):
    """Return how many characters in the lines of l_text are a-z or 0-9 once lower-cased."""
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    return sum(1 for e in l_text for k in e if k.lower() in allowed)
def words_count(l_text):
    """Return the number of words in the lines of l_text.

    Every character other than a-z and 0-9 (after lower-casing) separates words.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    n = 0
    for e in l_text:
        lowered = (k.lower() for k in e)
        s = "".join(k if k in allowed else " " for k in lowered)
        n += len(s.split())
    return n
def BoW(l_text):
    """Return the sorted [word, count] pairs of the non-stop words in the lines of l_text."""
    l_words = [i for e in l_text for i in cut_stopwords(e)]
    seen = []
    l_bow = []
    for e in l_words:
        if e in seen:
            continue
        seen.append(e)
        l_bow.append([e, l_words.count(e)])
    l_bow.sort()
    return l_bow

def BoW_fhash(l_text,M):
    """Return the sorted [bucket, count] pairs of the hashed non-stop words in l_text.

    Each word is replaced by fhash(word, M) before counting.
    """
    l_words = [fhash(i,M) for e in l_text for i in cut_stopwords(e)]
    seen = []
    l_bow = []
    for e in l_words:
        if e in seen:
            continue
        seen.append(e)
        l_bow.append([e, l_words.count(e)])
    l_bow.sort()
    return l_bow
def main():
    """Prompt for a file and the hashing option, then print the report.

    The report lists the character, alphanumeric, line and word counts and
    the bag of words (hashed into M buckets when hashing is chosen).
    """
    fn = input("File name = ")
    con = input("Use feature hashing ? (y,Y,n,N) ")
    while con not in ["y","Y","n","N"]:
        print("Try again.")
        con = input("Use feature hashing ? (y,Y,n,N) ")
    # The with-block closes the file; it was previously left open.
    with open(fn,"r") as handle:
        f = handle.read().splitlines()

    use_hash = con in ["y","Y"]
    if use_hash:
        M = int(input("M = "))
    print("-------------------")
    print("char count =", chr_count_lst(f))
    print("alphanumeric count =", alp_count(f))
    print("line count =", len(f))
    print("word count =", words_count(f))
    if use_hash:
        print("BoW =",BoW_fhash(f,M))
    else:
        print("BoW =",BoW(f))
# Script entry point: run the interactive program.
main()
# 6330309421 (0.00) 156 (2021-03-22 21:01)

def countchar(txt):
    """Return the number of characters in the file named txt.

    Each line is stripped of surrounding whitespace (newline included)
    before its characters are counted.  The file named by the argument is
    read; the original ignored it and always opened 'sample.txt'.
    """
    with open(txt, 'r') as handle:
        return sum(len(line.strip()) for line in handle)
def countalpnum(txt):
    """Return the number of alphanumeric characters in the file named txt.

    The file named by the argument is read (the original always opened
    'sample.txt') and characters are tested with str.isalnum, so symbols
    such as '-' or '@' that were missing from the old punctuation list are
    no longer counted.
    """
    with open(txt, 'r') as handle:
        return sum(1 for line in handle for ch in line if ch.isalnum())
def linecount(txt):
    """Return the number of lines in the file named txt.

    The file named by the argument is read and closed; the original always
    opened 'sample.txt' and never closed it.
    """
    with open(txt, 'r') as handle:
        return sum(1 for _ in handle)
def wordcount(txt):
    """Return the number of words in the file named txt.

    The listed punctuation characters are deleted and the remaining text is
    split on whitespace.  The file named by the argument is read (the
    original always opened 'sample.txt'), and runs of spaces or an empty
    file no longer produce empty "words".
    """
    not_alpnum = '"\'[]{}(),.;?!:'
    with open(txt, 'r') as handle:
        s = ' '.join(line.strip() for line in handle)
    word = ''.join(x for x in s if x not in not_alpnum)
    return len(word.split())
def fhash(w,M):
    """Return the base-37 polynomial hash of w modulo int(M)."""
    G = 37
    cal = sum(ord(ch) * (G**e) for e, ch in enumerate(w))
    return cal % int(M)
def stopwords(s):
    """Return the contents of the stop-word file s as one space-separated string.

    Each line is stripped and the non-empty lines are joined with single
    spaces.  The file named by the argument is read (the original ignored it
    and opened 'stopwords.txt'), and blank lines no longer cause spaces to be
    inserted inside the following word.
    """
    with open(s, 'r') as handle:
        lines = [line.strip() for line in handle]
    return ' '.join(line for line in lines if line)
def count(a):
    """Return the sorted list of [item, occurrences] pairs for the items of a."""
    distinct = []
    for item in a:
        if item not in distinct:
            distinct.append(item)
    ans = [[item, sum(1 for other in a if other == item)] for item in distinct]
    ans.sort()
    return ans
def word(txt):
    """Return the sorted, lower-cased non-stop words of the file named txt.

    The listed punctuation characters are deleted and the text is split on
    whitespace.  Compared with the original:
      * the file named by the argument is read instead of 'sample.txt';
      * stop words are matched as whole words (the result of stopwords() is
        split) rather than as substrings of one long string;
      * the stop-word file is read once, not once per word;
      * repeated words are kept, so that count() can report how often each
        word occurs.
    """
    not_alpnum = '"\'[]{}(),.;?!:'
    with open(txt, 'r') as handle:
        s = ' '.join(line.strip() for line in handle)
    cleaned = ''.join(x for x in s if x not in not_alpnum)
    stop = set(stopwords('stopwords.txt').split())
    return sorted(w for w in cleaned.lower().split() if w not in stop)

file_name = input('File name = ')

BoW = []
a = []
w = word(file_name)
M = 0

# Ask until a valid answer is given, then build the bag of words.
# NOTE(review): the sibling solutions prompt with 'Use feature hashing ? ...'
# (space before the question mark) -- confirm which text is expected.
while True:
    fhashkey = input('Use feature hashing? (y,Y,n,N) ')
    if fhashkey in ('y', 'Y'):
        M = input('M = ')
        for i in w:
            a.append(fhash(i,M))
        BoW = count(a)
        break
    if fhashkey in ('n', 'N'):
        M = 0
        BoW = count(w)
        break
    print('Try again.')

print('-------------------')
print('char count =',countchar(file_name))
print('alphanumeric count =',countalpnum(file_name))
print('line count =',linecount(file_name))
print('word count =',wordcount(file_name))
print('BoW =',BoW )
# 6330310021 (28.00) 157 (2021-03-22 22:07)
# ---------------------------------------
def stop_word(filename):
    """Return every whitespace-separated word found in the lines of filename.

    filename is an iterable of lines (for example an open file).
    """
    return [e for line in filename for e in line.strip("\n").split()]
# ---------------------------------------
def char_count(filename):
    """Return the total number of characters in the lines of filename, newlines stripped."""
    return sum(len(line.strip("\n")) for line in filename)
# ---------------------------------------
def replace_sym(line):
    """Return line lower-cased, with every listed symbol replaced by a space."""
    symbols = "!+@/#$%^&฿*()_-=/*-\\|,]}[{:;\'\".?><"
    return "".join(" " if e in symbols else e.lower() for e in line)
# ---------------------------------------
def alpha_count(filename):
    """Return (alphanumeric count, word count) for the lines of filename.

    Alphanumerics are the characters a-z and 0-9 after lower-casing; words are
    the whitespace-separated pieces of replace_sym(line).
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    ap_count = 0
    word = 0
    for raw in filename:
        line = raw.strip("\n")
        word += len(replace_sym(line).split())
        ap_count += sum(1 for x in line if x.lower() in allowed)
    return ap_count, word
# ---------------------------------------
def line_count(filename):
    """Return the number of lines produced by iterating filename."""
    return sum(1 for _ in filename)
# ---------------------------------------
def BoW(filename):
    """Return the sorted [word, count] pairs of the non-stop words in filename.

    filename is an iterable of lines; words come from replace_sym and stop
    words are taken from the module-level stop_words list.
    """
    tally = {}
    for raw in filename:
        for e in replace_sym(raw.strip("\n")).split():
            if e not in stop_words:
                tally[e] = tally.get(e, 0) + 1
    return sorted([e, n] for e, n in tally.items())
# ---------------------------------------
def flash(w, M):
    """Return the base-37 polynomial hash of w modulo M."""
    G = 37
    return sum(ord(ch)*(G**i) for i, ch in enumerate(w)) % M
# ---------------------------------------
def BoW_Hashing(filename):
    """Return the sorted [bucket, count] pairs of the hashed non-stop words in filename.

    Each non-stop word is mapped to flash(word, M) with the module-level M and
    the occurrences of words sharing a bucket are added together.
    """
    totals = {}
    for raw in filename:
        for e in replace_sym(raw.strip("\n")).split():
            if e not in stop_words:
                bucket = flash(e, M)
                totals[bucket] = totals.get(bucket, 0) + 1
    return sorted([bucket, n] for bucket, n in totals.items())
# ---------------------------------------
# Prompt for the file and the hashing option (M stays 0 without hashing).
file_name = input("File name = ")
feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
while feature_hashing.lower() not in ("y", "n"):
    print("Try again.")
    feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
M = int(input("M = ")) if feature_hashing.lower() == "y" else 0
print("-------------------")
stop_file = open("stopwords.txt", "r")
stop_words = stop_word(stop_file)
# ---------------------------------------
# Each statistic consumes its own handle on the input file.
file1 = open(file_name, "r")
file2 = open(file_name, "r")
file3 = open(file_name, "r")
file4 = open(file_name, "r")
cc = char_count(file1)
print("char count =", cc)
ap, w = alpha_count(file2)
print("alphanumeric count =", ap)
l = line_count(file3)
print("line count =", l)
print("word count =", w)
if feature_hashing in "Nn":
    B1 = BoW(file4)
    print("BoW =", B1)
elif feature_hashing in "Yy":
    B2 = BoW_Hashing(file4)
    print("BoW =", B2)

for handle in (file1, file2, file3, file4, stop_file):
    handle.close()
# 6330311621 (30.00) 158 (2021-03-18 22:11)
def num_all(fn):
    """Return the number of characters in file fn, newlines excluded.

    The file is closed after reading; the original left it open.
    """
    with open(fn,'r') as handle:
        text = handle.read()
    return len(text) - text.count('\n')
def num_char(fn):
    """Return the number of ASCII letters and digits in file fn.

    The file is closed after reading; the original left it open.
    """
    with open(fn,'r') as handle:
        text = handle.read()
    return sum(1 for i in text
               if 'a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9')
def num_line(fn):
    """Return the number of lines in file fn.

    The file is closed after counting; the original left it open.
    """
    with open(fn,'r') as handle:
        return sum(1 for _ in handle)
            
def num_word(fn):
    """Return the number of words in file fn.

    Every character that is not an ASCII letter or digit separates words.
    The file is closed after reading; the original left it open.
    """
    with open(fn,'r') as handle:
        text = handle.read()
    out = ''.join(
        i if ('a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9') else ' '
        for i in text
    )
    return len(out.split())
def listword(fn):
    """Return the lower-cased words of file fn that are not in stopwords.txt.

    Every character that is not an ASCII letter or digit separates words;
    the words keep their order of appearance and repetitions.  Both files
    are closed after reading; the original left them open.
    """
    with open(fn,'r') as handle:
        text = handle.read()
    out = ''.join(
        i if ('a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9') else ' '
        for i in text
    )
    with open('stopwords.txt','r') as sw:
        t = sw.read().split()
    return [i for i in out.lower().split() if i not in t]
def BoW(fn):
    """Return [word, count] pairs for the words of listword(fn), in order of first appearance."""
    word = listword(fn)
    t_word = []
    for i in word:
        if i in t_word:
            continue
        t_word.append(i)
    return [[i, word.count(i)] for i in t_word]
        
def fhash(w,M):
    """Return the base-37 polynomial hash of w modulo int(M)."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(w))%int(M)
    
def BoW_fhash(fn,M):
    """Return [bucket, count] pairs for the hashed words of listword(fn).

    Each word is replaced by fhash(word, M); pairs are listed in order of
    first appearance of the bucket.
    """
    word = [fhash(i,M) for i in listword(fn)]
    t_word = []
    for i in word:
        if i in t_word:
            continue
        t_word.append(i)
    return [[i, word.count(i)] for i in t_word]

file_name = input('File name = ')
choose = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against the four valid answers: the substring test `in 'nNyY'`
# also accepted '' (then treated as "yes") and 'Ny'.
while choose not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    choose = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = choose in ('y', 'Y')
if use_hash:
    M = input('M = ')
print('-------------------')
print('char count =',num_all(file_name))
print('alphanumeric count =',num_char(file_name))
print('line count =',num_line(file_name))
print('word count =',num_word(file_name))
if use_hash:
    print('BoW =',BoW_fhash(file_name,M))
else:
    print('BoW =',BoW(file_name))
    
    
# 6330312221 (22.80) 159 (2021-03-21 16:41)
def fhash(w,M):
    """Return the base-37 polynomial hash of w modulo M."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(w))%M
def remove_punc(t):
    """Return t with every listed punctuation character replaced by a space."""
    punctuation = "\"\'/\\().,;:[]-<>?!%&*_+@#^$"
    return ''.join(' ' if e in punctuation else e for e in t)
def count_alpha(t):
    """Return the number of characters in t that are neither a space nor a newline."""
    return sum(1 for e in t if e not in (' ', '\n'))
    
def count_words(t):
    """Return the number of items in t."""
    return len(t)
def find_bow(c):
    """Return [word, count] pairs for each run of equal consecutive items in c.

    Callers pass a sorted list, so each distinct word forms exactly one run.
    An empty list gives [] and a single-item list gives one pair; the
    original raised IndexError and UnboundLocalError in those cases.
    """
    ans = []
    for word in c:
        if ans and ans[-1][0] == word:
            ans[-1][1] += 1
        else:
            ans.append([word, 1])
    return ans
def find_hash_bow(c,M):
    """Return the sorted [bucket, count] pairs of the words in c hashed with fhash(word, M).

    An empty list gives [] and a single word gives one pair; the original
    raised IndexError and UnboundLocalError in those cases.
    """
    ans1 = sorted(fhash(i,M) for i in c)
    ans = []
    for bucket in ans1:
        # Equal buckets are adjacent after sorting.
        if ans and ans[-1][0] == bucket:
            ans[-1][1] += 1
        else:
            ans.append([bucket, 1])
    return ans
    
st = ''
stop_words = ''
c = 0
n = ''
bow = []
st_word = []
no_stop_word = []
file_name = input('File name = ')
h = input('Use feature hashing ? (y,Y,n,N) ')
fn = open(file_name, 'r')
# Load the lower-cased stop words.
fn2 = open('stopwords.txt','r')
stop_words = fn2.read()
fn2.close()
stop_words = stop_words.lower().split()
while h not in ['y','Y','n','N']:
    print('Try again.')
    h = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = h in ['y','Y']
if use_hash:
    M = int(input('M = '))
print('-------------------')
# First pass: gather the text and count the lines.
for line in fn:
    st += line
    c += 1
fn.close()
st = remove_punc(st).lower()
st_word = st.split()
st_word.sort()
no_stop_word = [e for e in st_word if e not in stop_words]
# Second pass: the text without line-ending newlines gives the character count.
fn = open(file_name, 'r')
for line in fn:
    n += line[0:-1] if line[-1] == '\n' else line
fn.close()
char = len(n)
print('char count = '+str(char))
print('alphanumeric count = '+str(count_alpha(st)))
print('line count = '+str(c))
print('word count = '+str(count_words(st_word)))
if use_hash:
    bow = find_hash_bow(no_stop_word,M)
else:
    bow = find_bow(no_stop_word)
print('BoW =',bow)
# 6330313921 (28.40) 160 (2021-03-22 21:05)


def char_count(name1) :
    """Return the number of characters in file name1, each line stripped of surrounding whitespace."""
    file = open(name1 ,'r')
    ch_count = sum(len(line.strip()) for line in file)
    file.close()
    return ch_count
def alphanumeric_count(name2) :
    """Return the number of characters of file name2 that are a-z or 0-9 once lower-cased.

    The file is closed after reading (the original left it open) and the
    text is scanned once instead of being copied through three strings.
    """
    ex = 'qwertyuiopasdfghjklzxcvbnm0123456789'
    with open(name2 ,'r') as file :
        return sum(1 for line in file for ch in line.lower() if ch in ex)
def line_count(name3) :
    """Return the number of lines in file name3."""
    file = open(name3 ,'r')
    line = sum(1 for _ in file)
    file.close()
    return line
def word_count(name4) :
    """Return the number of whitespace-separated words in file name4.

    The file is closed after reading; the original left it open.
    """
    with open(name4 ,'r') as file :
        return len(file.read().split())
#------------------------------------------------------
def alphanumeric2(names) :
    """Return the lower-cased words of file names, each followed by one space.

    A word is a maximal run of characters whose lower-case form is a-z or
    0-9; every other character separates words.  The file is closed after
    reading; the original left it open.
    """
    alpNum = 'qwertyuiopasdfghjklzxcvbnm0123456789'
    with open(names,'r') as file :
        lines = [i.strip() for i in file]
    an = []
    for line in lines :
        word = ""
        for ch in line :
            low = ch.lower()
            if low in alpNum :
                word += low
            else :
                an.append(word)
                word = ""
        an.append(word)
    return "".join(w + " " for w in an if w != "")

def stop_word(sw) :
    """Return the stripped lines of the stop-word file sw, each followed by one space.

    The file named by the argument is read; the original ignored it and
    always opened 'stopwords.txt' (which is what the callers pass).
    """
    with open(sw,'r') as stopw :
        return ''.join(line.strip() + ' ' for line in stopw)

#------------------------------------------------------
def fhash(w,M) :
    """Feature hash of the word *w*: sum(ord(c_i) * 37**i) % int(M)."""
    G = 37
    total = sum(ord(ch) * (G ** pos) for pos, ch in enumerate(w))
    # M may arrive as text, hence the int() conversion.
    return total % int(M)
#------------------------------------------------------
def bownofhash(name) :
    """Build the bag-of-words of a file without feature hashing.

    The tokens produced by alphanumeric2() are filtered against the stop
    words returned by stop_word() and then counted.

    name -- path of the text file.
    Returns a list of [word, count] pairs sorted by word.
    """
    stop_set = set(stop_word('stopwords.txt').split())
    # One pass with a dict replaces the original list.count() per token;
    # the extra, never-closed open(name) handle is gone as well.
    counts = {}
    for word in alphanumeric2(name).split():
        if word not in stop_set:
            counts[word] = counts.get(word, 0) + 1
    return sorted([word, times] for word, times in counts.items())
#------------------------------------------------------
def bownusefhash(name) :
    """Build the bag-of-words of a file using feature hashing.

    Tokens from alphanumeric2() are filtered against the stop words and
    replaced by fhash(word, M); M is the module-level value read from the
    user.

    name -- path of the text file.
    Returns a list of [hash, count] pairs sorted by hash value.
    """
    stop_set = set(stop_word('stopwords.txt').split())
    # Count per hash bucket in one pass instead of list.count() per item;
    # the unused, never-closed open(name) handle has been dropped.
    counts = {}
    for word in alphanumeric2(name).lower().split():
        if word not in stop_set:
            bucket = fhash(word, M)
            counts[bucket] = counts.get(bucket, 0) + 1
    return [[bucket, counts[bucket]] for bucket in sorted(counts)]
#------------------------------------------------------

# Interactive entry point: ask for the file and the hashing option, then
# print the statistics and the bag-of-words.
name = input('File name = ')

# NOTE(review): this handle and the first line read from it are not used
# again in this block and the handle is never closed.
filename = open(name ,'r')
file_read = filename.readline()
isHash = False

# Repeat the question until one of y / Y / n / N is entered.
while True:
    ans = input('Use feature hashing ? (y,Y,n,N) ')
    if ans not in ['n' , 'N' , 'y' , 'Y' ]:
        print("Try again.")
    else:
        if ans in ['Y','y']:
            isHash = True
            # M is read as a module-level name by bownusefhash().
            M = int(input('M = '))
        else:
            pass
        break

print('-------------------')
# The trailing space in each label plus print()'s own separator yields two
# spaces before the value.
print('char count = ' , char_count(name))
print('alphanumeric count = ' , alphanumeric_count(name))
print('line count = ' , line_count(name))
print('word count = ' , word_count(name))

if not isHash :
    print('BoW = ' , bownofhash(name))
else:
    print('BoW = ' , bownusefhash(name))
# 6330314521 (19.00) 161 (2021-03-21 22:47)


# Ask for the input file and whether the bag-of-words keys are hashed.
file_name = input('File name = ')

# One loop replaces the duplicated "first question / retry loop" pair.  The
# original retry loop also contained `end == False`, a comparison with no
# effect; `end` is now really reset once a valid answer arrives.
end = True
while end:
    feat = input('Use feature hashing ? (y,Y,n,N) ')
    if feat == 'y' or feat == 'Y':
        M = input('M = ')          # kept as text; fhash() applies int()
        fhashbow = True
        end = False
    elif feat == 'N' or feat == 'n':
        fhashbow = False
        end = False
    else:
        print('Try again.')

# Module-level handle and first line of the stop-word file; `stopw` is
# passed to stopwords() by bow()/fhash_bow() and `stop_w` is closed at the
# end of the script.
stop_w = open('stopwords.txt','r')
stopw = stop_w.readline()

# NOTE(review): the separator is printed only when hashing is enabled --
# confirm this is intended.
if feat == 'y' or feat == 'Y':
    print('-------------------')
else:
    skip = True

# Module-level handle and first line of the input file, handed to the
# functions called by the output section below.
file = open(file_name ,'r')
filed = file.readline()

#=================line count================
def line_count(fileds):
    """Return the number of lines in the file named by the global file_name.

    The argument *fileds* is accepted but not used.
    """
    with open(file_name ,'r') as handle:
        return sum(1 for _ in handle)

#=================word count================
def count_word(fileds):
    """Count the words in the file named by the global file_name.

    The punctuation marks in `nope` are treated as word separators, just
    like whitespace.  The argument *fileds* is accepted but not used.

    Returns the number of words as an int.
    """
    nope = '"\'[]{}(),.;'
    total = 0
    with open(file_name ,'r') as file:
        # Counting line by line keeps the last word of one line apart from
        # the first word of the next; the original concatenated the
        # stripped lines and therefore fused them.  The very first
        # character is now cleaned too (the original skipped index 0).
        for row in file:
            for mark in nope:
                row = row.replace(mark, ' ')
            total += len(row.split())
    return total
        
#=================char count================
def count_char(fileds):
    """Return the number of characters in the global file_name's file.

    Each line is stripped of surrounding whitespace before it is measured.
    The argument *fileds* is accepted but not used.
    """
    with open(file_name ,'r') as handle:
        return sum(len(row.strip()) for row in handle)

#==============alphnum=====================
def alphnum(fileds):
    """Return the text of the global file_name's file prepared for splitting.

    Lines are stripped and joined with single spaces, the punctuation marks
    in `nope` become spaces, and the result is lower-cased.  The argument
    *fileds* is accepted but not used.
    """
    nope = '"\'[]{}(),.;?!'
    with open(file_name ,'r') as file:
        # Joining with a space keeps words on neighbouring lines apart; the
        # original concatenated the stripped lines directly.
        mix = ' '.join(row.strip() for row in file)
    for mark in nope:
        mix = mix.replace(mark, ' ')
    # The original called mix.lower() and discarded the result.
    return mix.lower()

#=================alphanumeric count===========
def count_alphnum(fileds):
    """Count the ASCII letters and digits in the global file_name's file.

    The text is lower-cased before the check, so upper-case letters count
    as well.  The argument *fileds* is accepted but not used.
    """
    with open(file_name ,'r') as handle:
        mix = ''.join(row.strip() for row in handle)
    # Punctuation and blanks never satisfy the test below, so they need no
    # separate treatment.
    return sum(1 for t in mix.lower()
               if 'a' <= t <= 'z' or '0' <= t <= '9')
        
#===================fhash===============
def fhash(w,M):
    """Return (sum of ord(c_i) * 37**i over the characters of w) % int(M)."""
    G = 37
    cal = 0
    for z, ch in enumerate(w):
        cal += ord(ch) * G**z
    return cal % int(M)

#================stop words=====================
def stopwords(stopws):
    """Return every line of 'stopwords.txt', stripped, each followed by ' '.

    The argument *stopws* is accepted but not used.
    """
    pieces = []
    with open('stopwords.txt','r') as stop_w:
        for row in stop_w:
            pieces.append(row.strip() + ' ')
    return ''.join(pieces)

#====================bow===================== 
def bow(filed):
    """Build the bag-of-words of the input file without hashing.

    The words come from alphnum() and are filtered against the stop words
    returned by stopwords(); the module-level `stopw` is passed through as
    in the original call.  The argument *filed* is only forwarded to
    alphnum().

    Returns a list of [word, count] pairs in order of first appearance.
    """
    stpw = set(stopwords(stopw).split())
    # A dict keeps first-appearance order and counts in one pass, replacing
    # list.count() per word and the side-effect list comprehension.  The
    # extra, unused handle on the input file is gone.
    counts = {}
    for word in alphnum(filed).lower().split():
        if word not in stpw:
            counts[word] = counts.get(word, 0) + 1
    return [[word, times] for word, times in counts.items()]
    
#=====================fhash bow================
def fhash_bow(filed):
    """Build the feature-hashed bag-of-words of the input file.

    The words come from alphnum(), are filtered against stopwords() and are
    replaced by fhash(word, M), where M is the module-level value read from
    the user.  The argument *filed* is only forwarded to alphnum().

    Returns a list of [hash, count] pairs sorted by hash value.
    """
    stpw = set(stopwords(stopw).split())
    # Single-pass counting per bucket; the unused per-word frequency list
    # and the unused file handle of the original are dropped.
    counts = {}
    for word in alphnum(filed).lower().split():
        if word not in stpw:
            bucket = fhash(word, M)
            counts[bucket] = counts.get(bucket, 0) + 1
    return [[bucket, counts[bucket]] for bucket in sorted(counts)]

#================================================================    
#========================output process==========================

# if end == False:
# Print the four statistics, then the bag-of-words in the chosen flavour.
if fhashbow == True or fhashbow == False:
    for label, counter in (('char count =', count_char),
                           ('alphanumeric count =', count_alphnum),
                           ('line count =', line_count),
                           ('word count =', count_word)):
        print(label, counter(filed))
    if fhashbow == True:
        print('BoW =', fhash_bow(file))
    else:
        print('BoW =', bow(filed))

# Release the module-level handles opened by the prompt section.
stop_w.close()
file.close()


# 6330315121 (28.20) 162 (2021-03-22 18:14)
def fhash(w,M):
    """Feature hash: sum of ord(w[i]) * 37**i over the word, modulo M."""
    G = 37
    return sum(ord(ch) * (G**i) for i, ch in enumerate(w)) % M
def clear_shid(t):
    """Return *t* with every character that is not A-Z, a-z or 0-9 removed."""
    return "".join(
        e for e in t
        if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9"
    )
def clear2(t):
    """Return *t* with every non-alphanumeric character turned into a space."""
    def keep(e):
        return "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9"
    return "".join(e if keep(e) else " " for e in t)
def BoW1(t):
    """Drop stop words (and non-alphanumeric tokens) from a list of words.

    t -- iterable of word strings.
    The module-level list `x` holds the stop words.

    Returns the list of kept words in their original order.
    """
    kept = []
    for e in t:
        if e in x:
            continue
        # Only the first character is inspected.  The original compared the
        # whole word with "z" / "Z" / "9", so any longer word starting with
        # one of those (e.g. "zoo") compared greater and was lost.
        head = e[:1]
        if "a" <= head <= "z" or "A" <= head <= "Z" or "0" <= head <= "9":
            kept.extend(e.split())
    return kept
def n_BoW(t):
    """Return [item, occurrences] for each distinct item of *t*.

    The pairs follow the order in which the items first appear.
    """
    distinct = []
    for token in t:
        if token not in distinct:
            distinct.append(token)
    return [[token, t.count(token)] for token in distinct]
def fh_BoW(t):
    """Return [hash, occurrences] for each distinct fhash(word, M) of *t*.

    M is the module-level value; the pairs follow the order in which the
    hash values first appear.
    """
    hashed = [fhash(word, M) for word in t]
    distinct = []
    for value in hashed:
        if value not in distinct:
            distinct.append(value)
    return [[value, hashed.count(value)] for value in distinct]
        
# Load the stop words into the module-level list `x`, which BoW1() reads.
x = []
stop_w = open("stopwords.txt", "r")
for line in stop_w:
    y = line.split()
    for i in range(len(y)):
        x.append(y[i])
stop_w.close()   
    
file_name = input("File name = ")
# Keep asking until the answer is one of y / Y / n / N.
choose = input("Use feature hashing ? (y,Y,n,N) ")
while choose not in ["y", "Y", "n", "N"]:
    print("Try again.")
    choose = input("Use feature hashing ? (y,Y,n,N) ")   
if choose in ["y", "Y"]:
    # M is read as a module-level name by fh_BoW().
    M = int(input("M = "))
    fn1 = open(file_name, "r")
    a = ""
    for line in fn1:
        # clear2() turns the newline into a space, so lines stay separated.
        a += clear2(line)
    t1 = a.lower().split()
    a_bow1 = BoW1(t1)
    final_bow = fh_BoW(a_bow1)
    fn1.close()
elif choose in ["n", "N"]:
    # Same preparation as above, counted by word instead of by hash.
    fn2 = open(file_name, "r")
    b = ""
    for line in fn2:
        b += clear2(line)
    t2 = b.lower().split()
    b_bow1 = BoW1(t2)
    final_bow = n_BoW(b_bow1)
    fn2.close()
    
# Second pass over the file for the four statistics.
char_c = 0
line_c = 0
word_c = ""
apnum_c = ""
f_n = open(file_name, "r")
for line in f_n:
    apnum_c += clear_shid(line)
    line_c += 1
    word_c += clear2(line)
    for e in line:
        # Newlines are excluded from the character count.
        if e!= "\n":
            char_c +=1
f_n.close()
print("-------------------")
print("char count =",char_c)
print("alphanumeric count =",len(apnum_c))
print("line count =",line_c)
print("word count =",len(word_c.split()))
print("BoW =",final_bow)



# 6330316821 (28.00) 163 (2021-03-22 21:39)

def new_string(x):
    """Lower-case *x* and replace the listed punctuation marks by spaces."""
    marks = "\"\'/\\,.:;()[]{}+-*=_&^%#@!|$><?"
    return "".join(" " if e in marks else e.lower() for e in x)
#------------------------------------
def count_word(words, w):
    """Return how many items of *words* are equal to *w*."""
    return sum(1 for e in words if e == w)
#------------------------------------
def Bow(string):
    """Return sorted [word, count] pairs for the words of *string*.

    Words found in the module-level `stop_words` are left out; counting is
    delegated to count_word().
    """
    list_Bow = sorted(string.split())
    pairs = []
    seen = []
    for e in list_Bow:
        if e in seen or e in stop_words:
            continue
        pairs.append([e, count_word(list_Bow, e)])
        seen.append(e)
    return pairs
#------------------------------------
def fhash(w,M):
    """Feature hash: sum of ord(w[i]) * 37**i over the word, modulo M."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch)*(37**i)
    return total % M
#------------------------------------

# Read the file name and the hashing choice; repeat the question until the
# answer is y or n in either case.
file_name = input("File name = ")
feature = input('Use feature hashing ? (y,Y,n,N) ')
while feature.lower() != 'y' and feature.lower() != 'n':
    print('Try again.')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
if feature.lower() == 'y':
    # M stays a string here; it is converted with int() where it is used.
    M = input("M = ")
print('-------------------')

# Module-level stop-word list, read by Bow().
stop = open('stopwords.txt', 'r')
stop_words = []
for line in stop:
    x = line.split()
    for e in x :
        stop_words.append(e)

fn = open(file_name)
word = []
char_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
string = ''
Bow_1 = []
Bow_2 = []
test_Bow = []

# Single pass: accumulate the cleaned text and the counters.
for line in fn:
    x = new_string(line).split()
    y= new_string(line)
    string += " " + y
    line_count += 1
    for e in y:
        if e in 'abcdefghijklmnopqrstuvwxyz0123456789':
            alphanumeric_count += 1
        # The newline is the only character left out of the char count.
        if e != '\n':
            char_count += 1
    for e in x:
        word.append(e)
Bow_1 = Bow(string)

# Report the statistics gathered above.
print('char count = ' + str(char_count))
print('alphanumeric count = ' + str(alphanumeric_count))
print('line count = ' + str(line_count))
print('word count = ' + str(len(word)))
if feature.lower() == 'n':
    print('BoW =', Bow_1)
if feature.lower() == 'y':
    # Replace each word by its hash, then merge the counts of words that
    # fall into the same bucket.
    for i in range(len(Bow_1)):
        Bow_1[i][0] = fhash(Bow_1[i][0], int(M))
    Bow_1.sort()
    for e in Bow_1:
        if e[0] not in test_Bow:
            test_Bow.append(e[0])
            Bow_2.append([e[0], e[1]])
        else:
            Bow_2[test_Bow.index(e[0])][1] += e[1]
    # The label used to be 'BoW = ', which printed two spaces before the
    # list; it now matches the non-hashing branch.
    print('BoW =', Bow_2)

fn.close()
stop.close()
# 6330317421 (21.85) 164 (2021-03-22 17:11)
# Read the whole input file and the stop-word list up front.
name = input('File name = ').strip()
file_name = open(name,'r')
# NOTE(review): no close() for this handle is visible in this script.
stop_words = open('stopwords.txt','r')
read_fn = file_name.read()
read_fn = read_fn.lower()
read_fn2 = read_fn.lower().split()
read_sw = stop_words.read().split()  
fn_sw = []
# Count the non-empty lines only: empty strings produced by the split are
# skipped.
line = 0
co = read_fn.split("\n") 
for i in co: 
    if i: 
        line += 1  
def ac(read):
    """Return only the lower-case letters and digits found in *read*."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return ''.join(j for j in read if j in allowed)
# Keep the words that are not stop words, reduced by ac() to their
# lower-case letters and digits.  The stop-word test is made on the raw
# token, before that clean-up.
for i in read_fn2:
    if i not in read_sw:
        k = ac(i)
        fn_sw += [k]
    else: i = ''
def dup(z):
    """Return the items of *z* without repeats, in first-seen order."""
    unique = []
    for item in z:
        if item in unique:
            continue
        unique.append(item)
    return unique
# Distinct cleaned words, in order of first appearance.
WoW = dup(fn_sw)
def fhash(w,M):
    """Feature hash: sum of ord(w[i]) * 37**i over the word, modulo M."""
    return sum(ord(ch)*37**(i) for i, ch in enumerate(w)) % M
def char(name):
    """Return the number of characters in the file, newlines excluded.

    name -- path of the file to read.
    """
    # Iterating over read() yields single characters (not lines), so the
    # count is simply every character that is not a newline.  `with` closes
    # the handle, which the original left open.
    with open(name,'r') as file_name:
        return sum(1 for symbol in file_name.read() if symbol != '\n')
def show(B):
    """Print the four statistics and the bag-of-words *B* without repeats.

    Reads the module-level `name`, `read_fn`, `line` and `read_fn2`.
    """
    total_chars = char(name)
    print('char count =', total_chars)
    alnum_total = len(ac(read_fn))
    print('alphanumeric count =', alnum_total)
    print('line count =', line)
    word_total = len(read_fn2)
    print('word count =', word_total)
    bag = dup(B)
    print('BoW =', bag)
# Ask whether to hash; repeat until the answer is one of y / Y / n / N.
x = input('Use feature hashing ? (y,Y,n,N) ')
while x not in ['y','Y','n','N']:
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
else:
    # The loop has no break, so this else-branch always runs once a valid
    # answer has been given.
    if x == 'y' or x == 'Y':
        M = int(input('M = '))
        print('-------------------')
        BoWf = []
        for i in WoW:
            # Frequency of the word paired with its hash; pairs that are
            # completely identical are merged later by dup() inside show().
            f = fn_sw.count(i)
            BoWf += [[fhash(i,M),f]]
        show(BoWf)
    elif x == 'n' or x == 'N':
        print('-------------------')
        BoW = []
        for i in WoW:
            i = str(i)
            i = i.strip('[]')
            f = fn_sw.count(i)
            BoW += [[i,f]]
        show(BoW)
file_name.close()

# 6330318021 (20.00) 165 (2021-03-22 09:40)

def use_fhash(w,M) :
    """Feature hash: sum of ord(w[i]) * 37**i over the word, modulo M."""
    G = 37
    sum_bow = 0
    for position, letter in enumerate(w):
        sum_bow += ord(letter) * (G ** position)
    return sum_bow % M

# Read the file name and the hashing choice; loop until the answer is valid.
file_name=input('File name = ')
while True :
    hash_or_not=input('Use feature hashing ? (y,Y,n,N) ')
    if hash_or_not=='Y' or hash_or_not=='y' :
        # M stays a string here; it is converted with int() where it is used.
        M=input('M = ')
        fhash='Y'
        break
    elif hash_or_not=='n' or hash_or_not=='N' :
        fhash='N'
        break
    # This condition holds for every remaining answer.
    elif hash_or_not!='n' or hash_or_not!='N' :
        print('Try again.')
    
# Collect the stop words.
stop_word=open('stopwords.txt','r')
list_of_stopwords=[]
for line in stop_word :
    # NOTE(review): `line.strip` is not called (no parentheses), so this
    # statement has no effect; split() below discards the whitespace anyway.
    line.strip
    x=line.split()
    list_of_stopwords+=[e for e in x]
    
# One pass over the input file for the statistics and the word list.
readed_file=open(file_name,'r')
character_count=0
eng_num_count=0
word_count=0
line_count=0
words=[]
for line in readed_file :
    list_of_char=[e.lower() for e in line]
    line_count+=1
    # Remove every newline character from the per-line character list.
    while True :
        if '\n' in list_of_char :
            list_of_char.remove('\n')
        else :
            break
    for e in list_of_char :
        character_count+=len(e)
        for i in range(len(e)) :
            if 'a'<=e[i]<='z' or '0'<=e[i]<='9' :
                eng_num_count+=1
    # Everything that is not a-z or 0-9 becomes a word separator.
    for i in range(len(list_of_char)) :
        if ord('a')<=ord(list_of_char[i])<=ord('z') or ord('0')<=ord(list_of_char[i])<=ord('9') :
            pass
        else :
            list_of_char[i]=' '
    word=('').join(list_of_char).split()
    word_count+=len(word)
    words+=word
print('-'*19)
print('char count = '+str(character_count))
print('alphanumeric count = '+str(eng_num_count))
print('line count = '+str(line_count))
print('word count = '+str(word_count))

# Blank out the stop words, then drop the blanks.
for i in range(len(words)) :
    if words[i] in list_of_stopwords :
        words[i]=''
while True :
    if '' in words :
        words.remove('')
    else :
        break

if fhash=='N' :
    list_normbow=[]
    # While counting a word, all of its occurrences are overwritten with ''
    # inside `words`; later iterations that land on such a slot produce a
    # ['', n] row, which the filter below removes.
    for e in words :
        bow=[e,'']
        count_word=0
        for i in range(len(words)) :
            if words[i]==e :
                count_word+=1
                words[i]=''
        bow[1]=count_word
        list_normbow.append(bow)
    display_list=[]
    for i in range(len(list_normbow)) :
        if list_normbow[i][0]!='' :
            display_list.append(list_normbow[i])
    # NOTE(review): the list is printed without any 'BoW =' label.
    print(display_list)

if fhash=='Y' :
    list_fhash=[]
    fhash_list=[use_fhash(e,int(M)) for e in words]
    # Same blank-out counting scheme as above, applied to the hash values.
    for e in fhash_list :
        bow=[e,'']
        count_word=0
        for i in range(len(fhash_list)) :
            if fhash_list[i]==e :
                count_word+=1
                fhash_list[i]=''
        bow[1]=count_word
        list_fhash.append(bow)
    display_list=[]
    for i in range(len(list_fhash)) :
        if list_fhash[i][0]!='' :
            display_list.append(list_fhash[i])        
    # NOTE(review): the list is printed without any 'BoW =' label.
    print(display_list)
    

readed_file.close()
stop_word.close()
# 6330319721 (26.60) 166 (2021-03-22 17:04)
def fhash(w,M):
    """Return the feature hash of the word *w*.

    fhash(w, M) = (ord(c0) + ord(c1)*G + ord(c2)*G**2 + ...) % M with G = 37.

    w -- the word to hash.
    M -- the modulus (number of buckets).
    """
    G = 37
    frac = 0
    # Each character is weighted by G raised to its own position.  The
    # original used G**(len(w)-1) for every character, which makes all
    # permutations of the same letters collide.
    for position, letter in enumerate(w):
        frac += ord(letter)*(G**position)
    return frac % M
#=============================================================
def stopwords():
    """Return the entries of 'stopwords.txt' as a list.

    Lines consisting of a newline only are skipped; every other line loses
    its newline and is split on single spaces.
    """
    entries = []
    with open('stopwords.txt') as stop:
        for row in stop:
            if row == "\n":
                continue
            entries.extend(row.strip('\n').split(' '))
    return entries
#=============================================================
def text(file):
    """Return the lower-cased content of *file* as a single line.

    Lines consisting of a newline only are skipped; every other line loses
    its newline and is followed by one space.
    """
    with open(file) as handle:
        return ''.join(
            row.lower().strip('\n') + ' '
            for row in handle
            if row != "\n"
        )
#=============================================================
def char(file):
    """Return the number of characters in *file*, newlines excluded.

    Each line is lower-cased and has its newlines stripped before it is
    measured.
    """
    total = 0
    with open(file) as handle:
        for row in handle:
            # A stripped line can never equal "\n", so every line counts.
            if row.strip() != "\n":
                total += len(row.lower().strip('\n'))
    return total
#=============================================================
def alphanum(cn):
    """Return *cn* with every non-alphanumeric character replaced by ' '."""
    def keep(i):
        code = ord(i)
        return 48 <= code <= 57 or 97 <= code <= 122 or 65 <= code <= 90
    # A blank maps to a blank either way, so it needs no special case.
    return ''.join(i if keep(i) else ' ' for i in cn)
#=============================================================
def line(file_name):
    """Return the number of lines in the file.

    Every line counts, blank ones included.  The original stripped newlines
    from both ends of the whole text before splitting, so leading and
    trailing blank lines were not counted and an empty file reported 1.

    file_name -- path of the file to read.
    """
    with open(file_name) as file:
        return sum(1 for _ in file)
#=============================================================
def BoW(file_name):
    """Count the words of a text.

    file_name -- despite its name, the text itself (a string), which is
                 split on whitespace.

    Returns a list of [word, count] pairs in order of first appearance.
    """
    # A dict keeps insertion order and counts in one pass, replacing the
    # original nested loop over all words for every word.
    counts = {}
    for word in file_name.split():
        counts[word] = counts.get(word, 0) + 1
    return [[word, times] for word, times in counts.items()]
#=============================================================
def BoWfhash(w,m):
    """Count the words of a text by their feature hash.

    w -- the text (a string), split on whitespace.
    m -- the modulus handed to fhash().

    Returns a list of [hash, count] pairs in order of first appearance.
    """
    # Single pass with an insertion-ordered dict instead of comparing every
    # hash with every other one.
    counts = {}
    for word in w.split():
        feh = fhash(word, m)
        counts[feh] = counts.get(feh, 0) + 1
    return [[feh, times] for feh, times in counts.items()]
#=============================================================
# Interactive entry point: read the options, prepare the text, print the
# statistics and the bag-of-words.
file_name = input('File name = ')
# The first prompt used to start with a lower-case 'u', unlike the retry
# prompt below.
yn = input('Use feature hashing ? (y,Y,n,N) ')
do = 0
b = stopwords()
a = text(file_name)
cn1 = alphanum(a)                      # punctuation replaced by blanks
cn2 = ''.join(cn1.split())             # alphanumeric characters only
cut = ' '.join([i for i in cn1.split() if i not in b])
# Repeat the question until the answer is valid (the original loop
# condition was always true and relied on break statements).
while yn not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    yn = input('Use feature hashing ? (y,Y,n,N) ')
if yn in ('y', 'Y'):
    do = 1
    m = input('M = ')
print('-------------------')
print('char count =', char(file_name))
print('alphanumeric count =', len(cn2))
print('line count =', line(file_name))
# Both branches now count words on the cleaned text; the hashing branch
# used the raw text and could report a different number for the same file.
print('word count =', len(cn1.split()))
if do == 1:
    print('BoW =', BoWfhash(cut,int(m)))
else:
    print('BoW =', BoW(cut))
# 6330320221 (24.90) 167 (2021-03-21 02:13)
def char_count(file_name):
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name,'r') as file:
        return sum(1 for row in file for e in row if e != '\n')
def alphanumeric_count(file_name):
    """Return the number of letters (either case) and digits in the file."""
    total = 0
    with open(file_name,'r') as file:
        for row in file:
            total += sum(
                1 for e in row
                if '0'<=e<='9' or 'a'<=e.lower()<='z'
            )
    return total
def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name,'r') as file:
        return len([None for _ in file])
def word_count(file_name):
    """Return the number of words in the file.

    Only letters, digits and blanks are kept from each line; every other
    character is removed (not replaced), and a blank is added after each
    line before the text is split.
    """
    kept_rows = []
    with open(file_name,'r') as file:
        for row in file:
            kept = ''.join(
                e for e in row
                if 'a'<=e.lower()<='z' or '0'<=e<='9' or e==' '
            )
            kept_rows.append(kept + ' ')
    return len(''.join(kept_rows).split())
def fhash(w,M):
    """Feature hash: sum of ord(w[i]) * 37**i over the word, modulo M."""
    weighted = [ord(ch)*37**i for i, ch in enumerate(w)]
    total = 0
    for term in weighted:
        total += term
    return total % M
def BoW(file_name):
    """Build the bag-of-words of a file.

    Each line keeps only its letters, digits and blanks (lower-cased); the
    resulting words are filtered against the module-level `stopwords`.
    When the module-level `command` is True the words are replaced by
    fhash(word, M), with M also taken from module level.

    file_name -- path of the text file.
    Returns a sorted list of [key, count] pairs, where key is the word or
    its hash value.
    """
    counts = {}
    # `with` closes the file on every path; counting happens in one pass
    # instead of re-scanning the word list for every distinct key.
    with open(file_name,'r') as file:
        for row in file:
            kept = ''.join(
                e.lower() for e in row
                if 'a'<=e.lower()<='z' or '0'<=e<='9' or e==' '
            )
            for word in kept.split():
                if word in stopwords:
                    continue
                # M is only looked up when hashing was requested.
                key = fhash(word,M) if command==True else word
                counts[key] = counts.get(key, 0) + 1
    return [[key, counts[key]] for key in sorted(counts)]
#-----INPUT-------------------
# Read the file name, then the hashing choice until it is valid.  `command`
# ends up as a bool and is read by BoW().
file_name=input('File name = ').strip()
while True:
    command=input('Use feature hashing ? (y,Y,n,N) ').strip()
    if command=='Y' or command=='y':
        command=True
        break
    elif command=='N' or command=='n':
        command=False
        break
    else:
        print('Try again.')
if command==True:
    # M is read as a module-level name by BoW().
    M=int(input('M = ').strip())
#---STOPWORD-PREP-----------------
# Module-level stop-word list, read by BoW().
stopwords=[]
stop_file=open('stopwords.txt','r')
for line in stop_file:
    wordinline=line.split()
    for e in wordinline:
        stopwords.append(e)
stop_file.close()
#------SHOW-----------------------
print('-------------------')
print('char count =',char_count(file_name))
print('alphanumeric count =',alphanumeric_count(file_name))
print('line count =',line_count(file_name))
print('word count =',word_count(file_name))
print('BoW =',BoW(file_name))
#-------------------------------
# 6330321921 (12.00) 168 (2021-03-22 13:15)
#Prog-08: Bag-of-words
#6330321921 (12.00) Poonnawich Kerdsup
def fhash(w, M):
    """Feature hash: sum of ord(w[i]) * 37**i over the word, modulo M."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(w)) % M
def blank(t):
    """Return *t* with the listed punctuation marks replaced by spaces."""
    marks = '\'\"/\\,.:;?!'
    return ''.join(' ' if c in marks else c for c in t)
def count_word(list):
    """Count the items that start with an ASCII letter or a digit.

    list -- iterable of strings (typically the pieces of a split line).

    Only the first character is inspected.  The original compared the whole
    string with 'z' / '9', so longer words starting with one of those
    (e.g. 'zoo') compared greater and were not counted.
    """
    c = 0
    for e in list:
        head = e.lower()[:1]
        if 'a' <= head <= 'z' or '0' <= e[:1] <= '9':
            c += 1
    return c
def stopwords_list(file_name):
    """Collect the stop words from an iterable of text lines.

    file_name -- despite its name, an open file (or any iterable of lines).

    Returns the list of words.  Lines are split on any whitespace, so the
    last word of a line no longer keeps its trailing newline and repeated
    blanks no longer yield empty entries, as happened with split(' ').
    """
    stopwords = []
    for row in file_name:
        stopwords.extend(row.split())
    return stopwords
#-----------------------------------------------------
#stopwords
# Load the stop words once, before any prompt.
swin = open('stopwords.txt', 'r')

stopwords = stopwords_list(swin)

swin.close()
#-----------------------------------------------------

file_name = input('File name = ').strip()
yorn = input('Use feature hashing ? (y,Y,n,N) ').strip()

fn = open(file_name, 'r')
t = True
while t:
    if yorn == 'y' or yorn == 'Y':
        M = int(input('M = '))
        t = False
    elif yorn == 'n' or yorn == 'N':
        # -1 marks "no hashing" for the bag-of-words section below.
        M = -1
        t = False
    else:
        # Message fixed: it lacked the final period.
        print('Try again.')
        yorn = input('Use feature hashing ? (y,Y,n,N) ').strip()

print('-'*19) # 19 '-'

#----------------------------------------------------------

c = 0            # characters, newlines excluded
d = 0            # letters and digits
line_count = 0
word_count = 0
for line in fn:
    if line[-1] == '\n':
        c += len(line) - 1
    else:
        c += len(line)
    line_count += 1
    for e in line:
        if 'a' <= e.lower() <= 'z' or \
           '0' <= e <= '9':
            d += 1
    b = blank(line)
    x = b.split(' ')
    word_count += count_word(x)



print('char count =', str(c)) # character count of the file
# Label fixed: it was misspelled 'alphanumaric count'.
print('alphanumeric count =', str(d)) # letters and digits in the file
print('line count =', str(line_count)) # lines counted
print('word count =', str(word_count)) # words counted

fn.close()

#-----------------------------------------------------------

#BoW ต่อ
# Bag-of-words (continued): second pass over the input file.
fn = open(file_name, 'r')

# Strip the entries so the test below works even when an entry still
# carries a trailing newline.
stop_set = {sw.strip() for sw in stopwords}

# Kept words, lower-cased.  Each line is cleaned with blank() and split, so
# neighbouring words stay separate; the original appended the kept words to
# one string without any separator, which fused them.  Starting from an
# empty list also covers an empty input file.
y = []
for line in fn:
    for e in blank(line).lower().split():
        if e not in stop_set:
            y.append(e)
fn.close()

# Count by hash value when hashing was requested (M != -1), else by word.
if M != -1:
    keys = [fhash(e, M) for e in y]
else:
    keys = y
counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
BoW = [[key, counts[key]] for key in sorted(counts)]

print('BoW =', BoW)




# 6330322521 (26.00) 169 (2021-03-22 14:44)
def BoW(words):
    """Sort *words* in place and return [item, count] for each distinct item.

    The pairs come out in sorted order; note that the caller's list is
    reordered as a side effect.
    """
    words.sort()
    pairs = []
    for item in words:
        # Equal items are adjacent after sorting, so extend the last run.
        if pairs and pairs[-1][0] == item:
            pairs[-1][1] += 1
        else:
            pairs.append([item, 1])
    return pairs
# Open the input file straight from the prompt and ask for the hashing
# choice until it is one of y / Y / n / N.
file_name=open(input('File name = '),'r')
p=input('Use feature hashing ? (y,Y,n,N) ')
while p != 'y' and p != 'Y' and p != 'n' and p != 'N':
    print('Try again.')
    # On a retry the prompt is printed on its own line and the answer is
    # read by a bare input().
    print('Use feature hashing ? (y,Y,n,N) ')
    p=input()
    
if p == 'y' or p== 'Y':M=int(input('M = '))
print('-------------------') 
# Read the stop words: newlines become blanks, then the text is split.
S=open('stopwords.txt','r')
stop1=''
stop2=''
for l in S:
    stop1+=l
for g in stop1:
    if g == '\n':
        stop2+=' '
    else : stop2+=g
s=stop2.split()
S.close()

# Whole input text, lower-cased.
a=''
for l in file_name:
    a+=l.lower()

# Characters other than newlines.
c=0
for e in a:
    if e != '\n':c+=1
print('char count = ',c)

# Copy of the text in which every non-alphanumeric character is a blank.
a1=''
for e in a:
    if 'a'<=e<='z' or 'A'<=e<='Z' or'0'<=e<='9':
        a1+=e
    else :a1+=' '
    
c=0
for e in a1:
    if not e == ' ':c+=1
print('alphanumeric count = ',c)

# NOTE(review): this counts newline characters, so a last line without a
# trailing newline is not included -- confirm this is intended.
c=0
for e in a:
    if e == '\n':c+=1
print('line count = ',c)

c=len(a1.split())
print('word count = ',c)

# Words that are not stop words.
F=[]
a1=a1.split()
for e in a1:
    if e not in s:
        F.append(e)

if p == 'n' or p=='N':
     print('BoW =',BoW(F))
if p == 'y' or p == 'Y':
    a2=F
    a3=[]
    # Inline feature hash: sum of ord(c_i) * 37**i, modulo M.
    for e in a2:
        num=0
        for i in range(len(e)):
            num+=ord(e[i])*(37**i)
        a3.append(num%M)
    print('BoW =',BoW(a3))
file_name.close()
# 6330323121 (22.43) 170 (2021-03-22 21:47)

#.....................................................................................
# Let w be a word made of the characters c0 c1 c2 ... c(n-1).
# fhash(w, M) = (ord(c0) + ord(c1)*G**1 + ord(c2)*G**2 + ... + ord(c(n-1))*G**(n-1)) % M


def fhash(w,M) :
    """Feature hash: sum of ord(w[i]) * 37**i over the word, modulo M."""
    G=37
    fh=0
    weight=1            # G**position, updated incrementally
    for ch in w:
        fh+=ord(ch)*weight
        weight*=G
    return fh%M
def char_count(file_name):
    """Return the number of characters in the file, newlines excluded.

    The original subtracted (number of lines - 1) from the total length,
    which assumes that exactly the last line lacks a newline: a file ending
    with a newline was over-counted by one and an empty file reported 1.
    Newlines are now skipped directly.  The unreachable second return is
    gone as well.

    file_name -- path of the file to read.
    """
    with open(file_name) as f:
        return sum(1 for row in f for ch in row if ch != '\n')
def a_and_num_count(file_name):
    """Return the number of ASCII letters (either case) and digits."""
    alphabet='abcdefghijklmnopqrstuvwxyz'
    accepted=(alphabet, alphabet.upper(), '0123456789')
    with open(file_name) as f:
        return sum(
            1
            for row in f
            for i in row
            if any(i in group for group in accepted)
        )
def words_count(file_name):
    """Return the number of words in the file.

    Every character that is not an ASCII letter or digit acts as a word
    separator.
    """
    alphabet='abcdefghijklmnopqrstuvwxyz'
    accepted=alphabet+alphabet.upper()+'0123456789'
    pieces=[]
    with open(file_name) as f:
        for row in f:
            for i in row:
                pieces.append(i if i in accepted else ' ')
    return len(''.join(pieces).split())
    
    
    
def line_count(file_name):
    """Return the number of lines in the file."""
    with open(file_name) as f:
        total = 0
        for total, _ in enumerate(f, start=1):
            pass
        return total
def BoW_Nn(file_name,stop):
    """Build the bag of words of ``file_name`` without feature hashing.

    Characters other than ASCII letters and digits are treated as
    separators, words are lower-cased, and words listed in the ``stop``
    file (whitespace separated, compared in lower case) are dropped.

    Returns a list of ``[word, count]`` pairs sorted by word; the list is
    empty when no word survives the filtering.
    """
    letters = set("abcdefghijklmnopqrstuvwxyz")
    digits = set("1234567890")

    # ``with`` guarantees both files are closed even if reading fails.
    with open(file_name) as f:
        cleaned = "".join(
            ch if (ch.lower() in letters or ch in digits) else " "
            for ch in f.read()
        )
    with open(stop) as s:
        stop_words = set(s.read().lower().split())

    words = sorted(w for w in cleaned.lower().split() if w not in stop_words)

    # Run-length encode the sorted words; nothing to do for an empty list.
    bag = []
    for w in words:
        if bag and bag[-1][0] == w:
            bag[-1][1] += 1
        else:
            bag.append([w, 1])
    return bag
def BoW_Yy(file_name,stop,M):
    """Build the feature-hashed bag of words of ``file_name``.

    Every word of the plain bag (``BoW_Nn``) is mapped to ``fhash(word, M)``
    and the word frequencies falling into the same bucket are summed, so a
    word that occurs k times adds k to its bucket.

    Returns ``[hash, count]`` pairs sorted by hash; empty if no word remains.
    """
    totals = {}
    for word, freq in BoW_Nn(file_name, stop):
        bucket = fhash(word, M)
        totals[bucket] = totals.get(bucket, 0) + freq
    return [[bucket, totals[bucket]] for bucket in sorted(totals)]
#..........................................

# Driver: ask for the file and the hashing option, then print the report.
file_name=input('File name = ')
while True:
    yn=input('Use feature hashing ? (y,Y,n,N) ')
    if yn in ('Y','y','N','n'):
        break
    print('Try again.')

use_hash = yn in ('Y','y')
if use_hash:
    M=int(input('M = '))

for label, counter in (('char count = ', char_count),
                       ('alphanumeric count = ', a_and_num_count),
                       ('line count = ', line_count),
                       ('word count = ', words_count)):
    print(label+str(counter(file_name)))

# NOTE(review): the stop-word file is named 'stopword.txt' here (singular);
# confirm that this is the intended file name.
if use_hash:
    print('BoW = '+str(BoW_Yy(file_name,'stopword.txt',M)))
else:
    print('BoW = '+str(BoW_Nn(file_name,'stopword.txt')))


# 6330324821 (30.00) 171 (2021-03-21 15:24)
def count(Bow_wordsi,new_sentence):
    """Return how many items of ``new_sentence`` are equal to ``Bow_wordsi``."""
    return sum(1 for eachword in new_sentence if eachword == Bow_wordsi)
    
def delete(line):
    """Return ``line`` stripped, with non-alphanumeric characters blanked.

    Leading/trailing whitespace is removed first; every remaining character
    that is not an ASCII letter or digit is replaced by a single space, so
    the result has the same length as ``line.strip()``.
    """
    # The full upper-case alphabet, including 'Y'.
    allowed = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    return ''.join(c if c in allowed else ' ' for c in line.strip())
def change(sentence,allstopwords):
    """Return the lower-cased words of ``sentence`` that are not stop words.

    ``sentence`` is a string; it is stripped, lower-cased, cleaned with
    ``delete`` and split on whitespace before filtering.
    """
    tokens = delete(sentence.strip().lower()).split()
    return [token.lower() for token in tokens if token not in allstopwords]
def Bag_of_words(list_newsentence):
    """Return ``[word, frequency]`` pairs for the given words, sorted by word.

    The input list is sorted in place as a side effect.
    """
    list_newsentence.sort()
    distinct = []
    for word in list_newsentence:
        if word in distinct:
            continue
        distinct.append(word)
    return [[word, count(word, list_newsentence)] for word in distinct]
def fhash(w,M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` over all positions ``i`` of ``w``."""
    ans = 0
    for i, ch in enumerate(w):
        ans += ord(ch) * (37 ** i)
    return ans % M

# Read the input: the file name and whether feature hashing is wanted.
file_name = input('File name = ')
choice = input('Use feature hashing ? (y,Y,n,N) '"")
if choice == 'y' or choice == 'Y':
    check=True
    M=int(input('M = '))
elif choice == 'n' or choice == 'N':
    check=False
else:
    # Invalid answer: keep asking until one of y/Y/n/N is entered, then
    # set ``check`` (and read M when hashing) exactly as above.
    while choice not in ['y','Y','n','N']:
        print('Try again.')
        choice = input('Use feature hashing ? (y,Y,n,N) '"")
    if choice == 'y' or choice == 'Y':
        check=True
        M=int(input('M = '))
    elif choice == 'n' or choice == 'N':
        check=False

# Read the stop words: whitespace-separated tokens, duplicates skipped.
stopwords=open('stopwords.txt', 'r')
allstopwords=[]
for line in stopwords:
    stopword_in_line=line.strip().split()
    for c in stopword_in_line:
        if c not in allstopwords:
            allstopwords.append(c)
stopwords.close()
print('-------------------')
# Read the input file and accumulate the four counters.
fn=open(file_name , 'r')
line_count=0
character_count=0
word_count=0
ch_nb_count=0
AllBows=[]
newline_string=''
sentence=''
for line in fn:
    # Characters are counted on the stripped line.
    character_count+=len(line.strip())
    line_count+=1
    # delete() blanks the characters it does not accept, so every
    # non-space character left is counted as alphanumeric.
    newline=delete(line)
    for c in newline:
        if c != ' ':
            ch_nb_count+=1
    newline_list=newline.split()
    word_count+=len(newline_list)
    # Collect all words of the file in one space-separated string.
    for i in range(len(newline_list)):
        sentence+=newline_list[i]+' '
fn.close()
print('char count =',character_count)
print('alphanumeric count =',ch_nb_count)
print('line count =',line_count)
print('word count =',word_count)
# Plain bag of words: [word, frequency] pairs after stop-word removal.
Bows=Bag_of_words(change(sentence,allstopwords))
if check:
    # Feature hashing: expand every word into ``frequency`` copies of its
    # hash, then count how often each hash occurs.
    new1=[]
    for i in range(len(Bows)):
        some=Bows[i][1]
        for k in range(some):
            new1.append(fhash(Bows[i][0],M))
    new=[]
    new2=[]
    new1_sort=sorted(new1)
    c=1
    xxx=[]
    # xxx holds the distinct hashes in ascending order, new2 their counts.
    for c in new1_sort:
        if c not in xxx:
            N=count(c,new1_sort)
            new2.append(N)
            xxx.append(c)
    for i in range(len(xxx)):
        new.append([xxx[i],new2[i]])
    Bows=new
print('BoW =',Bows)
# 6330325421 (27.00) 172 (2021-03-22 23:44)
def cut(a):
    """Return ``a`` with every non-ASCII-alphanumeric character turned into a space."""
    return ''.join(
        c if ('a'<=c<='z' or 'A'<=c<='Z' or '0'<=c<='9') else ' '
        for c in a
    )
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``.

    Only lower-case ASCII letters and digits contribute; any other
    character is skipped but still occupies its position (exponent).
    """
    G=37
    total=0
    for pos, ch in enumerate(w):
        if not ('a'<=ch<='z' or ch in '0123456789'):
            continue
        total+=ord(ch)*(G**pos)
    return int(total%M)
def count( data, element ):
    """Return the number of items in ``data`` that compare equal to ``element``."""
    return sum(1 for e in data if e == element)
#-----------------------------
# Load the stop words (lower-cased, whitespace separated) into word2.
# infile2 is deliberately left open here; it is closed at the end of the script.
word2=[]
file_name2='stopwords.txt' 
infile2=open(file_name2,"r")
for line2 in infile2:
    line2=line2.lower().split()
    word2+=line2
    
#------------------------------
file_name =input('File name = ' )
x=input('Use feature hashing ? (y,Y,n,N) ')
# Compare against a tuple of the valid answers: a substring test on 'yYnN'
# would also accept '', 'yY', 'Yn' and 'nN'.
while x not in ('y','Y','n','N'):
    print('Try again.')
    x=input('Use feature hashing ? (y,Y,n,N) ')
# Feature-hashing branch: read M, gather the statistics and print a BoW of
# [hash, count] pairs.
if x=='y' or x=='Y':
    M=int(input('M = '))
    infile=open(file_name,"r")
    word=[]
    word_count=0
    character_count=0
    alphanumeric_count=0
    line_count=0
    BoW=[]
    bow=[]
    realbow=[]
    true=[]
    truefhash=[]
    realbowfhash=[]
    print('-------------------')
    for line in infile:
        # Count the characters of the line without its newline.
        if '\n' in line:
            character_count+=len(line)-1
        else:
            character_count+=len(line)
        # cut() blanks non-alphanumeric characters; split() yields the words.
        line=cut(line).split()
        word_count+=len(line)
        word+=line
#-------------------------------------------------
        for i in line:
            true.append(i.lower())
            # bow: distinct lower-cased words in order of first appearance.
            if i.lower() not in bow:
                bow.append(i.lower())
            alphanumeric_count+=len(i)
        line_count+=1
        # truefhash: hash of every non-stop-word occurrence.
        for c in line:
            if c.lower() not in word2:
                truefhash.append(fhash(c.lower(),M)) 
        # realbow: distinct words that are not stop words.
        for a in range(len(bow)):
            if bow[a] not in word2 and (bow[a] not in realbow):
                realbow.append(bow[a])
    print('char count =',character_count)
    print('alphanumeric count =',alphanumeric_count)
    print('line count =',line_count)
    print('word count =',word_count)
#-------------------------------------------------
    # Replace each distinct word by its hash; emit one [hash, count] pair the
    # first time a hash is seen, counting its occurrences in truefhash.
    for i in range(len(realbow)):
        realbow[i]=fhash(realbow[i],M)
        if realbow[i] not in realbowfhash:
            realbowfhash.append(realbow[i])
            BoW.append([realbow[i],count( truefhash, realbow[i] )])
    print('BoW =',BoW)
# Plain branch: same statistics, BoW of [word, count] pairs.
elif x=='n' or x=='N':
    infile=open(file_name,"r")
    word=[]
    word_count=0
    character_count=0
    alphanumeric_count=0
    line_count=0
    BoW=[]
    bow=[]
    realbow=[]
    true=[]
    print('-------------------')
    for line in infile:
        # Count the characters of the line without its newline.
        if '\n' in line:
            character_count+=len(line)-1
        else:
            character_count+=len(line)
        line=cut(line).split()
        word_count+=len(line)
        word+=line
        for i in line:
            # true: every word occurrence, lower-cased.
            true.append(i.lower())
            if i.lower() not in bow:
                bow.append(i.lower())
            alphanumeric_count+=len(i)
        line_count+=1 
        for a in range(len(bow)):
            if bow[a] not in word2 and (bow[a] not in realbow):
                realbow.append(bow[a])
    print('char count =',character_count)
    print('alphanumeric count =',alphanumeric_count)
    print('line count =',line_count)
    print('word count =',word_count)  
    # One [word, count] pair per distinct non-stop word, in order of first
    # appearance (the list is not sorted).
    for i in range(len(realbow)):
        BoW.append([realbow[i],count( true, realbow[i] )])   
    print('BoW =',BoW) 
# Close the input file and the stop-word file opened earlier in the script.
infile.close()
infile2.close()

# 6330326021 (30.00) 173 (2021-03-21 18:07)
def words_in_line(line):
    """Split ``line`` into words, treating listed punctuation as separators.

    Each punctuation character in the separator set is replaced by a space
    and the result is split on whitespace.
    """
    # Same character set as before, written with valid escapes only
    # (the backslash is spelled ``\\``).
    separators = ".,<>/?\\|!~`()*&^%$#@_-+=][}{'\";:"
    new_line = "".join(" " if a in separators else a for a in line)
    return new_line.split()
def list_of_words(f):
    """Print the file statistics and return the words read from ``f``.

    ``f`` is iterated line by line. Each line is stripped; the character
    count is the total stripped length and the "alphanumeric count" is the
    total length of the words returned by ``words_in_line``.
    """
    stripped = [raw.strip() for raw in f]
    words = []
    for text in stripped:
        words.extend(words_in_line(text))
    print("char count = " + str(sum(len(text) for text in stripped)))
    print("alphanumeric count = " + str(sum(len(w) for w in words)))
    print("line count = " + str(len(stripped)))
    print("word count = " + str(len(words)))
    return words
def mod_words(words):
    """Return the lower-cased ``words`` that are not in the global ``stop_words``."""
    lowered = (w.lower() for w in words)
    return [w for w in lowered if w not in stop_words]
def non_feature_hashing(words):
    """Return the sorted bag of words: one ``[word, count]`` pair per distinct word."""
    distinct = []
    for w in words:
        if w in distinct:
            continue
        distinct.append(w)
    pairs = [[w, words.count(w)] for w in distinct]
    pairs.sort()
    return pairs
def fhash(word, M):
    """Return ``sum(ord(word[i]) * 37**i) % M``."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M
def feature_hashed(words, M):
    """Return the list of ``fhash(word, M)`` for every word, in order."""
    hashed = []
    for word in words:
        hashed.append(fhash(word, M))
    return hashed
def feature_hashing(words, M):
    """Return the sorted hashed bag of words: ``[hash, count]`` per distinct hash."""
    new_words = feature_hashed(words, M)
    distinct = []
    for h in new_words:
        if h in distinct:
            continue
        distinct.append(h)
    pairs = [[h, new_words.count(h)] for h in distinct]
    pairs.sort()
    return pairs

# Driver: open the input, load the stop words, then prompt until the user
# gives a valid hashing choice and print the report.
file_name = input("File name = ")
fin = open(file_name, "r")

stop_words = []
stop_words_file = open("stopwords.txt", "r")
for line in stop_words_file:
    stop_words.extend(line.strip().split())
stop_words_file.close()

while True:
    command = input("Use feature hashing ? (y,Y,n,N) ")
    if command in ("n", "N"):
        print("-" * len("Use feature hashing"))
        words = list_of_words(fin)
        print("BoW =", non_feature_hashing(mod_words(words)))
        break
    if command in ("y", "Y"):
        M = int(input("M = "))
        print("-" * len("Use feature hashing"))
        words = list_of_words(fin)
        print("BoW =", feature_hashing(mod_words(words), M))
        break
    print("Try again.")

fin.close()
# 6330327721 (26.00) 174 (2021-03-22 21:12)
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` reduced modulo ``M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M
def BoW(st):
    """Return the bag of words of ``st`` after stop-word removal.

    ``st`` is a list of words. Words listed in ``stopwords.txt`` (current
    directory, whitespace separated) are dropped; the result is a list of
    ``[word, count]`` pairs sorted by word, or ``[]`` if nothing remains.
    """
    # The file name has no trailing space; ``with`` closes it on every path.
    with open("stopwords.txt", 'r') as sw:
        swn = set(sw.read().split())

    stn = sorted(w for w in st if w not in swn)

    # Run-length encode the sorted words (safe for an empty list).
    bow = []
    for w in stn:
        if bow and bow[-1][0] == w:
            bow[-1][1] += 1
        else:
            bow.append([w, 1])
    return bow
def fhB(st,M):
    """Return the feature-hashed bag of words of ``st``.

    Each ``[word, count]`` pair from ``BoW(st)`` is mapped to the bucket
    ``fhash(word, M)`` and the counts of colliding words are added up.
    Returns ``[hash, count]`` pairs sorted by hash; ``[]`` for an empty bag.
    """
    totals = {}
    for word, freq in BoW(st):
        bucket = fhash(word, M)
        totals[bucket] = totals.get(bucket, 0) + freq
    return [[bucket, totals[bucket]] for bucket in sorted(totals)]
def show(cc,cnc,lc,wc):
    """Print the four statistics lines.

    cc: character count, cnc: alphanumeric count, lc: line count,
    wc: word count. Every line has the form ``<label> = <value>`` with a
    single space on each side of ``=``.
    """
    print('char count =',cc)
    print('alphanumeric count =',cnc)
    print('line count =',lc)
    # No trailing space in the label: print() already inserts the separator.
    print('word count =',wc)
    
def analize(file_name,M):
    """Print the statistics and the bag of words of ``file_name``.

    Lines are lower-cased and stripped; characters other than lower-case
    ASCII letters and digits separate words. ``M == ''`` selects the plain
    bag of words (``BoW``); any other value is passed to ``fhB`` as the
    hashing modulus.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz' + '0123456789'
    pieces = []
    cc = 0   # characters (after strip)
    cnc = 0  # alphanumeric characters
    lc = 0   # lines
    # Read everything inside ``with`` so the file is closed even if one of
    # the helpers below raises.
    with open(file_name,'r') as f:
        for raw in f:
            lc += 1
            text = raw.lower().strip()
            cc += len(text)
            for l in text:
                if l in allowed:
                    cnc += 1
                    pieces.append(l)
                else:
                    pieces.append(' ')
            # Line break acts as a word separator.
            pieces.append(' ')
    x = ''.join(pieces).split()
    show(cc,cnc,lc,len(x))
    if M == '':
        bow = BoW(x)
    else:
        bow = fhB(x,M)
    print('BoW =',bow)
def choice():
    """Prompt for the file name and hashing option, then run ``analize``.

    ``analize`` receives the integer M when hashing is requested and the
    empty string otherwise.
    """
    file_name = input('File name = ')
    while True:
        c=input('Use feature hashing ? (y,Y,n,N) ')
        if c in ['n','N','y','Y']:
            break
        print('Try again.')
    M = int(input('M = ')) if c in ['y','Y'] else ''
    print('-------------------')
    analize(file_name,M)

choice()
# 6330328321 (30.00) 175 (2021-03-21 15:49)
def readfile(filename):
    """Return the lines of ``filename`` as a list of stripped strings."""
    # ``with`` closes the file even if reading raises.
    with open(filename,'r') as handle:
        return [line.strip() for line in handle]
def charcount(lines):
    """Return the total length of all strings in ``lines``."""
    return sum(len(text) for text in lines)
def removespecial(s):
    """Return ``s`` with every character outside a-z / 0-9 replaced by a space."""
    return ''.join(
        ch if ('a'<=ch<='z' or '0'<=ch<='9') else ' '
        for ch in s
    )
def dosomething(do):
    """Validate the hashing answer and read M when hashing is requested.

    ``do`` is the (lower-cased) answer; the user is re-prompted until it is
    'y' or 'n'. Returns ``(do, M)`` where M is the raw input string for 'y'
    and -1 for 'n'. The separator line is printed before returning.
    """
    M=-1
    while do not in ('y','n'):
        print('Try again.')
        do=input('Use feature hashing ? (y,Y,n,N) ').lower()
    if do=='y':
        M = input('M = ')
    print('-------------------')
    return do,M
def fhash(word,M):
    """Return the base-37 polynomial hash of ``word`` modulo ``int(M)``."""
    nsum = sum(ord(ch) * (37 ** i) for i, ch in enumerate(word))
    return nsum % int(M)
        
def calbow(listwords):
    """Return sorted ``[item, count]`` pairs for the items of ``listwords``."""
    wordlist = []
    nlist = []
    for item in listwords:
        if item in wordlist:
            nlist[wordlist.index(item)] += 1
            continue
        wordlist.append(item)
        nlist.append(1)
    bowlist = [[item, n] for item, n in zip(wordlist, nlist)]
    bowlist.sort()
    return bowlist
def calbowfhash(listwords,M):
    """Return the sorted feature-hashed bag of words of ``listwords``.

    Every word is hashed once with ``fhash(word, M)`` and the hashes are
    counted by ``calbow``, giving ``[hash, count]`` pairs sorted by hash.
    """
    return calbow([fhash(word, M) for word in listwords])
def main():
    """Prompt for the options, print the file statistics and the bag of words."""
    x=input('File name = ')
    do,M = dosomething(input('Use feature hashing ? (y,Y,n,N) ').lower())
    s=readfile(x)
    line_count=len(s)
    n=charcount(s)
    # Lines are joined with spaces so a line break separates words.
    word = removespecial(' '.join(s).lower()).split()
    print('char count =',n)
    print('alphanumeric count =',len(''.join(word)))
    print('line count =',line_count)
    print('word count =',len(word))
    stopwords = ' '.join(readfile('stopwords.txt')).split()
    listwords = [w for w in word if w not in stopwords]
    if do=='y':
        print('BoW =',calbowfhash(listwords,M))
    else:
        print('BoW =',calbow(listwords))
main()
    
    

# 6330329021 (30.00) 176 (2021-03-22 00:33)

# Ask for the file name and keep prompting until the hashing answer is valid.
file_name = input('File name = ')
hash_check = ''
while hash_check not in ['y','Y','n','N']:
    hash_check = input('Use feature hashing ? (y,Y,n,N) ')
    if hash_check not in ['y','Y','n','N']:
        print('Try again.')
    
# M is only read (and only defined) when hashing was requested.
if hash_check.lower() == 'y':
    M = int(input('M = '))
print('-------------------')

# Load the stop words: whitespace-separated tokens from stopwords.txt.
stopwords_file = open('stopwords.txt', 'r')
stopwords_list = []
for line in stopwords_file:
    line = line.strip().split()
    for e in line:
        stopwords_list.append(e)
stopwords_file.close()

# Pass 1: count every character except newlines.
file = open(file_name, 'r')
chars = 0
for line in file:
    for e in line:
        if e not in '\n':
            chars += 1
print('char count = '+str(chars))
file.close()

# Pass 2: count characters whose lower-case form is an ASCII letter or digit.
file = open(file_name, 'r')
alphanumeric = 0
for line in file:
    for e in line:
        if e.lower() in '0123456789abcdefghijklmnopqrstuvwxyz':
            alphanumeric += 1
print('alphanumeric count = '+str(alphanumeric))
file.close()

# Pass 3: count lines.
file = open(file_name, 'r')
line_count = 0
for line in file:
    line_count += 1
print('line count = '+str(line_count))
file.close()

# Pass 4: count words; each lower-cased line has its non-alphanumeric
# characters blanked and is then split on whitespace.
file = open(file_name, 'r')
word_count = 0
clean_line = ''
for line in file:
    line = line.lower()
    for e in line:
        if e in '0123456789abcdefghijklmnopqrstuvwxyz':
            clean_line += e
        else:
            clean_line += ' '
    line = clean_line.strip().split()
    word_count += len(line)
    # Reset the buffer for the next line.
    clean_line = ''
print('word count = '+str(word_count))
file.close()

# Pass 5: collect every word that is not a stop word into list_of_words
# (same cleaning as pass 4). BoW is filled later in the script.
file = open(file_name, 'r')
BoW = []
list_of_words = []
clean_line = ''
for line in file:
    line = line.lower()
    for e in line:
        if e in '0123456789abcdefghijklmnopqrstuvwxyz':
            clean_line += e
        else:
            clean_line += ' '
    # clean_line is rebound to the list of words of this line.
    clean_line = clean_line.strip().split()
    for e in clean_line:
        if e not in stopwords_list:
            list_of_words.append(e)
    clean_line = ''
file.close()
def fhash(w,M):
    """Return ``sum(ord(w[i]) * 37**i) % M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M

# Replace every word by its hash when feature hashing was requested.
if hash_check.lower() == 'y':
    list_of_words[:] = [fhash(w, M) for w in list_of_words]

# Sort, then run-length encode into [item, count] pairs.
list_of_words.sort()
for item in list_of_words:
    if BoW and BoW[-1][0] == item:
        BoW[-1][1] += 1
    else:
        BoW.append([item, 1])
print('BoW = '+str(BoW))
# 6330330521 (0.00) 177 (2021-03-22 01:16)

#============================================================
def remove_punc(t):
    """Return ``t`` with each listed punctuation character replaced by a space."""
    # Same character set as before, written with valid escapes only
    # (the backslash is spelled ``\\``).
    punctuation = "!()-[]{};:'\"\\,<>./?@#$%^&*_~"
    return "".join(" " if e in punctuation else e for e in t)
#=======================================================================
def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch) * (37 ** i)
    return total % M
#=========================================================================
def bow_n(file_name="sample.txt", stopwords_name="stop words.txt"):
    """Return the sorted bag of words of a text file.

    file_name: text file to analyse (defaults to the previously hard-coded
        "sample.txt").
    stopwords_name: whitespace-separated stop-word file (defaults to the
        previously hard-coded "stop words.txt").

    Lines are lower-cased and cleaned with ``remove_punc``. The result is
    one ``[word, count]`` pair per distinct non-stop word, sorted by word.
    """
    list_word = []
    with open(file_name) as fn, open(stopwords_name) as fr:
        for line in fn:
            list_word += remove_punc(line.lower()).split()
        list_stopwords = fr.read().split()

    unique_word = []
    for word in list_word:
        if word not in unique_word and word not in list_stopwords:
            unique_word.append(word)

    # Keep each count as one integer so that counts of 10 or more stay
    # attached to their word.
    last_bow = [[word, list_word.count(word)] for word in unique_word]
    last_bow.sort()
    return last_bow
#=========================================================================
def count_char(file_name="sample.txt"):
    """Return the number of characters in the file, excluding newlines.

    file_name defaults to the previously hard-coded "sample.txt". Only a
    newline that is actually present is excluded, so a last line without a
    terminator is counted in full.
    """
    total = 0
    with open(file_name) as fn:
        for line in fn:
            total += len(line.rstrip("\n"))
    return total
#=============================================
def count_line():
    """Return the number of lines in "sample.txt"."""
    fn = open("sample.txt")
    total = sum(1 for _ in fn)
    fn.close()
    return total

#================================================================
def count_word():
    """Return the number of words in "sample.txt" after ``remove_punc`` cleaning."""
    fn = open("sample.txt")
    total = sum(len(remove_punc(line).split()) for line in fn)
    fn.close()
    return total
#==============================================================
def count_alpha():
    """Return how many characters of "sample.txt" are ASCII letters or digits.

    Letters are matched case-insensitively by lower-casing each line.
    """
    allowed = list('abcdefghijklmnopqrstuvwxyz') + list('0123456789')
    total = 0
    fn = open("sample.txt")
    for line in fn:
        # The index range is fixed from the line as read, before lower-casing.
        width = len(line)
        line = line.lower()
        for i in range(width):
            if line[i] in allowed:
                total += 1
    fn.close()
    return total
#==============================================================
def bow_y(file_name="sample.txt", stopwords_name="stop words.txt"):
    """Return one ``[word, count]`` entry per non-stop word OCCURRENCE.

    file_name / stopwords_name default to the previously hard-coded paths.
    Unlike ``bow_n`` the entries are not de-duplicated: a word that occurs
    k times yields k identical entries. The list is sorted.
    """
    list_word = []
    with open(file_name) as fn, open(stopwords_name) as fr:
        for line in fn:
            list_word += remove_punc(line.lower()).split()
        list_stopwords = fr.read().split()

    # Count once per distinct word instead of once per occurrence.
    frequency = {}
    for word in list_word:
        frequency[word] = frequency.get(word, 0) + 1

    # Each count is kept as one integer so that counts of 10 or more stay
    # attached to their word.
    last_bow = [[word, frequency[word]]
                for word in list_word if word not in list_stopwords]
    last_bow.sort()
    return last_bow

#==============================================================
# Driver: read the options, print the statistics, then the bag of words.
# NOTE(review): file_name is read but the helper functions are called
# without arguments and open their own default file; confirm the intent.
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')

use_hashing = fh in ('y', 'Y')
if use_hashing:
    M = int(input('M = '))

# The separator is printed exactly once in both modes.
print('-'*19)
print('char count = '+str(count_char()))
print('alphanumeric count = '+str(count_alpha()))
print('line count = '+str(count_line()))
print('word count = '+str(count_word()))

if use_hashing:
    # bow_y() yields one entry per word occurrence; hash each of them.
    # The result is stored under a new name so the function stays callable.
    list_bow = [fhash(entry[0], M) for entry in bow_y()]
    unique_bow = []
    for e in list_bow:
        if e not in unique_bow:
            unique_bow.append(e)
    # Counts are kept as integers so multi-digit counts stay intact.
    num_bow = [[e, list_bow.count(e)] for e in unique_bow]
    num_bow.sort()
    print('BoW = '+ str(num_bow))
else:
    print('BoW = '+ str(bow_n()))

# 6330331121 (30.00) 178 (2021-03-22 14:35)

def fhash (w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``.

    The empty word hashes to ``0 % M`` rather than raising.
    """
    G = 37
    a = 0
    for i, ch in enumerate(w):
        a += ord(ch) * (G**i)
    # Reduce once after the loop so the result is defined for any length.
    return a % M

# Ask for the file name and a valid hashing answer.
file_name = input("File name = ")
use = input("Use feature hashing ? (y,Y,n,N) ")
# NOTE: the names d, e and f are reused as loop variables further down, so
# these lists are only reliable up to the separator print below.
d = ["y","Y"]
e = ["n","N"]
f = ["y","Y","n","N"]  
while use not in f:
    print("Try again.")
    use = input("Use feature hashing ? (y,Y,n,N) ")
if use in d:
    M = int(input("M = "))
    print("-------------------")
elif use in e:
    print("-------------------")
    
# Character count: every character except "\n".
charcou = 0
op_file = open(file_name, "r")
for i in op_file:
    for e in i:
        charcou += 1
        if e == "\n":
            charcou += -1
op_file.close()
print("char count =", charcou)

# Alphanumeric count: ASCII letters (both cases) and digits.
alpha = 0
op_file = open(file_name, "r")
for i in op_file:
    for e in i:
        if "A" <= e <= "Z":
            alpha += 1
        if "a" <= e <= "z":
            alpha += 1
        if "0" <= e <= "9":
            alpha += 1
        else:
            alpha += 0
op_file.close()
print("alphanumeric count =", alpha)

# Line count.
op_file = open(file_name, "r")
linecou = 0
for i in op_file:
    linecou += 1
op_file.close()
print("line count =", linecou)


# Word list: upper-case letters are lower-cased, other non-alphanumeric
# characters become spaces, then the text is split on whitespace.
word = ""
wordcou = 0
op_file = open(file_name, "r")
for i in op_file:
    for e in i:
        if "A" <= e <= "Z":
            word += e.lower()
        elif "a" <= e <= "z":
            word += e
        elif "0" <= e <= "9":
            word += e
        else:
            word += " "
       
op_file.close()
wordlist = word.split()

# wordcou ends up equal to len(wordlist).
for f in range(len(wordlist)):
    wordcou += 1

print("word count =", wordcou)

# Stop-word list: same cleaning as above, but letter case is kept as written.
sn = ""
op_stop = open("stopwords.txt", "r")
for i in op_stop:
    for e in i:
        if "A" <= e <= "Z":
            sn += e
        elif "a" <= e <= "z":
            sn += e
        elif "0" <= e <= "9":
            sn += e
        else:
            sn += " "
       
op_stop.close()
stopwordlist = sn.split()   

# Every word occurrence that is not a stop word, in file order.
kept_words = [w for w in wordlist if w not in stopwordlist]

# pppwrong holds one key per occurrence: the hash when feature hashing is
# enabled, the lower-cased word otherwise.
if use in ["y","Y"]:
    pppwrong = [fhash(w, M) for w in kept_words]
else:
    pppwrong = [w.lower() for w in kept_words]

# pppcorrect holds the distinct keys in order of first appearance. List
# membership compares whole keys, so "cat" is not lost after "category"
# and hash 1 is not lost after hash 12.
pppcorrect = []
for key in pppwrong:
    if key not in pppcorrect:
        pppcorrect.append(key)

# One [key, number of occurrences] pair per distinct key.
bowlistn = [[key, pppwrong.count(key)] for key in pppcorrect]

print("BoW =", bowlistn)


# 6330332821 (30.00) 179 (2021-03-22 14:10)
def fhash(w,M):
    """Return ``sum(ord(c_i) * 37**i) % M`` over the characters of ``w``."""
    G = 37
    sumChar = sum(ord(c) * pow(G, i) for i, c in enumerate(w))
    return sumChar % M
def setBoWList(L):
    """Return ``[item, count]`` pairs for the distinct items of ``L``, sorted by item."""
    distinct = []
    for e in L:
        if e in distinct:
            continue
        distinct.append(e)
    pairs = [[e, L.count(e)] for e in distinct]
    pairs.sort(key = lambda pair: pair[0])
    return pairs
def main():
    """Prompt for the options, print the file statistics and the bag of words.

    Stop words are read from "stopwords.txt" before prompting. With hashing
    enabled each remaining word is replaced by ``fhash(word, M)``.
    """
    # ``with`` closes the stop-word file even if reading fails.
    stopWords = []
    with open("stopwords.txt", "r") as fileStopWords:
        for stopLine in fileStopWords:
            stopWords.extend(stopLine.split())

    txtFile = input("File name = ")

    while True:
        hashing = input("Use feature hashing ? (y,Y,n,N) ")
        if hashing == "y" or hashing == "Y":
            hashing = True
            M = int(input("M = "))
            break
        elif hashing == "n" or hashing == "N":
            hashing = False
            break
        else:
            print("Try again.")

    print("-------------------")

    data = []

    char = 0
    alnum = 0
    line = 0
    word = 0

    with open(txtFile, "r") as source:
        for txtLine in source:
            text = txtLine.strip()
            char += len(text)
            line += 1

            # Build the cleaned line: non-alphanumeric characters become
            # spaces, everything else is lower-cased. ``space`` counts the
            # positions that are not alphanumeric.
            t = ""
            space = 0
            for c in text:
                if not (c.isalnum() or c.isspace()):
                    t += " "
                    space += 1
                else:
                    t += c.lower()
                    if c.isspace():
                        space += 1
            text = t
            alnum += len(text) - space

            text = text.split()
            word += len(text)

            data.extend(text)

    print("char count =", char)
    print("alphanumeric count =", alnum)
    print("line count =", line)
    print("word count =", word)

    clearStop = [e for e in data if e not in stopWords]

    if hashing:
        L = [fhash(w,M) for w in clearStop]
    else:
        L = clearStop

    BoW = setBoWList(L)
    print("BoW =",BoW)
    
main()
# 6330333421 (21.00) 180 (2021-03-22 17:45)
def read_file_to_list(filepath):
    """Return the lines of ``filepath`` as a list of stripped strings."""
    # ``with`` closes the file even if reading raises.
    with open(filepath, 'r') as handle:
        return [line.strip() for line in handle]
def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M
    
def remove_non_anumeric(word):
    """Return ``word`` with every non-alphanumeric character removed."""
    kept = []
    for e in word:
        if e.isalnum():
            kept.append(e)
    return "".join(kept)
def describe_file(lines):
    """Return ``(chars, lines, alphanumerics, words)`` for a list of lines.

    ``lines`` are the stripped lines of the file. A word is a maximal run
    of alphanumeric characters, so punctuation separates words exactly as
    it does when the bag of words is built.
    """
    no_of_char = 0
    no_of_lines = 0
    no_of_anumeric = 0
    no_of_word = 0
    for line in lines:
        no_of_char += len(line)
        no_of_lines += 1
        # Blank out everything that is not alphanumeric, then count what is left.
        cleaned = ''.join(c if c.isalnum() else ' ' for c in line)
        no_of_anumeric += len(cleaned) - cleaned.count(' ')
        no_of_word += len(cleaned.split())
    return(no_of_char,no_of_lines, no_of_anumeric, no_of_word)
def clean_words(words):
    """Split the text ``words`` into lower-cased alphanumeric words.

    Every character that is neither alphanumeric nor a space is replaced by
    a space. The text is lower-cased so that words compare equal to the
    lower-cased stop words regardless of their original case.
    """
    result = ''.join(c if (c.isalnum() or c == ' ') else ' ' for c in words)
    return result.lower().strip().split()
            
def get_stop_words():
    """Return the stop words from 'stopwords.txt', lower-cased and alphanumeric only."""
    return [
        remove_non_anumeric(word.lower())
        for line in read_file_to_list('stopwords.txt')
        for word in line.strip().split()
    ]
def find_bow(words, useFHash=False, M=0):
    """Return the bag of words of the text ``words``.

    The text is tokenised with ``clean_words`` and stop words are removed.
    With ``useFHash`` each remaining word is replaced by ``fhash(word, M)``.
    The result lists ``[key, count]`` pairs in order of first appearance.
    """
    tokens = clean_words(words)
    stop_words = get_stop_words()
    kept = [fhash(token, M) if useFHash else token
            for token in tokens if token not in stop_words]

    keys = []
    tallies = []
    for key in kept:
        if key in keys:
            tallies[keys.index(key)] += 1
        else:
            keys.append(key)
            tallies.append(1)

    return [[key, tally] for key, tally in zip(keys, tallies)]
# ===========================================================
def main():
    """Interactively analyse a text file and print its statistics and BoW.

    Asks for the file name and whether feature hashing is wanted (repeating
    until the answer is ``y`` or ``n`` in either case), asks for ``M`` when
    hashing is enabled, then prints the character, alphanumeric, line and
    word counts followed by the bag of words.
    """
    file_name = input("File name = ").strip()
    use_f_hash = input("Use feature hashing ? (y,Y,n,N) ").lower().strip()
    while use_f_hash not in ['n', 'y']:
        print("Try again")
        use_f_hash = input().lower().strip()

    use_f_hash = use_f_hash == 'y'
    # M is only requested (and only meaningful) when hashing is enabled.
    M = int(input("M = ")) if use_f_hash else 0

    lines = read_file_to_list(file_name)
    no_of_char, no_of_lines, no_of_anumeric, no_of_word = describe_file(lines)

    print("-------------------")
    print("char count = {}".format(no_of_char))
    print("alphanumeric count = {}".format(no_of_anumeric))
    print("line count = {}".format(no_of_lines))
    print("word count = {}".format(no_of_word))

    # Build the bag of words exactly once, hashed or not.
    bow = find_bow(' '.join(lines), use_f_hash, M)
    print("BoW = {}".format(bow))

# ===========================================================

main()
# 6330334021 (30.00) 181 (2021-03-22 17:51)

# Ask for the input file and for the hashing choice until it is valid.
file_name = input('File name = ')
hashing = input('Use feature hashing ? (y,Y,n,N) ')
while hashing not in ('y', 'Y', 'n', 'N'):
    print("Try again.")
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
# M stays 0 unless feature hashing was requested.
M = int(input('M = ')) if hashing in ('y', 'Y') else 0
print('-------------------')

def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo M for the characters of w."""
    total = 0
    for position, char in enumerate(w):
        total += ord(char) * 37 ** position
    return total % M

# Collect the stop words: every maximal run of ASCII letters/digits found in
# stopwords.txt becomes one entry of `stop` (case is left unchanged).
ss = ''      # run of alphanumeric characters currently being accumulated
stop = []
stpw = open('stopwords.txt', 'r')
for line in stpw:
    # Look at every character except the last one, peeking at its successor
    # to detect the end of a run.
    for i in range(len(line) - 1):
        if line[i] in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789':
            ss += line[i]
            if not (line[i + 1] in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'):
                stop.append(ss)
                ss = ''
    # The last character has no successor; it only matters when the line
    # does not end with a newline.
    # NOTE(review): `ss` is not reset after this append, so a further line
    # would continue the same run.
    if line[-1] in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789':
        ss += line[-1]
        stop.append(ss)        
stpw.close()


# Counters and working lists for the analysed file.
char = 0            # characters seen (newlines removed further below)
alp_count = 0       # characters contained in alp_num
line_count = 0
ww = ''             # alphanumeric run currently being accumulated
word = []           # every word found, in file order until sorted
word_count = 0
BoW = []            # [key, count] pairs
fakebow = []        # keys of BoW, kept in the same order for index lookups
alp_num = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
analyse = open(file_name, 'r')


for line in analyse:
    line_count += 1
    for i in line:
        char += 1
        if i in alp_num:
            alp_count += 1
    # Split the line into maximal alphanumeric runs by peeking at the next
    # character of each position.
    for ii in range(len(line) - 1):
        if line[ii] in alp_num:
            ww += line[ii]
            if not (line[ii + 1] in alp_num):
                word.append(ww)
                ww = ''
    # Last character of the line: only alphanumeric when the line has no
    # trailing newline.
    if line[-1] in alp_num:
        ww += line[-1]
        word.append(ww)
# NOTE(review): the sort happens before lower-casing, so the non-hashed BoW
# below follows the case-sensitive order of the original tokens.
word.sort()


# Remove one newline per line except the last.
# NOTE(review): if the file ends with a newline, that newline stays counted.
char -= (line_count - 1)
word_count = len(word)


print('char count = ' + str(char))
print('alphanumeric count = ' + str(alp_count))
print('line count = ' + str(line_count))
print('word count = ' + str(word_count))


# Lower-case every word, then count the ones that are not stop words.
for iii in range(len(word)):
    word[iii] = word[iii].lower()
for iiii in range(len(word)):
    if not (word[iiii] in stop):
        if not (word[iiii] in fakebow):
            BoW.append([word[iiii], 1])
            fakebow.append(word[iiii])
        else:
            BoW[fakebow.index(word[iiii])][1] += 1
            
            
# With feature hashing the word-keyed result is discarded and rebuilt with
# fhash(word, M) as the key, then sorted.
if hashing == 'y' or hashing == 'Y':
    BoW = []
    fakebow = []
    for iiiii in range(len(word)):
        if not (word[iiiii] in stop):
            if not (fhash(word[iiiii], M) in fakebow):
                BoW.append([fhash(word[iiiii], M), 1])
                fakebow.append(fhash(word[iiiii], M))
            else:
                BoW[fakebow.index(fhash(word[iiiii], M))][1] += 1
    BoW.sort()
print('BoW =' ,BoW)    
    

analyse.close()
# 6330335721 (24.90) 182 (2021-03-18 20:53)

#-----------------------------------------------------------------
def count(lis, word):
    """Return how many elements of ``lis`` compare equal to ``word``."""
    return sum(1 for item in lis if item == word)
#-----------------------------------------------------------------
def hasss(w,m):
    """Hash ``w`` as sum(ord(char) * 37**index), reduced modulo ``int(m)``.

    ``m`` may be an int or a numeric string; it is converted with ``int``.
    """
    total = sum(ord(ch) * (37 ** idx) for idx, ch in enumerate(w))
    return total % int(m)
#-----------------------------------------------------------------
# Ask for the file to analyse; the name variable is then rebound to the
# opened file object.
file_name = input("Filename = ")
file_name = open(file_name)
linecount= 0
#-----------------------------------------------------------------
# Load the stop words: every whitespace-separated token of stopwords.txt.
stop =open("stopwords.txt")
stoplis = []
for line in stop:
    x = line.split()
    for i in x:
        stoplis.append(i)

#-----------------------------------------------------------------
# Ask whether to use feature hashing until one of y/Y/n/N is entered.
yes=False       # True once hashing has been requested
while True:
    has = input("Use feature hashing ? (y,Y,n,N) ")
    if has == "y"or has == "Y"or has == "n"or has == "N":
        if has == "y"or has == "Y":
            # m is kept as the raw input string here.
            m = input("M = ")
            yes=True
        break
    else: print("Try again.")
#-----------------------------------------------------------------   
# Working state: `ls` collects every lower-cased word of the file and `char`
# every character except newlines; uni/bow/unih are filled further below.
l = '';ls=[];uni=[];bow=[];unih = [];char =""
for line in file_name:
    linecount += 1
    char += line.replace("\n", "")
    l = ''
    for i in line:
        # Fixed alphabet: the original literal had a second "g" where "j"
        # belongs, so every "j" was dropped from the words.
        if i.lower() in "abcdefghijklmnopqrstuvwxyz0123456789":
            l += i.lower()
        else:
            # Any other character separates words.  The original appended
            # the space to the loop variable instead of `l`, which glued
            # neighbouring words together (e.g. "a,b" became "ab").
            l += " "
    l = l.split()
    ls.extend(l)
ls = sorted(ls)
#-----------------------------------------------------------------
# uni  : every non-stop word occurrence (duplicates kept)
# uni2 : the distinct non-stop words, in sorted order because ls is sorted
uni2=[]
uni2h=[]
for i in ls:
    if i not in stoplis:
        uni.append(i)
    if i not in stoplis and i not in uni2:
        uni2.append(i)
if yes ==True:
    # Hash every occurrence, sort the hashes and keep the distinct values.
    for i in uni:
        unih.append(hasss(i,m))
    unih =sorted(unih)
    for i in unih:
        if i not in uni2h:
            uni2h.append(i)
    

#-----------------------------------------------------------------            
# Without hashing: one [word, occurrences in ls] pair per distinct word.
if yes ==False:
    for i in uni2 :
        bow.append([i,count(ls,i)])

#-----------------------------------------------------------------
# With hashing: one [hash, occurrences in unih] pair per distinct hash.
else:
    for i in uni2h :
        bow.append([i,count(unih,i)])
# Joining all words gives a string whose length is the alphanumeric count.
wordcount = "".join(ls)
#print("word = ",ls)
#print("unique = ",uni)
#print("unique = ",uni2)
#print("uniqueh = ",unih)
print("char count =",len(char))
print("alphanumeric count =",len(wordcount))
print("line count =",linecount)
print("word count =",len(ls))
print("bow =",bow)
#print("stop = ",stoplis)
stop.close()
file_name.close()
# 6330336321 (30.00) 183 (2021-03-21 20:05)
# Characters treated as part of a word by the helpers below: ASCII letters
# (both cases) and decimal digits.
Alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
number = "1234567890"
def fhash(w, m):
    """Return sum(ord(w[i]) * 37**i) modulo ``m``.

    Evaluated with Horner's scheme from the last character backwards, which
    yields exactly the same integer as summing the individual powers.
    """
    acc = 0
    for ch in reversed(w):
        acc = acc * 37 + ord(ch)
    return acc % m

def alp_count(x):
    """Count the characters of ``x`` found in ``Alphabet`` or ``number``."""
    return sum(1 for ch in x if ch in Alphabet or ch in number)

def word_counter(x):
    """Return the number of words in ``x``.

    Every character outside ``Alphabet`` and ``number`` is turned into a
    space; the words are the whitespace-separated pieces that remain.
    """
    spaced = "".join(
        ch if (ch in Alphabet or ch in number) else " " for ch in x
    )
    return len(spaced.split())

def clear_stopword(li, cl):
    """Return the words of ``li`` that are not stop words.

    ``li`` is the text to split (characters outside ``Alphabet``/``number``
    act as separators) and ``cl`` is a whitespace-separated string of stop
    words.  The order and duplicates of the kept words are preserved.
    """
    spaced = "".join(
        ch if (ch in Alphabet or ch in number) else " " for ch in li
    )
    stop_tokens = cl.split()
    return [token for token in spaced.split() if token not in stop_tokens]

def bow_noFH(x):
    """Return ``[item, occurrences]`` pairs for the distinct items of ``x``.

    Pairs are listed in order of each item's first appearance in ``x``.
    """
    distinct = []
    for token in x:
        if token not in distinct:
            distinct.append(token)
    return [[token, x.count(token)] for token in distinct]

def bow_hvFH(x):
    """Merge ``[key, count]`` pairs that share a key.

    ``x`` is sorted in place first; the result has one ``[key, total]`` pair
    per distinct key, in sorted order, where ``total`` sums the counts of
    all pairs with that key.
    """
    x.sort()
    seen_keys = []
    merged = []
    for pair in x:
        key = pair[0]
        if key not in seen_keys:
            seen_keys.append(key)
            merged.append([key, 0])
        # After sorting, equal keys are adjacent, so the newest entry is
        # always the one to update.
        merged[-1][1] += pair[1]
    return merged


# Interactive driver: ask for the file and the hashing choice, print the
# statistics, then print the bag of words.
file = input("File name = ")
check = ["y", "Y", "n", "N"]
i = True        # stays True until a valid choice has been entered
while i == True:
    choice = input("Use feature hashing ? (y,Y,n,N) ")
    if choice not in check:
        print("Try again.")
    elif choice in check:
        i = False
fn = open(file, "r")
if choice == 'Y' or choice == 'y':
    M = int(input("M = "))
print("-------------------")
char_count = 0
line_count = 0
alpha_count = 0
total_str = ""
for line in fn.readlines():
    # strip() removes leading/trailing whitespace, not just the newline.
    char_count += len(line.strip())
    line_count += 1
    # The extra space keeps words of adjacent lines apart.
    total_str += line + " "
total_str = total_str.lower()
# print(total_str)
alpha_count += alp_count(total_str)
print("char count = " + str(char_count))
print("alphanumeric count = " + str(alpha_count))
print("line count = " + str(line_count))
word_count = word_counter(total_str)
print("word count = " + str(word_count))
# Stop words: all lines of stopwords.txt joined into one lower-cased,
# space-separated string.
fn2 = open('stopwords.txt')
stopword = ""
for line in fn2.readlines():
    stopword += line.strip() + " "
stopword = stopword.lower()
message = clear_stopword(total_str, stopword)
#print(message)
bow1 = bow_noFH(message)
bow1.sort()
if choice == "n" or choice == "N":
    print("BoW =", bow1)
else:
    # Replace every word by its hash, then merge pairs with equal hashes.
    bow2 = []
    for i in range(len(bow1)):
        bow2.append([fhash(bow1[i][0], M), bow1[i][1]])
    #print(bow2)
    print("BoW =", bow_hvFH(bow2))
fn.close()
fn2.close()
# 6330337021 (23.05) 184 (2021-03-21 18:00)
# Ask for the file name, then for the hashing choice until it is valid.
File_name = input("File name = ")
key = input('Use feature hashing ? (y,Y,n,N) ')
while key not in ("y", "Y", "n", "N"):
    print("Try again.")
    key = input('Use feature hashing ? (y,Y,n,N) ')
# Scan the file once, collecting the character / alphanumeric / line counts
# and the lower-cased words (`yo`).  `pol` is closed further below.
pol = open(File_name,"r")
C_count,A_count,L_count,W_count = 0,0,0,0
yo = []
for line in pol :
    line = line.strip()
    L_count += 1
    C_count += len(line)
    line_r = ""
    for ch in line :
        # Explicit ASCII ranges: the single range '0'..'z' used before also
        # accepted punctuation such as : ; < = > ? @ [ \ ] ^ _ and `.
        if "0" <= ch <= "9" or "A" <= ch <= "Z" or "a" <= ch <= "z" :
            line_r += ch
            A_count += 1
        else :
            line_r += " "
    yo += line_r.lower().split()
# The word total only needs to be taken once, after the whole file is read.
W_count = len(yo)
# Load the stop words (lower-cased, whitespace-separated) and drop them from
# the word list.
fn = open("stopwords.txt","r")
stop, you = [], []
for line in fn :
    stop += line.lower().split()
for c in yo :
    if c not in stop :
        you.append(c)

# Count occurrences per key with a dict; the key is the hash of the word
# when hashing is enabled and the word itself otherwise.
counts = {}
if key in ["y","Y"] :
    M = int(input("M = "))
    hum = []
    for c in you :
        u = sum(ord(ch) * (37 ** i) for i, ch in enumerate(c))
        hum.append(u % M)
    # Counting only the hashes that occur avoids a pass over range(M),
    # which was prohibitively slow for large M.
    for h in hum :
        counts[h] = counts.get(h, 0) + 1
else :
    for c in you :
        counts[c] = counts.get(c, 0) + 1
bow = [[k, n] for k, n in counts.items()]
pol.close()
fn.close()
print("-------------------")
# print() already separates its arguments with one space, so the labels must
# not end with a space of their own.
print("char count =",C_count)
print("alphanumeric count =",A_count)
print("line count =",L_count)
print("word count =",W_count)
print("BoW =",sorted(bow))
# 6330338621 (30.00) 185 (2021-03-22 08:35)

def word_not_tag(sen):
    """Return ``sen`` with every non-alphanumeric character replaced by a space.

    The result has the same length as ``sen``; characters for which
    ``str.isalnum`` is true are kept unchanged.
    """
    # Built in one pass; the previous version edited a list while iterating
    # over it and searched it again for every replaced character.
    return ''.join(ch if ch.isalnum() else ' ' for ch in sen)
def alp_count(sen):
    """Return the number of alphanumeric characters in ``sen``."""
    return sum(1 for ch in sen if ch.isalnum())
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` reduced modulo ``M``."""
    power = 1      # 37**index of the current character
    wnum = 0
    for ch in list(w):
        wnum += ord(ch) * power
        power *= 37
    return wnum % M
def bow(sen,fh,M):
    """Build the bag of words of ``sen`` without the stop words.

    Parameters:
        sen: the text to analyse.
        fh: the user's hashing choice; ``'n'``/``'N'`` counts the words
            themselves, any other value counts ``fhash(word, M)``.
        M: modulus handed to ``fhash`` (unused when hashing is off).

    Returns:
        A list of ``[key, count]`` pairs in order of first appearance.

    Stop words are read from ``stopwords.txt`` and split the same way as the
    text (non-alphanumeric characters act as separators).
    """
    with open("stopwords.txt", "r") as file:
        stp = file.read()
    stop_words = set(word_not_tag(stp).split())
    use_hash = fh not in ('n', 'N')

    # One counting loop serves both modes; a dict keeps first-seen order.
    counts = {}
    for word in word_not_tag(sen).split():
        if word in stop_words:
            continue
        key = fhash(word, M) if use_hash else word
        counts[key] = counts.get(key, 0) + 1
    return [[key, number] for key, number in counts.items()]


#----------------------------------------------------------------------------------------

# Interactive driver: ask for the file and hashing choice, read the file,
# print the statistics and the bag of words.
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
ansfh = ['y','Y','n','N']
v = True        # True while the answer still has to be validated
while (v):
    if fh not in ansfh:
        print('Try again.')
        fh = input('Use feature hashing ? (y,Y,n,N) ')
    else:
        v = False
# ansfh[:2] are the "yes" answers.
if fh in ansfh[:2]:
    M = input('M = ')
    M = int(M)
else:
    M = None

# Read the whole file into `sen`, counting the lines on the way.
sen = ''
numline = 0
file = open(file_name, "r")
for line in file:
    numline += 1
    newline = line
    sen += newline
file.close()
sen = sen.lower()

print('-------------------')
# Subtract one newline per line except the last.
# NOTE(review): a file ending with a newline keeps that newline in the count.
count_char = len(sen) - (numline - 1)
print('char count =', count_char  )
count_alp = alp_count(sen)
print('alphanumeric count =', count_alp  )
print('line count =', numline  )
deltag = word_not_tag(sen)
count_word = len(deltag.split())
print('word count =', count_word  )
set_bow = bow(sen,fh,M)
print('BoW =', set_bow  )
# 6330339221 (24.50) 186 (2021-03-21 22:39)
# Flat interactive script: reads a text file, prints its statistics and a
# bag of words, optionally keyed by a base-37 hash of each word.
file_name = input('File name = ')
op = open(file_name , 'r')
sw = open('stopwords.txt' , 'r')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh != 'y' and fh != 'Y' and fh != 'n' and fh != 'N' :
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
word = ''        # alphanumeric run currently being read
sentence = ''    # all words found so far, each followed by one space
lc = 0           # line count
cc = 0           # character count
ac = 0           # alphanumeric count
for line in op :
    lc +=1
    # Assumes every line ends with a newline; corrected once after the loop.
    cc += len(line)-1
    for i in line :
        if 'Z'>=i>='A' or 'z'>=i>='a' or '9'>= i >='0' :
            word += i
            ac += 1
        else :
            # A separator flushes the pending word (if any) into sentence.
            sentence += word
            if word != '' :
                sentence+= ' '
            word = ''
# Flush a word still pending because the file ended on an alphanumeric
# character.
# NOTE(review): the slice compares the pending word with the last flushed
# one, so a pending word equal to its predecessor appears to be dropped.
if sentence[-len(word)-1:-1] != word :
    sentence += word+' '
sentence = sentence[:-1]
sentence = sentence.lower()
# Compensates the "-1" applied to the last line, which is assumed to have no
# trailing newline.
cc += 1
# NOTE(review): for a file without any word this yields [''] (one entry).
words = sentence.split(' ')
wc = len(words)
stopwords = []
for line in sw :
    # Drops the last character of the line (the newline, when present).
    line = line[:-1]
    line = line.split(' ')
    stopwords += line
# Remove every occurrence of every stop word.
for i in stopwords :
    while i in words :
        words.remove(i)
# Count the remaining words with two parallel lists.
wordBoW = []
countwords = []
for i in words :
    if i not in wordBoW :
        wordBoW.append(i)
        countwords.append(1)
    else :
        countwords[wordBoW.index(i)] +=1
bow = []
for i in range(len(wordBoW)) :
    bow.append([wordBoW[i],countwords[i]])
if fh == 'n' or fh == 'N' :
    print('-------------------')
    print('char count =',cc)
    print('alphanumeric count =',ac)
    print('line count =',lc)
    print('word count =',wc)
    print('BoW =',sorted(bow))
elif fh == 'y' or fh == 'Y' :
    M = int(input('M = '))
    print('-------------------')
    print('char count =',cc)
    print('alphanumeric count =',ac)
    print('line count =',lc)
    print('word count =',wc)
    Pi = 0      # exponent of G for the current character
    G = 37
    od = 0      # running polynomial sum for the current word
    fhash = []  # hash of every remaining word occurrence
    countfhash = []
    for word in words :
        for i in word :
            od += ord(i)*G**Pi
            Pi +=1
        odmod = od%M
        fhash.append(odmod)
        # Reset the accumulators for the next word.
        Pi = 0
        od = 0
    # Count the hashes with two parallel lists, as done for the words.
    FhashBoW = []
    for i in fhash :
        if i not in FhashBoW :
            FhashBoW.append(i)
            countfhash.append(1)
        else :
            countfhash[FhashBoW.index(i)] +=1
    bow = []
    for i in range(len(FhashBoW)) :
        bow.append([FhashBoW[i],countfhash[i]])
    print('BoW =',sorted(bow))
op.close()
sw.close()

# 6330340821 (30.00) 187 (2021-03-21 02:21)
# Interactive setup: file name, hashing choice and (optionally) M.
file_name=input('File name = ')
op=input('Use feature hashing ? (y,Y,n,N) ')
while op not in ['y','Y','n','N']:
    print('Try again.')
    op=input('Use feature hashing ? (y,Y,n,N) ')
if op in ['y','Y']:
    # M is kept as the raw input string here.
    M=input('M = ')
print('-------------------')
stop=open('stopwords.txt','r')
file=open(file_name,'r')
linecount=0
wordcount=0
xyz=''          # file text with non-alphanumeric characters turned to spaces
words=[]        # lower-cased words
charcount=0
alphacount=0
for line in file:
    linecount+=1
    charcount+=len(line)
    for e in line:
        if e.isalnum():
            xyz+=e
        else:
            xyz+=" "
word=xyz.split()
wordcount+=len(word)
for i in word:
    words.append(i.lower())
# Count the ASCII letters and digits inside the words.
for e in range(len(word)):
    for u in word[e]:
        if u.lower() in'abcdefghijklmnopqrstuvwxyz0123456789':
            alphacount+=1
            
# Subtract one newline per line except the last.
# NOTE(review): a file ending with a newline keeps that newline in the count.
charcount=charcount-linecount+1

print('char count =',charcount)
print('alphanumeric count =',alphacount)
print('line count =',linecount)
print('word count =',wordcount)
       
aa=[]           # words that are not stop words (filled further below)
stopword=[]     # lower-cased, whitespace-separated tokens of stopwords.txt
for line in stop:
    n= line.split()
    for i in n:
        stopword.append(i.lower())

#######################################
def removepunc(x):
    """Return a copy of the word list ``x`` with punctuation removed.

    From every word the characters ``' " \\ ( ) , / . : ; - > < + * =`` are
    deleted; all other characters are kept.  The result has one entry per
    input word, in the same order (entries may become empty strings).
    """
    # Same characters as before, written with valid escapes only: sequences
    # such as "\(" are invalid escapes and trigger a warning at compile time.
    punctuation = '\'"\\(\\),\\/\\.:;-><+-*='
    return [
        ''.join(ch for ch in word if ch not in punctuation)
        for word in x
    ]
#######################################
# Keep the words that are not stop words, then strip punctuation from them.
for i in words:
    if i not in stopword:
        aa.append(i)
ww= removepunc(aa)
#######################################
# Count the words with two parallel lists, then pair them up and sort.
w=[]#word
n=[]#fre
for i in range (len(ww)):
    if ww[i] not in w:
        w.append(ww[i])
        n.append(1)
    else:
        n[w.index(ww[i])]+=1
wn=[]           # [word, frequency] pairs
for i in range (len(w)):
    wn.append([w[i],n[i]])
wn.sort()
#######################################
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``int(M)``.

    ``M`` may be an int or a numeric string; it is converted with ``int``
    after the polynomial has been summed.
    """
    G = 37
    total = sum(ord(ch) * G ** idx for idx, ch in enumerate(w))
    return total % int(M)
#######################################
# Print the bag of words: keyed by hash when hashing was requested,
# otherwise the word-keyed list built above.
if op.lower()=='y':
    ss=[]       # distinct hash values, in order of first appearance
    tt=[]       # occurrence count of the hash at the same index of ss
    for i in range (len(ww)):
        if fhash(ww[i],M) not in ss:
            ss.append(fhash(ww[i],M))
            tt.append(1)
        else:
            tt[ss.index(fhash(ww[i],M))]+=1
    fn=[]       # [hash, count] pairs
    for i in range (len(ss)):
        fn.append([ss[i],tt[i]])
    fn.sort()
    print('BoW =',fn)
else:
    print('BoW =',wn)
#######################################
stop.close()
file.close()
# 6330341421 (23.15) 188 (2021-03-21 17:10)

# Interactive setup.  Prompts are printed with end=' ' so the answer is
# typed on the same line.
print('File name =',end = ' ')
fn = str(input())
print('Use feature hashing ? (y/Y/n/N)',end = ' ')
ynok = str(input())
use = 0 #0again 1yes 2no
while use == 0:
    if ynok == 'y' or ynok == 'Y':
        use = 1
        print('M =',end = ' ')
        M = int(input())
    elif ynok == 'n' or ynok == 'N':
        use = 2
    else:
        use = 0
        print('Try again.')
        print('Use feature hashing ? (y/Y/n/N)',end = ' ')
        ynok = str(input())
print('-------------------')
fo = open(fn,'r') 
# Characters treated as separators/punctuation: " ' / \ , . : ; ( ) [ ] { }
# and the space.
sym = '\"\'/\\,.:;()[]{} '
st = open('stopwords.txt','r')
evst = ''
for line in st:
    evst += line
evst = evst.split()     # stop words: whitespace-separated tokens
lineno = 0
char = 0
evfo = ''               # whole file text, later replaced by its token list
word = 0
alpha = 0
boww = []               # lower-cased non-stop words, duplicates kept
for line in fo:
    evfo += line
    lineno += 1
    for i in line:
        if i != '\n':
            char += 1
        # NOTE(review): counts every character that is neither in `sym` nor
        # a newline, so other punctuation (e.g. '!' or '-') is counted too.
        if i not in sym and i != '\n':
            alpha += 1

evfo = evfo.split()
# Trim the separator characters from both ends of every token.
for i in range(len(evfo)):
    evfo[i] = evfo[i].strip('\"\'/\\,.:;()[]{} ')
for i in range(len(evfo)):
    # NOTE(review): tokens produced by split() are never ' ', so this
    # condition is always true and every token is counted as a word.
    if evfo[i] not in sym or evfo[i] != ' ':
        word += 1
    # `not in sym` is a substring test; it also rejects tokens that became
    # empty after stripping.
    if evfo[i].lower() not in evst and evfo[i] not in sym and evfo[i]  != '\n':
        boww.append(evfo[i].lower())

print('char count =', char)
print('alphanumeric count =', alpha)
print('line count =', lineno)
print('word count =', word)
#bow
G = 37
# One [word, count] pair per distinct word, in order of first appearance.
BOW = []
for i in boww:
    if [i,boww.count(i)] not in BOW:
        BOW.append([i,boww.count(i)])
def fhash(word,M):
    """Return sum(ord(word[i]) * 37**i) modulo ``M``."""
    G = 37
    codes = [ord(ch) for ch in list(word)]
    total = 0
    for exponent, code in enumerate(codes):
        total += code * (G ** exponent)
    return total % M

# Print the bag of words (use: 1 = hashing requested, 2 = no hashing).
if use == 2:
    print('BoW =',BOW)
elif use == 1:
    # Expand every [word, count] pair back into `count` copies of the
    # word's hash ...
    fB = []
    for i in range(len(BOW)):
        k = BOW[i][1]
        c = 0
        while c != k:
            fB.append(fhash(BOW[i][0],M))
            c+=1
    # ... then build one [hash, count] pair per distinct hash and sort.
    fBOW = []
    for i in fB:
        if [i,fB.count(i)] not in fBOW:
            fBOW.append([i,fB.count(i)])
    fBOW.sort()
    print('BoW =',fBOW)
# 6330342021 (21.40) 189 (2021-03-22 02:06)
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` reduced modulo ``M``."""
    G = 37
    terms = [ord(w[i]) * (G ** i) for i in range(len(w))]
    return sum(terms) % M
#-------------------------------------------------------------

# Read the whole input file into `fn`, counting the lines on the way.
with open(input('File name = '),'r') as file_name:
    fn = ''
    line_count = 0
    for line in file_name:
        fn += line
        line_count += 1
#-----------------------------------------------------------
# Stop words are the whitespace-separated tokens of stopwords.txt.
with open('stopwords.txt','r') as stopwords:
    sw = stopwords.read()
#----------------------------------------------------------

hashing = 'm'
while hashing not in ['Y','y','N','n']:
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
    if hashing not in ['Y','y','N','n']:
        print('Try again.')
        continue

    if hashing.lower() == 'y':
        M = int(input('M = '))
    print('-------------------')

    # The statistics are the same with and without hashing, so they are
    # computed once instead of in two duplicated branches.
    cc = fn.replace('\n', '')
    char_count = len(cc)
    print('char count =', char_count)

    ac = ''.join(ch for ch in cc
                 if 'a' <= ch.lower() <= 'z' or '0' <= ch <= '9')
    alphanumeric_count = len(ac)
    print('alphanumeric count =', alphanumeric_count)

    # print() already inserts one space after the label.
    print('line count =', line_count)

    # Everything that is not a letter, digit or space separates words.
    wc = ''.join(
        ch if ('a' <= ch.lower() <= 'z' or '0' <= ch <= '9' or ch == ' ')
        else ' '
        for ch in cc
    )
    # Taken after the text is complete, so it is defined for empty files.
    wc1 = wc.lower().split()
    word_count = len(wc1)
    print('word count =', word_count)

    sw2 = sw.split()
    bo = [w for w in wc1 if w not in sw2]

    # Keys are the hashes when hashing is on, the words themselves otherwise.
    if hashing.lower() == 'y':
        fh = [fhash(w, M) for w in bo]
    else:
        fh = bo
    counts = {}
    for k in fh:
        counts[k] = counts.get(k, 0) + 1
    # Sorted [key, count] pairs; an empty list when no word is left.
    BoW = sorted([k, c] for k, c in counts.items())
    print('BoW =', BoW)
        
    
    
        
        
    
    
    







# 6330343721 (23.10) 190 (2021-03-22 23:34)
# Ask for the file and the hashing choice; the file is opened (UTF-8) before
# the choice is validated.
filename = input('File name = ')
a = input('Use feature hashing ? (y,Y,n,N) ')
f = open(filename,'r',encoding='utf-8')
xxx = ['Y', 'y', 'N', 'n']     # accepted answers
# Characters that may appear in a word: ASCII letters (both cases) and
# digits, one list entry per character.  Built from a string so that no
# entry can be lost to a missing comma (previously 'p' 'q' were silently
# concatenated into the single entry 'pq').
AN = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
          'abcdefghijklmnopqrstuvwxyz'
          '0123456789')
# Repeat the question until one of the accepted answers is given.
while a not in xxx:
    print('Try again.')
    a = input('Use feature hashing ? (y,Y,n,N) ')
if a in ['y', 'Y']:
    # Kept as the raw input string; converted where it is used.
    a2 = input('M = ')
print('-------------------')

# Scan the file: count characters (newlines excluded) and lines, and collect
# the lower-cased words in data1.
data1 = []
charf = 0
numberofline = 0
for r in f:
    numberofline += 1
    charf += sum(1 for ch in r if ch != '\n')
    # Every character outside AN separates words.
    cleaned = ''.join(ch if ch in AN else ' ' for ch in r)
    data1.extend(token.lower() for token in cleaned.split())
# The input file was never closed before; release it once it has been read.
f.close()

print('char count =',charf)
# Words consist of AN characters only, so their total length is the
# alphanumeric count.
alphanumericcount = sum(len(token) for token in data1)
print('alphanumeric count =', alphanumericcount)
print('line count =',numberofline)
print('word count =', len(data1))

# Stop words: whitespace-separated tokens of stopwords.txt.
dataremoval = []
with open('stopwords.txt', 'r', encoding = 'utf-8') as stopword:
    for r in stopword:
        dataremoval.extend(r.split())
datawithoutstopword = [token for token in data1 if token not in dataremoval]
# Distinct remaining words in order of first appearance.  Starting from an
# empty list (instead of indexing element 0) also works when no word is left.
checknumberofeachword = []
for token in datawithoutstopword:
    if token not in checknumberofeachword:
        checknumberofeachword.append(token)
# Sorted [word, occurrences] pairs.
newlist = sorted(
    [token, datawithoutstopword.count(token)]
    for token in checknumberofeachword
)
def Bagofwords(bag, M): #datawithoutstopwordstopwords
    """Return the hash of every entry of ``bag``, in order.

    The hash of an entry is sum(ord(char) * 37**index) over its characters,
    reduced modulo ``M``.
    """
    hashes = []
    for entry in bag:
        text = str(entry)
        total = sum(ord(text[j]) * 37 ** j for j in range(len(entry)))
        hashes.append(int(total % M))
    return hashes
# Print the bag of words: hashed when requested, word-keyed otherwise.
if a in ['y', 'Y']:
    BagofWords = Bagofwords(datawithoutstopword, int(a2))
    # Distinct hashes in order of first appearance; starting from an empty
    # list avoids an IndexError when there are no words at all.
    BoWreduced = []
    for value in BagofWords:
        if value not in BoWreduced:
            BoWreduced.append(value)
    TrueBoW = sorted([value, BagofWords.count(value)] for value in BoWreduced)
    print('BoW =',TrueBoW)
else:
    print('BoW =',newlist)


# 6330345021 (30.00) 191 (2021-03-21 15:46)

# Ask for the file name, then for the hashing choice until it is valid; M is
# requested right after a "yes".
file_name = input("File name = ")
use = input("Use feature hashing ? (y,Y,n,N) ")
while use not in ("y", "Y", "n", "N"):
    print("Try again.")
    use = input("Use feature hashing ? (y,Y,n,N) ")
if use in ("y", "Y"):
    m = int(input("M = "))
#--------------------------------
def fhash(w,M):
    """Return sum(ord(char) * 37**index) over ``w``, reduced modulo ``M``."""
    G = 37
    sum_f = sum(ord(e) * (G ** i) for i, e in enumerate(w))
    return sum_f % M
#--------------------------------
# First pass over the file: character, alphanumeric, line and word counts.
char_count = 0 ; alpha_count = 0 ; line_count = 0 ; word_count = 0
file = open(file_name,"r")
for line in file:
    line_count += 1
    # Drop the trailing newline, if the line has one.
    if line[-1:] == "\n":
        lineout = line[:-1:]
    else:
        lineout = line
    line_ = lineout.lower()
    new_text = ""
    for e in line_:
        if "a" <= e <= "z" or "0" <= e <= "9":
            alpha_count += 1    
        char_count += 1
    # Replace every non-alphanumeric character by a space so that split()
    # yields the words of the line.
    for l in line_:
        if not l in "abcdefghijklmnopqrstuvwxyz0123456789":
             new_text += " "       
        else:
            new_text += l
    list_new_text = new_text.split()
    word_count += len(list_new_text)
file.close()
#--------------------------------
print("-------------------")
print("char count =",char_count)
print("alphanumeric count =",alpha_count)
print("line count =",line_count)
print("word count =",word_count)
#--------------------------------
# Stop words: whitespace-separated tokens of stopwords.txt.
file_stop = open("stopwords.txt","r")
word_stop = []
for line_s in file_stop:
    word_stop += line_s.split()
file_stop.close()
#--------------------------------
# Second pass over the file: collect the lower-cased words that are not
# stop words, sorted so that equal words are adjacent.
file = open(file_name,"r")
BoW = [] ; text_stop = [] ; c = 1      # c: length of the current run
BoW_new_text = ""
for line_BoW in file:
    line_BoW_l = line_BoW.lower()
    for ln in line_BoW_l:
        if not ln in "abcdefghijklmnopqrstuvwxyz0123456789":
            BoW_new_text += " "       
        else:
            BoW_new_text += ln
lise_BoW_new_text = BoW_new_text.split()
for th in lise_BoW_new_text:
    if not th in word_stop:
        text_stop.append(th)
text_stop.sort()
#--------------------------------
if use == "N" or use == "n":   
    # Run-length count over the sorted words: a pair is emitted whenever the
    # next word differs from the current one.
    for i in range(len(text_stop)-1):
        if text_stop[i] == text_stop[i+1]:
            c+=1
        if text_stop[i] != text_stop[i+1]:
            BoW.append([text_stop[i],c])
            c = 1
    # The last run is not emitted by the loop and is appended here; the
    # empty and single-element lists are handled separately.
    if text_stop == []:
        pass
    elif len(text_stop) == 1:
        BoW.append([text_stop[0],c])
    elif text_stop[i+1] == text_stop[-1] :
        BoW.append([text_stop[i+1],c])
else:
    # Same run-length count, applied to the sorted hashes of the words.
    list_fh_word = []
    for fh_word in text_stop:
        list_fh_word.append(fhash(fh_word,m))
    list_fh_word.sort()
    for i in range(len(list_fh_word)-1):
        if list_fh_word[i] == list_fh_word[i+1]:
            c+=1
        if list_fh_word[i] != list_fh_word[i+1]:
            BoW.append([list_fh_word[i],c])
            c = 1
    if list_fh_word == []:
        pass
    elif len(list_fh_word) == 1:
        BoW.append([list_fh_word[0],c])
    elif list_fh_word[i+1] == list_fh_word[-1]:
        BoW.append([list_fh_word[i+1],c])              
#--------------------------------
print("BoW =",BoW)
file.close()

# 6330346621 (26.00) 192 (2021-03-21 23:58)

# Read the whole file, lower-cased, into F; F1 is the same text with every
# character that is not an ASCII letter or digit replaced by a space.
file_name=open(input('File name = '),'r')
F=''
F1=''
for l in file_name:
    F+=l.lower()
for e in F:
    if 'a'<=e<='z' or 'A'<=e<='Z' or'0'<=e<='9':
        F1+=e
    else :F1+=' '
    

# Ask for the hashing choice until it is valid.
a=input('Use feature hashing ? (y,Y,n,N) ')
while a != 'y' and a != 'Y' and a != 'n' and a != 'N':
    print('Try again.')
    # NOTE(review): unlike the first prompt, the retry prints the question
    # on its own line and reads the answer from the next one.
    print('Use feature hashing ? (y,Y,n,N) ')
    a=input()
    
    
if a == 'y' or a == 'Y':M=int(input('M = '))
print('-------------------')    
# Stop words: stopwords.txt with newlines turned into spaces, then split.
S=open('stopwords.txt','r')
S1=''
S2=''
for l in S:
    S1+=l
for e in S1:
    if e == '\n':
        S2+=' '
    else : S2+=e
s=S2.split()    
# F3: sorted words of the file that are not stop words.
F2=F1.split()
F2.sort()
F3=[]
for e in F2:
    if e not in s:
        F3.append(e)
S.close()

# Character count: everything in the file except the line breaks.
count = sum(1 for e in F if e != '\n')
# print() already puts one space after the label, so the labels carry none.
print('char count =', count)

# Alphanumeric count: F1 holds spaces wherever F had any other character.
count1 = sum(1 for e in F1 if e != ' ')
print('alphanumeric count =', count1)

# Line count: one per newline, plus the final line when the file does not
# end with a newline (that line was previously not counted).
count2 = F.count('\n')
if F and not F.endswith('\n'):
    count2 += 1
print('line count =', count2)

count3 = len(F1.split())
print('word count =', count3)


F4=[]       # sorted hashes of the words in F3 (hashing mode only)
out=[]      # resulting [key, count] pairs
o=0
if a == 'y' or a == 'Y':
    
    # Hash every word: sum(ord(char) * 37**index) modulo M.
    for i in range(len(F3)):
        o=0
        for j in range(len(F3[i])):
            o+=ord(F3[i][j])*(37**j)
        F4.append(o%M)
    F4.sort()
    
    # Run-length count over the sorted hashes: j advances to the end of the
    # run starting at i, then i jumps past it.
    i=0
    while (i <= len(F4)-1): 
        c = 1
        ch = F4[i] 
        j = i 
        while (j < len(F4)-1): 
            if (F4[j] == F4[j+1]): 
                c = c+1
                j = j+1
            else:break
        out.append([F4[i],c])
        i = j+1
    print('BoW =',out)

if a == 'n' or a=='N':
    # Same run-length count, applied to the sorted words themselves.
    i=0
    while (i <= len(F3)-1): 
        c = 1
        ch = F3[i] 
        j = i 
        while (j < len(F3)-1): 
            if (F3[j] == F3[j+1]): 
                c = c+1
                j = j+1
            else:break
        out.append([F3[i],c])
        i = j+1
    print('BoW =',out)   


file_name.close()




# 6330347221 (18.90) 193 (2021-03-21 21:52)

# Accepted answers to the feature-hashing question.
Y_N = ['y', 'Y', 'n', 'N']
def fhash(w, M):
    """Return the feature hash of ``w``.

    The hash is the sum of ``ord(char) * 37**position`` over the
    characters of ``w``, reduced modulo ``M``.
    """
    total = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w))
    return total % M
def remove_punctuation(file_name):
    """Return the content of ``file_name`` with listed punctuation blanked.

    Each of the characters ``' " \\ / ( ) [ ] . , ; :`` is replaced by a
    single space; every other character (newlines included) is kept.
    """
    punctuation = '\'\"\\/()[].,;:'
    with open(file_name, 'r') as handle:
        text = handle.read()
    return ''.join(' ' if ch in punctuation else ch for ch in text)
def list_stopwords(file):
    """Return every whitespace-separated word found in ``file``, in order.

    The file is opened in a ``with`` block so the handle is always
    closed, which the previous version never did.
    """
    with open(file, 'r') as stopwords:
        return stopwords.read().split()

file_name = input("File name = ")
# Keep asking until the answer is one of the accepted letters in Y_N.
fharshing = input('Use feature hashing ? (y,Y,n,N) ')
while fharshing not in Y_N:
    print('Try again.')
    fharshing = input('Use feature hashing ? (y,Y,n,N) ')

# M (the hash modulus) is only needed, and only defined, when hashing is on.
if fharshing == 'y' or fharshing == 'Y':
    M = int(input('M = '))

print('-------------------')

# Load the stop-word list from stopwords.txt.
stopwords = list_stopwords('stopwords.txt')

# Character count: length of every line after strip(), so the newline and any
# leading/trailing whitespace of a line are not counted.
file = open(file_name, 'r')
d = 0
for line in file:
    line = line.strip()
    for e in line:
        d += 1
print('char count =', d)
file.close()

# file1 is one string, so this loop visits it character by character; strip()
# turns a whitespace character into '' which then contributes nothing.
# NOTE(review): this counts every non-whitespace character left after
# remove_punctuation, not only letters and digits - confirm that is intended.
file1 = remove_punctuation(file_name)
c = 0
for line in file1:
    line = line.strip()
    for e in line:
        c += 1
print('alphanumeric count =', c)

# Line count: number of lines yielded when iterating the file.
b = 0
file = open(file_name, 'r')
for line in file:
    b += 1
print('line count =', b)
file.close()

# Word count: whitespace-separated tokens of the punctuation-stripped text.
file2 = file1.split()
print('word count =', len(file2))

# Collect the lower-cased words that are not stop words.
all_words = []
for line in file2:
    # str.lower() returns a new string; the result must be kept, otherwise
    # the stop-word test and the later counting stay case-sensitive.
    line = line.lower()
    a = line.split()
    for e in a:
        if e not in stopwords:
            all_words.append(e)

# Build the bag of words as sorted [item, count] pairs.  The items are the
# words themselves, or their feature hashes when hashing was requested.
BoW = []
if fharshing == 'y' or fharshing == 'Y':
    items = [fhash(e, M) for e in all_words]
else:
    items = list(all_words)
items.sort()
# Equal items are adjacent after sorting, so extending the last pair is
# enough.  This also handles a single item, which the neighbour-comparison
# loop used previously never emitted.
for item in items:
    if BoW and BoW[-1][0] == item:
        BoW[-1][1] += 1
    else:
        BoW.append([item, 1])
print('BoW =', BoW)

    

    

    

    



    
    
# 6330348921 (24.00) 194 (2021-03-21 23:30)
#Prog-08: Bag-of-words
# # 6330348921 (24.00) Name Palapol Suetrakoolpanich
# Name of the text file to analyse; read before the helper functions are defined.
file_name = input('File name = ')
def line_count(file):
    """Return the number of lines in ``file``.

    Leading and trailing newlines are stripped first; the result is the
    number of newlines left plus one, so an empty file counts as 1.
    """
    with open(file) as handle:
        text = handle.read()
    return text.strip('\n').count('\n') + 1
def char_count(file):
    """Return the number of characters in ``file``, newlines excluded.

    The text is stripped of leading/trailing newlines; the remaining
    newlines (``line_count(file) - 1`` of them) are subtracted from its
    length.
    """
    with open(file) as handle:
        stripped = handle.read().strip('\n')
    return len(stripped) - line_count(file) + 1
def alpha_count(file):
    """Return how many characters of ``file`` are letters a-z (any case) or digits."""

    def is_alnum(ch):
        lowered = ch.lower()
        return 'a' <= lowered <= 'z' or '0' <= lowered <= '9'

    with open(file) as handle:
        return sum(1 for row in handle for ch in row if is_alnum(ch))
def word_count(file):
    """Return the number of words in ``file``.

    A word is a maximal run of letters a-z (any case) and digits; every
    other character acts as a separator.
    """
    pieces = []
    with open(file) as handle:
        for row in handle:
            for ch in row:
                lowered = ch.lower()
                keep = 'a' <= lowered <= 'z' or '0' <= lowered <= '9'
                pieces.append(ch if keep else ' ')
    return len(''.join(pieces).split())
def bow(file,fhashh,M):
    """Return the sorted bag of words of ``file`` as ``[key, count]`` pairs.

    Words are lower-cased runs of letters/digits; words listed in
    ``stopwords.txt`` (lower-cased) are ignored.  With ``fhashh == False``
    the key is the word itself, with ``fhashh == True`` it is
    ``fhash(word, M)``.
    """
    with open('stopwords.txt') as stop_file:
        stop_words = stop_file.read().lower().split()

    with open(file) as text_file:
        cleaned = ''
        for row in text_file:
            for ch in row:
                lowered = ch.lower()
                if 'a' <= lowered <= 'z' or '0' <= lowered <= '9':
                    cleaned += lowered
                else:
                    cleaned += ' '

    keys = []
    counts = []
    for word in cleaned.split():
        # The key is computed before the stop-word test, as before.
        if fhashh == False:
            key = word
        elif fhashh == True:
            key = fhash(word, M)
        else:
            continue
        if word in stop_words:
            continue
        if key not in keys:
            keys.append(key)
            counts.append(0)
        counts[keys.index(key)] += 1

    return sorted([key, total] for key, total in zip(keys, counts))
def fhash(w,M):
    """Return ``sum(ord(w[i]) * 37**i) % M`` for the string ``w``."""
    total = 0
    # Horner's scheme from the last character down gives the same sum.
    for ch in reversed(w):
        total = total * 37 + ord(ch)
    return total % M
            
            
            

# Ask until the answer is y/Y/n/N.  Previously an invalid answer printed
# "Try again." once and the program ended without asking again.
feature = input('Use feature hashing ? (y,Y,n,N) ')
while feature.lower() not in ('y', 'n'):
    print("Try again.")
    feature = input('Use feature hashing ? (y,Y,n,N) ')

use_hashing = feature.lower() == 'y'
# M is only asked for when hashing; bow() ignores it otherwise.
if use_hashing:
    M = int(input('M = '))
else:
    M = ''
print('-'*19)
print('char count =',char_count(file_name))
print('alphanumeric count =',alpha_count(file_name))
print('line count =',line_count(file_name))
print('word count =',word_count(file_name))
print('BoW =', bow(file_name,use_hashing,M))

# 6330349521 (30.00) 195 (2021-03-22 23:53)
def char_count(file_name):
    """Return the number of characters in ``file_name``, not counting the
    newline that terminates a line."""
    with open(file_name) as handle:
        return sum(len(row) - row.endswith('\n') for row in handle)
    
def alnum_count(file_name):
    """Return how many characters of ``file_name`` are letters a-z (any case) or digits."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    with open(file_name) as handle:
        return sum(1 for row in handle for g in row if g.lower() in allowed)
    
def line_count(file_name):
    """Return the number of lines obtained by iterating over ``file_name``."""
    with open(file_name) as handle:
        return sum(1 for _ in handle)
def word_count(file_name):
    """Return the number of words in ``file_name``.

    Every character that is not a letter a-z (any case) or a digit is
    treated as a separator; words are counted line by line.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    total = 0
    with open(file_name) as handle:
        for row in handle:
            spaced = ''.join(g if g.lower() in allowed else ' ' for g in row)
            total += len(spaced.split())
    return total
def BoW(file_name,stopwords):
    """Return the bag of words of ``file_name`` as sorted ``[word, count]`` pairs.

    Words are lower-cased runs of letters a-z and digits.  Words that
    appear (lower-cased) in the ``stopwords`` file are left out.

    Counting is done per whole word.  The previous version searched for
    each word as a substring of the joined text, so "age" was also
    counted inside "pages".
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    with open(file_name) as f1:
        text = f1.read()
    with open(stopwords) as f2:
        stop_words = set(f2.read().lower().split())

    cleaned = ''.join(g.lower() if g.lower() in allowed else ' ' for g in text)

    counts = {}
    for word in cleaned.split():
        if word not in stop_words:
            counts[word] = counts.get(word, 0) + 1
    return sorted([word, total] for word, total in counts.items())
def feature_harshing(l,M):
    """Fold a ``[word, count]`` bag of words into ``[hash, count]`` pairs.

    Each word is hashed as ``sum(ord(char) * 37**position) % M``; counts
    of words sharing a hash are added.  The result is sorted by hash.

    Totals are kept per integer hash.  The previous version searched
    for ``str(i)`` inside a dot-separated string of hashes, so a hash of
    97 was also counted for 7 and 9.
    """
    totals = {}
    for entry in l:
        word, count = entry[0], entry[1]
        if count <= 0:
            # Such an entry contributed nothing before either.
            continue
        h = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(word)) % M
        totals[h] = totals.get(h, 0) + count
    return [[h, totals[h]] for h in sorted(totals)]
        
def display(file_name,stopwords,x):
    """Print the separator, the four counts and the bag of words.

    ``x`` selects the bag-of-words form: ``'0'`` prints the plain word
    counts, ``'1'`` prints them folded by feature hash using the
    module-level ``M``.  Any other value prints no bag of words.
    """
    print('-------------------')
    report = (
        ('char count =', char_count),
        ('alphanumeric count =', alnum_count),
        ('line count =', line_count),
        ('word count =', word_count),
    )
    for label, counter in report:
        print(label, counter(file_name))
    if x == '0':
        print('BoW =', BoW(file_name, stopwords))
    elif x == '1':
        print('BoW =', feature_harshing(BoW(file_name, stopwords), M))

# Ask for the input file and the hashing choice; the choice is asked again
# (without any message) until it is y, Y, n or N after stripping whitespace.
file_name = input("File name = ")
x = input("Use feature hashing ? (y,Y,n,N) ").strip()
while x not in ['y','Y','n','N']:
    x = input("Use feature hashing ? (y,Y,n,N) ").strip()
# display() receives the mode as the string '1' (hashed) or '0' (plain).
if x == 'y' or x == 'Y':
    # M is a module-level name; display() reads it when hashing.
    M = int(input("M = ").strip())
    display(file_name,'stopwords.txt','1')
else:
    display(file_name,'stopwords.txt','0')
# 6330350021 (20.35) 196 (2021-03-22 21:21)
def feature_hashing():
    """Ask once whether to use feature hashing.

    Returns True for y/Y and False for n/N.  For any other answer it
    prints 'Try again.' and returns None.
    """
    answer = input('Use feature hashing ? (y,Y,n,N) ')
    if answer in ('y', 'Y'):
        return True
    if answer in ('n', 'N'):
        return False
    print('Try again.')
    return None
    
def remove_punc(t):
    """Return ``t`` with every character that is not an ASCII letter or
    digit replaced by a space.

    Upper- and lower-case letters are tested as separate ranges: the
    single range 'A'..'z' used before also let ``[ \\ ] ^ _`` and the
    backtick through as if they were letters.
    """
    out = ''
    for e in t:
        if 'a' <= e <= 'z' or 'A' <= e <= 'Z' or '0' <= e <= '9':
            out += e
        else:
            out += ' '
    return out
def fhash(w,M):
    """Return the base-37 positional hash of ``w`` modulo ``M``."""
    total = 0
    weight = 1  # 37**position, updated as the scan advances
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % M
    
# Load the stop words from stopwords.txt, stored in upper case so that
# later membership tests can compare against word.upper().
stopwords = []
with open('stopwords.txt','r') as fs:
    for line in fs:
        stopwords.extend(entry.upper() for entry in line.split())

file_name = input('File name = ')
# feature_hashing() returns None after an invalid answer, so ask until it
# yields a real True/False.
how = feature_hashing()
while how != True and how != False:
    how = feature_hashing()
if how == True:
    M = int(input('M = '))
print('-'*19)

char_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
word_list = []
BoW = []
# One pass over the file gathers all counts; it is the same for both modes.
with open(file_name, 'r') as fn:
    for line in fn:
        line_count += 1
        # Count the text of the line only, not its newline terminator, so
        # the total is right whether or not the file ends with a newline.
        char_count += len(line.rstrip('\n'))
        for ch in line:
            # Separate ranges: 'A'..'z' would also accept [ \ ] ^ _ and `.
            if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9':
                alphanumeric_count += 1
        count_word = remove_punc(line).split()
        word_count += len(count_word)
        for i in count_word:
            if i.upper() not in stopwords:
                word_list.append(i.lower())

print('char count =', char_count)
print('alphanumeric count =', alphanumeric_count)
print('line count =', line_count)
print('word count =', word_count)

# The bag of words counts either the words or their feature hashes.
if how == True:
    items = [fhash(j, M) for j in word_list]
else:
    items = list(word_list)
items.sort()
# Equal items are adjacent after sorting.  An empty list simply produces an
# empty BoW instead of the IndexError raised before.
for item in items:
    if BoW and BoW[-1][0] == item:
        BoW[-1][1] += 1
    else:
        BoW.append([item, 1])
print('BoW =', BoW)
# 6330351721 (30.00) 197 (2021-03-21 15:53)
#=====================================================
# aa: name of the text file; bb: first answer to the hashing prompt.
aa = input('File name = ')
bb = input('Use feature hashing ? (y,Y,n,N) ')
#-----------------------------------------------------
def fhash(w,M):
    """Return ``sum(37**i * ord(w[i])) % M`` over all positions ``i`` of ``w``."""
    weights = (37 ** i for i in range(len(w)))
    return sum(weight * ord(ch) for weight, ch in zip(weights, w)) % M
def nosum(w):
    """Return the items of ``w`` without duplicates, keeping first-occurrence order."""
    unique = []
    for item in w:
        if item in unique:
            continue
        unique.append(item)
    return unique
def nosumfinal(real):
    """Merge ``[key, count]`` pairs that share a key.

    ``real`` is sorted in place first, so pairs with the same key are
    adjacent; the counts of each run are added into one pair.
    """
    real.sort()
    merged = []
    for pair in real:
        key = pair[0]
        # After sorting, a key that differs from the last one is a new key.
        if not merged or merged[-1][0] != key:
            merged.append([key, 0])
        merged[-1][1] += pair[1]
    return merged
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Both files stay open until the close() calls at the end of the script.
fn = open(aa, 'r')
fn2 = open('stopwords.txt', 'r')
# x: every line stripped and lower-cased, each followed by one space.
# c: number of lines read.
x = ''
c = 0
for line in fn:
    line = line.strip()
    line = line.lower()
    x += line+ ' '
    c += 1
# y: x with every character outside a-z / A-Z / 0-9 replaced by a space.
y = ''
for i in range(len(x)):
    if 'a'<=x[i]<='z' or 'A'<= x[i]<='Z' or '0'<=x[i]<='9' :
        y += x[i]
    else:
        y += ' '
        
# z: list of all words in the text.
z = y.split()
#==============================================================
# yfinal: all words glued together; its length is the alphanumeric count.
yfinal = ''
for i in range(len(z)):
    yfinal += z[i]
# z2: list of stop words, lower-cased.
x2 = ''
for line in fn2:
    line = line.strip()
    line = line.lower()
    x2 += line+ ' '
z2 = x2.split()

# real: the words of the text that are not stop words.
real = []
for i in range(len(z)):
    if z[i] not in z2:
        real.append(z[i])

# xx: one [word, count] pair per occurrence, then de-duplicated by nosum().
xx = []
for i in range(len(real)):
    xx.append([real[i],real.count(real[i])])

xx = nosum(xx)
#=============================================================

# Ask again until the answer is one of Y, y, N, n.
while bb not in ['Y','y','N','n']:
    print('Try again.')
    bb = input('Use feature hashing ? (y,Y,n,N) ')
if bb in ['y','Y']:
    M = int(input('M = '))
print('-------------------')
# len(x) includes one added space per line, hence the subtraction of c.
# Lines were strip()ped, so leading/trailing whitespace is not counted.
print('char count =',len(x)-c)
print('alphanumeric count =',len(yfinal))
print('line count =', c)
print('word count =',len(z))
if bb in ['y','Y']:
    # ans is the same list object as xx: the words are replaced by their
    # hashes in place, then pairs with equal hashes are merged.
    ans = xx
    for i in range(len(ans)):
        ans[i][0] = fhash(ans[i][0], M)
    ans.sort()
    print('BoW =',nosumfinal(ans))
else:
    xx.sort()
    print('BoW =', xx)
#-------------------------------------------------
    
fn.close()
fn2.close()
# 6330352321 (24.00) 198 (2021-03-22 03:58)

def fhash(w,m):
    """Return the feature hash of ``w``: ``sum(ord(w[i]) * 37**i) % m``."""
    codes = map(ord, w)
    return sum(code * 37 ** pos for pos, code in enumerate(codes)) % m
def char_count(filename):
    """Return the number of characters in ``filename``, newlines excluded.

    Each line is measured without its newline terminator.  The previous
    formula (total length minus ``lines - 1``) returned 1 for an empty
    file and counted one character too many when the last line ended
    with a newline.
    """
    with open(filename) as f:
        return sum(len(line.rstrip("\n")) for line in f)
def alp_count(filename):
    """Return how many characters of ``filename`` are letters a-z (any case) or digits 0-9."""
    total = 0
    with open(filename) as f:
        for line in f:
            for ch in line:
                if ("a" <= ch.lower() <= "z") or ("0" <= ch <= "9"):
                    total += 1
    return total
def line_c(filename):
    """Return the number of lines obtained by iterating over ``filename``."""
    with open(filename) as f:
        return len(f.readlines())
def word_count(filename):
    """Return the number of words in ``filename``.

    Characters that are neither letters a-z (any case) nor digits are
    turned into spaces before the text is split.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    digits = "1234567890"
    with open(filename) as f:
        raw = f.read()
    spaced = "".join(
        e if (e.lower() in letters or e in digits) else " " for e in raw
    )
    return len(spaced.lower().split())
def bow_n(filename,stopwords):
    """Return the bag of words of ``filename`` as sorted ``[word, count]`` pairs.

    Words are lower-cased runs of letters a-z and digits; words listed
    (lower-cased) in the ``stopwords`` file are dropped.  An empty list
    is returned when no word remains, where the previous version raised
    IndexError.
    """
    alp = "abcdefghijklmnopqrstuvwxyz"
    num = "1234567890"
    with open(filename) as f1:
        text = f1.read()
    with open(stopwords) as f2:
        stop_words = set(f2.read().lower().split())

    cleaned = "".join(
        e if (e.lower() in alp or e in num) else " " for e in text
    )
    words = sorted(w for w in cleaned.lower().split() if w not in stop_words)

    bow = []
    # Equal words are adjacent after sorting, so extend the last pair.
    for w in words:
        if bow and bow[-1][0] == w:
            bow[-1][1] += 1
        else:
            bow.append([w, 1])
    return bow
def bow_y(filename,stopwords,m):
    """Return the hashed bag of words of ``filename``.

    Every word of ``bow_n(filename, stopwords)`` is replaced by
    ``fhash(word, m)``; counts of words that share a hash are added and
    the ``[hash, count]`` pairs are returned sorted by hash.  An empty
    bag yields an empty list instead of raising IndexError.
    """
    hashed = sorted(
        [fhash(pair[0], m), pair[1]] for pair in bow_n(filename, stopwords)
    )
    bowy = []
    for h, count in hashed:
        if bowy and bowy[-1][0] == h:
            bowy[-1][1] += count
        else:
            bowy.append([h, count])
    return bowy
    
#---------------------------------------------------------------------

# Ask for the file name and the hashing choice; the choice is asked again
# until it is one of the entries of yesno.
yesno = ["y","Y","n","N"]
file_name = input("File name = ")
ufh = input("Use feature hashing ? (y,Y,n,N) ")
while ufh not in yesno:
    print("Try again.")
    ufh = input("Use feature hashing ? (y,Y,n,N) ")
# NOTE(review): both branches read "stopword.txt" (singular) - confirm this
# is the intended stop-word file name.
if ufh== "n" or ufh == "N":
    # Plain mode: counts followed by the word-level bag of words.
    print("-"*19)
    print("char count = " + str(char_count(file_name)))
    print("alphanumeric count = " + str(alp_count(file_name)))
    print("line count = " + str(line_c(file_name)))
    print("word count = " + str(word_count(file_name)))
    print("BoW = " + str(bow_n(file_name,"stopword.txt")))
elif ufh == "y" or ufh == "Y":
    # Hashed mode: M is read first, then the same counts and the hashed bag.
    M = int(input("M = "))
    print("-"*19)
    print("char count = " + str(char_count(file_name)))
    print("alphanumeric count = " + str(alp_count(file_name)))
    print("line count = " + str(line_c(file_name)))
    print("word count = " + str(word_count(file_name)))
    print("BoW = " + str(bow_y(file_name,"stopword.txt",M)))
        
# 6330353021 (29.80) 199 (2021-03-21 22:17)
# Characters treated as alphanumeric: A-Z, a-z and 0-9.
an = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
def fhash(w,M) :
    """Return ``sum(ord(w[i]) * 37**i) % int(M)``; ``M`` is converted with ``int``."""
    total = 0
    for position, char in enumerate(w) :
        total = total + ord(char) * 37 ** position
    modulus = int(M)
    return total % modulus
# Read the file name and the first answer to the hashing prompt; the answer
# is only validated further down, after both files have been processed.
f = input('File name = ')
h = input("Use feature hashing ? (y,Y,n,N) ")
sl = '-'*19
# NOTE(review): neither o nor o2 is closed anywhere in this script.
o = open(f,'r')
o2 = open('stopwords.txt','r')
# low2: lower-cased stop words; non-alphanumeric characters act as separators.
sow2 = ''
for line in o2 :
    for i in line :
        if i in an :
            sow2 += i
        else :
            sow2 += ' '
sow2 = sow2.lower()
low2 = sow2.split(' ')
while '' in low2 :
    low2.remove('')

# chc: character count, anc: alphanumeric count, wc: word count, lc: line count.
chc = 0
anc = 0
wc = 0
lc = 0
sow= ''
for line in o :
    # One is subtracted per line and one is added back after the loop.
    # NOTE(review): this presumably assumes every line except the last ends
    # with a newline - confirm for files that end with a newline.
    chc += len(line)-1
    for i in line :
        if i in an :
            anc += 1
            sow += i
        else :
            sow += ' '
    lc += 1
sow = sow.lower()
low = sow.split(' ')
for i in low :
    if i != '' :
        wc += 1
while '' in low :
    low.remove('')
chc += 1
# Ask again until the answer is n, N, y or Y.
while h not in ['n','N','y','Y'] :
    print('Try again')
    h = input("Use feature hashing ? (y,Y,n,N) ")
if h == 'n' or h == 'N':
    print(sl)
    print('char count = '+str(chc))
    print('alphanumeric count = '+str(anc))
    print('line count = '+str(lc))
    print('word count = '+str(wc))
    # fw is another name for low and is not read afterwards.
    fw = low
    # Blank out the stop words, then drop the blanks.
    for i in range(len(low)) :
        if low[i] in low2 :
            low[i] = ''
    while '' in low :
        low.remove('')
    # For each position, count the equal words from that position onwards;
    # the first occurrence of a word therefore carries its full count.
    bow = []
    for i in range(len(low)) :
        n = 1
        for j in range(i+1,len(low),1) :
            if low[i] == low[j] :
                n += 1
        bow.append([low[i],n])
    # Later entries with an already-seen word are marked ['',0] and removed.
    for i in range (len(bow)) :
        for j in range (i+1,len(bow),1) :
            if (bow[i])[0] == (bow[j])[0] :
                bow[j] = ['',0]
    while ['',0] in bow :
        bow.remove(['',0])
    # NOTE(review): this branch prints the pairs in first-occurrence order,
    # while the hashed branch sorts them - confirm the intended order.
    print('BoW = '+str(bow))
elif h== 'y' or h == 'Y':
    m = int(input('M = '))
    print(sl)
    print('char count = '+str(chc))
    print('alphanumeric count = '+str(anc))
    print('line count = '+str(lc))
    print('word count = '+str(wc))
    z = []
    # Blank out the stop words, then drop the blanks.
    for i in range (len(low)) :
        if low[i] in low2 :
            low[i] = ''
    while '' in low :
        low.remove('')
    # z: the feature hash of every remaining word.
    for i in range (len(low)) :
        nn = 0
        nn += fhash(low[i],m)
        z.append(nn)
    # Same counting scheme as above, applied to the hashes; duplicates are
    # marked [0,0] (a real entry always has a count of at least 1).
    bow9 = []
    for i in range(len(z)) :
        k = 1
        for j in range (i+1,len(z),1) :
            if z[i] == z[j] :
                k += 1
        bow9.append([z[i],k])
    for i in range (len(bow9)) :
        for j in range (i+1,len(bow9),1) :
            if (bow9[i])[0] == (bow9[j])[0] :
                bow9[j] = [0,0]
    while [0,0] in bow9 :
        bow9.remove([0,0])
    bow9 = sorted(bow9)
    print("BoW = "+str(bow9))

# 6330354621 (26.67) 200 (2021-03-22 23:28)

# Name of the text file to analyse; read before the helpers are defined.
file_name=input('File name = ')
#------------------------------------------------------------------------------
def stopwordtolist():
    """Return the entries of ``stopword.txt`` as a list.

    Lines consisting of a newline only are skipped; every other line
    loses its surrounding newlines and is split on single spaces, so
    consecutive spaces produce empty-string entries.
    """
    entries = []
    with open('stopword.txt') as handle:
        for row in handle:
            if row == "\n":
                continue
            entries.extend(row.strip('\n').split(' '))
    return entries
#------------------------------------------------------------------------------
def alphanum(word):
    """Return ``word`` with every character other than a lower-case letter
    a-z or a digit replaced by a space (spaces stay spaces)."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return ''.join(
        ch if (ch != ' ' and ch in allowed) else ' ' for ch in word
    )
#------------------------------------------------------------------------------
def linecount(k):
    """Return the number of lines in file ``k``.

    Leading and trailing newlines are stripped before the text is split
    on newlines, so an empty file counts as one line.
    """
    with open(k) as handle:
        trimmed = handle.read().strip('\n')
    return len(trimmed.split('\n'))
#------------------------------------------------------------------------------
def texttosent(file):
    """Return the file's text as one lower-cased string.

    Lines that consist of a newline only are skipped; every other line
    is lower-cased, stripped of its newlines and followed by one space.
    """
    pieces = []
    with open(file) as handle:
        for row in handle:
            if row == "\n":
                continue
            pieces.append(row.lower().strip('\n') + ' ')
    return ''.join(pieces)
#------------------------------------------------------------------------------      
def charcount(file):
    """Return the total length of all lines of ``file`` after each line is
    lower-cased and stripped of leading/trailing newlines."""
    total = 0
    with open(file) as handle:
        for row in handle:
            # Every line is counted: a stripped line can never equal "\n",
            # so the comparison the original made here always passed.
            total += len(row.lower().strip('\n'))
    return total
#------------------------------------------------------------------------------
def allChar(l1):
    """Remove all whitespace from ``l1`` and pass the result through ``alphanum``."""
    compact = ''.join(l1.split())
    return alphanum(compact)
#------------------------------------------------------------------------------
def BoW(word):
    """Return ``[token, count]`` pairs for the whitespace-separated tokens
    of ``word``, in order of first occurrence."""
    tokens = word.split()
    pairs = []
    for token in tokens:
        pair = [token, tokens.count(token)]
        if pair not in pairs:
            pairs.append(pair)
    return pairs
    
#feature hashing---------------------------------------------------------------
def BoWfe(w,m):
    """Return ``[hash, count]`` pairs for the tokens of ``w``.

    Each whitespace-separated token is hashed with ``fe(token, m)``;
    the pairs appear in order of first occurrence of each hash.
    """
    hashes = [fe(token, m) for token in w.split()]
    pairs = []
    for value in hashes:
        pair = [value, hashes.count(value)]
        if pair not in pairs:
            pairs.append(pair)
    return pairs
    
#------------------------------------------------------------------------------
def fe(w,m):
    """Return the feature hash of the string ``w``:
    ``sum(ord(w[k]) * 37**k) % m``."""
    # Searching for the character at position k, starting at k, always
    # finds position k, so the exponent is simply the index.
    total = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w))
    return total % m
    
#------------------------------------------------------------------------------

# chose ends up 1 when hashing is requested and 0 otherwise.
chose=0
choice=input('use feature hashing ? (y,Y,n,N) ')
# The loop condition is always true (choice cannot equal both 'n' and 'N');
# the loop is left through the break statements once the answer is valid.
while choice!='n' or choice!='N':
    if choice=='y' or choice=='Y':
        chose=1
        break
    if choice=='n' or choice=='N':
        chose=0
        break
    else:
        print('Try again.')
        choice=input('use feature hashing ? (y,Y,n,N) ')
if chose == 1:
    # Hashed mode: M is read as text and converted when BoWfe is called.
    m=input('M = ')
    print('-------------------')
    # a: lower-cased text on one line; alpha: a with non-alphanumerics blanked;
    # alpha2: alpha without whitespace; cut_word: alpha's words minus stop words.
    a=texttosent(file_name)
    n=linecount(file_name)
    b=stopwordtolist()
    alpha=alphanum(a)
    alpha2=''.join(alpha.split())
    cut_word =' '.join([i for i in alpha.split() if i not in b])
    print('char count =', charcount(file_name))
    print('alphanumeric count =',len(alpha2))
    print('line count =', n)
    print('word count =', len(alpha.split()))
    print('BoW =',BoWfe(cut_word,int(m)))
elif chose == 0:
    # Plain mode: same preparation, word-level bag of words.
    print('-------------------')
    a=texttosent(file_name)
    n=linecount(file_name)
    b=stopwordtolist()
    alpha=alphanum(a)
    alpha2=''.join(alpha.split())
    cut_word =' '.join([i for i in alpha.split() if i not in b]) # words left after removing the stop words, used for the bag of words
    print('char count =', charcount(file_name))
    print('alphanumeric count =',len(alpha2))
    print('line count =', n)
    print('word count =', len(alpha.split()))
    print('BoW =',BoW(cut_word))
# 6330355221 (29.00) 201 (2021-03-22 23:03)

#---------------------------------------
def fhash(w,M):
    """Return the base-37 positional hash of ``w`` reduced modulo ``M``."""
    terms = [ord(letter) * (37 ** place) for place, letter in enumerate(w)]
    return sum(terms) % M

#---------------------------------------

# vala / valb: the word currently being assembled while scanning the input
# text / the stop-word text.
vala = ''
valb = ''
# vocab_one: words of the input text; vocab_two: stop words.
vocab_one = []
vocab_two = []
# linecount starts at 1 and is increased for every newline in the text.
linecount = 1
# sarawordcount: alphanumeric characters; sicticcount: non-newline characters.
sarawordcount = 0
sicticcount = 0

#---------------------------------------

list_Fileimport = input('File name = ')
thename_char = input('Use feature hashing ? (y,Y,n,N) ')

#---------------------------------------

# Compare against whole answers.  A substring test on 'yYnN' would also
# accept an empty answer or answers such as 'yY'.
while thename_char not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    thename_char = input('Use feature hashing ? (y,Y,n,N) ')
# case_one is the hash modulus M; it only exists when hashing is on.
if thename_char in ('y', 'Y'):
    case_one = int(input("M = "))
print('-------------------')

# Read and lower-case the stop-word file and the input file.
# NOTE(review): neither file object is closed in this script.
linefilea = open('stopwords.txt')
linefileaa = linefilea.read()
open_filetwo = linefileaa.lower()

linefile = open(list_Fileimport)
linefiles = linefile.read()
linefiless = linefiles.lower()
# Leading and trailing newlines are removed before counting.
open_file = linefiless.strip('\n')

#---------------------------------------

# One pass over the text: count non-newline characters, lines and
# alphanumeric characters, and collect words (runs of a-z / 0-9).
for i_e in open_file:
    if i_e != '\n':
        sicticcount += 1   
    else:
        linecount += 1
    if 'a'<= i_e <='z' or '0'<= i_e <='9':
        sarawordcount += 1
        vala += i_e
    elif vala != '':
        # A non-alphanumeric character ends the current word.
        vocab_one.append(vala)
        vala = ''
print('char count =', sicticcount)
print('alphanumeric count =', sarawordcount)
print('line count =', linecount)

#---------------------------------------

# Flush the last word if the text ended in the middle of one.
if vala != '':
    vocab_one.append(vala)
countvocab = len(vocab_one)
print('word count =',countvocab)
# Collect the stop words the same way.
for i_j in open_filetwo :
    if '0'<= i_j <='9' or 'a'<= i_j <='z':
        valb+=i_j
    elif valb != '':
        vocab_two.append(valb)
        valb = ''
if valb!='':
    vocab_two.append(valb)
# Remove every occurrence of every stop word from the text's word list.
for i_o in vocab_two:
    for i in range(vocab_one.count(i_o)):
        vocab_one.remove(i_o)
# With hashing on, each remaining word is replaced by its hash in place.
if thename_char in "yY":
    for i in range(len(vocab_one)):
        vocab_one[i] = fhash(vocab_one[i],case_one)
        
#---------------------------------------
        
# vala is reused here as the list of items already emitted.
vala = []
finalBoW =[]
for i_k in vocab_one:
    if i_k not in vala:
        finalBoW.append([i_k,vocab_one.count(i_k)])
        vala.append(i_k)
        
#---------------------------------------
        
finalBoW.sort()
print('BoW =',finalBoW)
# 6330356921 (22.80) 202 (2021-03-21 22:58)
#---------------------------------------------------------------------
def fhash(w,M) :
    """Return ``sum(ord(w[i]) * 37**i)`` reduced modulo ``int(M)``."""
    total = 0
    for exponent, char in enumerate(w) :
        total += ord(char) * (37 ** exponent)
    return total % int(M)
#---------------------------------------------------------------------
def cutpunc(N) :
    """Return ``N`` with the characters ``" ' / \\ , . : ;`` removed and
    every newline replaced by a space."""
    dropped = "\"\'/\\,.:;"
    pieces = []
    for c in N :
        if c in dropped :
            continue
        pieces.append(" " if c in "\n" else c)
    return "".join(pieces)
#---------------------------------------------------------------------
def cutword(N) :
    """Return the words of ``N`` that are not stop words.

    ``N`` is passed through ``cutpunc``, lower-cased and split into
    words.  Words found in ``stopwords.txt`` (also passed through
    ``cutpunc``) are dropped.  The kept words are returned in one
    string, each followed by a single space.

    The stop-word file is read inside a ``with`` block so the handle
    is closed; the previous version left it open.
    """
    words = cutpunc(N).lower().split()
    with open("stopwords.txt", "r") as a :
        stop_words = cutpunc(a.read()).split()
    return "".join(c + " " for c in words if c not in stop_words)
#---------------------------------------------------------------------
def BOW1(N) :
    """Return sorted ``[word, count]`` pairs for the non-stop words of ``N``
    (as produced by ``cutword``)."""
    words = sorted(cutword(N).split())
    pairs = []
    # Equal words are adjacent after sorting, so extend the last pair.
    for word in words :
        if pairs and pairs[-1][0] == word :
            pairs[-1][1] += 1
        else :
            pairs.append([word, 1])
    return pairs
    
#---------------------------------------------------------------------
def BOW2(N) :
    """Return sorted ``[hash, count]`` pairs for the non-stop words of ``N``.

    Each word from ``cutword(N)`` is hashed with ``fhash(word, M)``,
    where ``M`` is read from module scope.
    """
    hashes = sorted(fhash(word, M) for word in sorted(cutword(N).split()))
    pairs = []
    # Equal hashes are adjacent after sorting, so extend the last pair.
    for value in hashes :
        if pairs and pairs[-1][0] == value :
            pairs[-1][1] += 1
        else :
            pairs.append([value, 1])
    return pairs
   
    
#---------------------------------------------------------------------
# Read the whole file once, gathering the raw text and the basic counts.
x = ""
character_count = 0
line_count = 0
d = input("File name = ")
with open(d, "r") as a :
    for lines in a :
        x += lines
        # Count the text of the line only, not its newline terminator, so
        # the total is right whether or not the file ends with a newline.
        character_count += len(lines.rstrip("\n"))
        line_count += 1
# (The file content is no longer echoed here; that was leftover debug output.)
z = cutpunc(x)
word_count = len(z.split())
# NOTE(review): this counts every non-space character left by cutpunc, which
# only removes " ' / \ , . : ; - other punctuation is still counted. Confirm.
alphanumeric_count = len(z) - z.count(" ")
#---------------------------------------------------------------------

# Ask until the answer is valid, then print the report exactly once.
b = input("Use feature hashing ? (y,Y,n,N) ")
while b!="n" and b!="N"and b!="y" and b!="Y" :
    print("Try again.")
    b = input("Use feature hashing ? (y,Y,n,N) ")
if b=="y" or b=="Y" :
    # BOW2 reads M from module scope.
    M = int(input("M = "))
print("-------------------")
print("char count =",character_count)
print("alphanumeric count =",alphanumeric_count)
print("line count =",line_count)
print("word count =",word_count)
if b=="y" or b=="Y" :
    print("BoW =",BOW2(x))
else :
    print("BoW =",BOW1(x))

    

    




# 6330357521 (30.00) 203 (2021-03-22 01:23)
 
def stopwords(file_dir):
    """Return every whitespace-separated word of the file at ``file_dir``, in order."""
    with open(file_dir, 'r') as file:
        return [word for line in file for word in line.split()]
 
def do_hash():
    """Ask whether to use feature hashing until the answer is valid.

    Returns True for y/Y and False for n/N; any other answer prints
    "Try again." and asks again.
    """
    answer = input("Use feature hashing ? (y,Y,n,N) ")
    while answer not in ('y', 'Y', 'n', 'N'):
        print("Try again.")
        answer = input("Use feature hashing ? (y,Y,n,N) ")
    return answer in ('y', 'Y')
 
def hasher(w, m):
    """Return ``sum(ord(w[i]) * 37**i) % m`` for the string ``w``."""
    total = 0
    # Horner's scheme from the last character down gives the same sum.
    for c in reversed(list(w)):
        total = total * 37 + ord(c)
    return total % m
 
def count_bow(bow):
    """Return ``[item, count]`` pairs for ``bow`` in order of first occurrence."""
    seen = []
    tallies = []
    for item in bow:
        try:
            position = seen.index(item)
        except ValueError:
            # First time this item shows up: start its tally at one.
            seen.append(item)
            tallies.append([item, 1])
        else:
            tallies[position][1] += 1
    return tallies
 
def print_results(num_chars, num_alpha_numeric, num_lines, num_words, bow_count):
    """Print the four counts and the bag of words, one labelled line each."""
    report = (
        ("char count =", num_chars),
        ("alphanumeric count =", num_alpha_numeric),
        ("line count =", num_lines),
        ("word count =", num_words),
        ("BoW =", bow_count),
    )
    for label, value in report:
        print(label, value)
 
def main():
    """Run the program: read the file, ask about hashing, print the report.

    Stop words come from ``stopwords.txt``.  Words are lower-cased runs
    of characters for which ``str.isalnum`` is true.
    """
    stop_words = stopwords("stopwords.txt")
    num_lines = num_words = num_chars = num_alpha_numeric = 0
    converted_words = []

    filename = input("File name = ")
    with open(filename, 'r') as file:
        for raw in file:
            text = raw.strip('\n')
            num_lines += 1
            num_chars += len(text)
            num_alpha_numeric += sum(1 for char in text if char.isalnum())
            # Non-alphanumeric characters become separators.
            cleaned = "".join(
                char.lower() if char.isalnum() else " " for char in text
            )
            words = cleaned.split()
            converted_words.extend(words)
            num_words += len(words)

    bag_of_words = [w for w in converted_words if w not in stop_words]

    if do_hash():
        m = int(input("M = "))
        bag_of_words = [hasher(w, m) for w in bag_of_words]
    bow_count = count_bow(bag_of_words)
    bow_count.sort(key=lambda pair: pair[0])

    print_results(num_chars, num_alpha_numeric, num_lines, num_words, bow_count)
 
# Run the program as soon as this code is executed.
main()
# 6330358121 (30.00) 204 (2021-03-22 23:46)
def condit():
    """Ask whether feature hashing should be used.

    Returns the integer M typed by the user after a 'y'/'Y' answer, or the
    sentinel 'x' after an 'n'/'N' answer.  Any other answer prints
    'Try again.' and repeats the question.
    """
    while True:
        answer=input('Use feature hashing ? (y,Y,n,N) ').lower()
        if answer=='y':
            return int(input('M = '))
        if answer=='n':
            return 'x'
        print('Try again.')
def fhash(w,M):
    """Return sum(ord(w[j]) * 37**j) over all positions j, modulo M."""
    total=sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))
    return total%M
def check_char(t):
    """Return True when t lies in 'a'..'z' or in '0'..'9'."""
    is_lower='a'<=t<='z'
    return is_lower or '0'<=t<='9'
def show_BoW(g):
    """Return the sorted list of [item, count] pairs for the items of g.

    g is sorted in place (as before); an empty list yields [].
    """
    g.sort()
    h=[]
    # Equal items are adjacent after sorting, so one pass counts every run.
    for item in g:
        if h and h[-1][0]==item:
            h[-1][1]+=1
        else:
            h.append([item,1])
    return h
def data_line(l):
   """Analyse one line of text.

   Returns [length of the stripped line, number of characters accepted by
   check_char, list of words obtained after turning every other character
   into a space].
   """
   alph_line=sum(int(check_char(e)) for e in l)
   cleaned=''.join(e if check_char(e) else ' ' for e in l)
   return [len(l.strip()),alph_line,cleaned.split()]
#-------------------------------------------------------------
# Ask for the input file first, then load the stop words, then ask about hashing.
fn=open(input('File name = '),'r')
with open('stopwords.txt','r') as stop_file:
    # Keep whole words: testing against the raw file text would also reject
    # any word that merely occurs inside a stop word (e.g. 'ou' in 'you').
    stop=set(stop_file.read().split())
m=condit();print('-------------------')
char=0;alph=0;words=0;line=0;sonjai=[]
# Iterate over every line so that a blank first line does not end the scan.
for l in fn:
    p,q,h=data_line(l.lower())
    line+=1;char+=p;alph+=q;words+=len(h)
    for e in h:
        if e not in stop:sonjai.append(e)
fn.close()
#------------------------------------------------------------
print('char count =',char);print('alphanumeric count =',alph)
print('line count =',line);print('word count =',words)
# 'x' is the value condit() returns when hashing was declined.
if m!='x':
    for i in range(len(sonjai)):sonjai[i]=fhash(sonjai[i],m)
print('BoW =',show_BoW(sonjai))
#------------------------------------------------------------
    
# 6330360321 (30.00) 205 (2021-03-22 19:57)

# Characters that count as part of a word: lowercase ASCII letters and digits.
eng_word = list('abcdefghijklmnopqrstuvwxyz')
num = list('0123456789')
def cut_noodle(n1):
    """Lowercase n1 and replace every character outside eng_word/num by a space."""
    pieces = []
    for ch in n1.lower():
        keep = ch in eng_word or ch in num
        pieces.append(ch if keep else ' ')
    return ''.join(pieces)
def stop_words():
    """Read 'stopwords.txt' and return its cleaned lines joined by single spaces."""
    with open('stopwords.txt', 'r') as handle:
        raw_lines = handle.readlines()
    cleaned = []
    for entry in raw_lines:
        # Drop the last character only when the line contains a newline.
        text = entry[0:-1] if '\n' in entry else entry
        cleaned.append(cut_noodle(text))
    return ' '.join(cleaned)
def cut_stopword(word):
    """Return the items of ``word`` that are not in the module-level stop_word."""
    return [item for item in word if item not in stop_word]
def cout_line(file_name):
    """Return the number of lines in the file ``file_name``."""
    with open(file_name, 'r') as handle:
        return sum(1 for _ in handle)
def cout_charater(file_name):
    """Return the number of characters in the file, one per line not counted
    for each line that contains a newline."""
    with open(file_name, 'r') as handle:
        rows = handle.readlines()
    total = 0
    for row in rows:
        total += (len(row) - 1) if '\n' in row else len(row)
    return total
def fhash(word,M):
    """Return sum(ord(word[n]) * 37**n) over all positions n, modulo M."""
    total = 0
    for position, letter in enumerate(word):
        total += ord(letter)*(37**position)
    return total%M
def list_fhash(list_word,M):
    """Return the list of fhash(word, M) for every word of list_word, in order."""
    return [fhash(entry,M) for entry in list_word]
def fhash_way():
    """Ask whether to use feature hashing until a valid answer is given.

    Returns ('No', 0) for n/N, or ('Yes', M) for y/Y where M is the text
    typed at the 'M = ' prompt (still a string).  Any other answer prints
    'Try again.' and asks again.
    """
    while True :
        a = input('Use feature hashing ? (y,Y,n,N) ')
        if a in ['n','N'] :
            return 'No', 0
        if a in ['y','Y'] :
            M = input('M = ')
            return 'Yes',M
        print('Try again.')
def list_of_words(file_name):
    """Return the words of the file after cut_noodle() cleaning.

    Lines are joined by a single space before cleaning, so words never run
    together across a line break.
    """
    pieces = []
    with open(file_name) as handle:
        for row in handle:
            # Drop the last character only when the line contains a newline.
            pieces.append(row[0:-1] if '\n' in row else row)
    return cut_noodle(' '.join(pieces)).split()
               
def cout_alpha(file_name):
    """Return the total length of all words returned by list_of_words(file_name)."""
    return sum(len(entry) for entry in list_of_words(file_name))
def BoW(way,M,flie_name):
    """Print the sorted bag-of-words of the file.

    way -- 'No' counts the words themselves, 'Yes' counts fhash(word, M);
           any other value prints nothing.
    M   -- modulus passed to list_fhash (used only when way == 'Yes').
    flie_name -- path of the text file.
    """
    f = cut_stopword(list_of_words(flie_name))
    if way == 'No':
        items = f
    elif way == 'Yes' :
        items = list_fhash(f,M)
    else:
        return
    # One pass over the items instead of a list.count() call per new item.
    counts = {}
    for n in items:
        counts[n] = counts.get(n, 0) + 1
    couted_num = sorted([n, c] for n, c in counts.items())
    print('BoW =', couted_num)
        
# Load the stop words once; cut_stopword() reads this module-level list.
stop_word = stop_words().split()   
file_name = input('File name = ')
# way is 'Yes' or 'No'; M is 0 or the text typed by the user.
way,M=fhash_way()
print('-------------------')
# Each statistic below re-reads the file through its own helper.
print('char count =', cout_charater(file_name))
print('alphanumeric count =', cout_alpha(file_name))
print('line count =', cout_line(file_name))
print('word count =', len(list_of_words(file_name)))
# M is converted to int here because fhash_way() returns it as typed.
BoW(way,int(M),file_name)





    
            
    
# 6330361021 (27.28) 206 (2021-03-22 23:15)

def fhash(w,m):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo m."""
    return sum(ord(ch)*(37**pos) for pos, ch in enumerate(w)) % m

def stopword():
    """Return the lowercased words of 'stopwords.txt' without duplicates.

    Words keep the order of their first occurrence in the file.
    """
    with open('stopwords.txt','r') as stop:
        lowered = [word.lower() for line in stop for word in line.split()]
    # dict.fromkeys keeps first-occurrence order and drops repeats in one pass.
    return list(dict.fromkeys(lowered))

def makewords(file):
    """Return the lowercased words of ``file`` that are not stop words.

    Every character that is not an ASCII letter or digit separates words.
    """
    # The uppercase run includes 'W'; without it every capital W split a word.
    char = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    # Read the stop-word file once rather than once per candidate word.
    stops = set(stopword())
    words = []
    with open(file,'r') as readfile:
        for raw in readfile:
            line = ''.join(j.lower() if j in char else ' ' for j in raw.strip())
            words.extend(w for w in line.split() if w not in stops)
    return words

def makebow(words):
    """Return [word, count] pairs for ``words`` in first-occurrence order."""
    # A single counting pass replaces the repeated list.count() scans.
    counts = {}
    for i in words:
        counts[i] = counts.get(i, 0) + 1
    return [[word, total] for word, total in counts.items()]

def changebow(bow,m):
    """Convert a word bag into a hash bag.

    bow -- list of [word, count] pairs.
    m   -- modulus passed to fhash.
    Returns [hash, count] pairs sorted by hash; hashes that never occur are
    left out.
    """
    # Accumulate per hash value directly instead of allocating m slots and
    # scanning an expanded list once per slot.
    counts = {}
    for word, times in bow:
        if times > 0:
            key = fhash(word,m)
            counts[key] = counts.get(key, 0) + times
    return [[key, counts[key]] for key in sorted(counts)]
    
    
def charcount(file):
    """Print 'char count =' followed by the summed length of every stripped line."""
    total = 0
    with open(file,'r') as readfile:
        for raw in readfile:
            total += len(raw.strip())
    print('char count =', total)
    
def alphanumeric(file):
    """Print 'alphanumeric count =' and the number of ASCII letters and digits
    found in the stripped lines of ``file``."""
    # The uppercase run includes 'W'; it was missing, so capital W was not counted.
    char = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    n = 0
    with open(file,'r') as readfile:
        for raw in readfile:
            n += sum(1 for j in raw.strip() if j in char)
    print('alphanumeric count =',n)

def linecount(file):
    """Print 'line count =' followed by the number of lines in ``file``."""
    with open(file,'r') as readfile:
        total = sum(1 for _ in readfile)
    print('line count =',total)

def wordcount(file):
    """Print 'word count =' and the number of words in ``file``.

    A word is a maximal run of ASCII letters and digits.
    """
    # The uppercase run includes 'W'; it was missing, so a capital W split words.
    char = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    n = 0
    with open(file,'r') as readfile:
        for raw in readfile:
            line = ''.join(j if j in char else ' ' for j in raw.strip())
            n += len(line.split())
    print('word count =',n)
def show(file):
    """Print the char, alphanumeric, line and word counts of ``file``, in that order."""
    for report in (charcount, alphanumeric, linecount, wordcount):
        report(file)
    
    
def main():
    """Prompt for a file and the hashing choice, then print the statistics
    followed by the bag-of-words (hashed with modulus M when requested)."""
    file = input('File name = ')
    feature = input('Use feature hashing ? (y,Y,n,N) ')

    while not feature in ['y','Y','n','N']:
        print('Try again.')
        feature = input('Use feature hashing ? (y,Y,n,N) ')

    use_hash = feature in ['y','Y']
    if use_hash:
        m = int(input('M = '))

    show(file)
    bow = makebow(makewords(file))
    if use_hash:
        bow = changebow(bow,m)
    # Label spelled 'BoW' as in the other programs for this assignment.
    print('BoW =',bow)

#---------------------------------------------------------------------------------------

# Run the program.
main()
# 6330362621 (29.00) 207 (2021-03-22 00:39)
filename=input('File name = ')
feature=input('Use feature hashing ? (y,Y,n,N) ')
# Compare against exact answers: a substring test on 'yYnN' would also accept
# an empty answer and multi-character answers such as 'yY'.
while feature not in ('y','Y','n','N'):
    print('Try again.')
    feature=input('Use feature hashing ? (y,Y,n,N) ')
if feature in "yY":
    M=int(input("M = "))
print('-------------------')
# Read both files completely and close them right away.
with open(filename) as source:
    file=source.read().lower().strip('\n')
with open('stopwords.txt') as stop_source:
    file2=stop_source.read().lower()

charcount =0
alphanumericcount=0
linecount=1
word=[]
word2=[]
a=''
b=''
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    total=0
    for pos,ch in enumerate(w):
        total+=ord(ch)*(37**pos)
    return total%M
# Single pass over the lowercased text: count characters and lines and cut
# the text into words made of a-z / 0-9.
for e in file:
    if e!='\n':
        charcount+=1   
    else:
        linecount+=1
    if 'a'<=e<='z' or '0'<= e<='9':
        alphanumericcount+=1
        a+=e
    else:
        # Any other character ends the word collected so far.
        if a!='':
            word.append(a)
            a=''
print('char count =',charcount)
print('alphanumeric count =',alphanumericcount)
print('line count =',linecount)
# Flush the word still being collected when the text ended.
if a!='':
    word.append(a)
wordcount=len(word)
print('word count =',wordcount)
# Cut the stop-word text into words the same way.
for x in file2:
    if 'a'<=x<='z' or '0'<=x<='9':
        b+=x
    else:
        if b!='':
            word2.append(b)
            b=''
if b!='':
    word2.append(b)
    

# Remove every occurrence of every stop word from the word list.
for e in word2:
    for i in range(word.count(e)):
        word.remove(e)

# With hashing enabled each remaining word is replaced by its hash value.
if feature in "yY":
    for i in range(len(word)):
        word[i]=fhash(word[i],M)
        
        
BoW =[]
# a is reused here as the list of items already counted.
a=[]
for e in word:
    if e not in a:
        BoW.append([e,word.count(e)])
        a.append(e)
        

BoW.sort()
print('BoW =',BoW)

        


# 6330365521 (19.15) 208 (2021-03-21 17:23)
def fhash(w,m):
    """Return sum(ord(w[a]) * 37**a) over all positions a, modulo int(m)."""
    total=sum(ord(ch)*37**pos for pos,ch in enumerate(w))
    return total%int(m)

# Ask for the input file and the hashing choice; repeat until y, Y, n or N.
fname=input("File name = ")
ha=input("Use feature hashing ? (y,Y,n,N) ")
while ha not in ["y","Y","n","N"]:
    print("Try again.")
    ha=input("Use feature hashing ? (y,Y,n,N) ")
# stw collects the whitespace-separated words of stopwords.txt.
stw=[]
# Characters that are replaced by a space before the text is split into words.
stw1=['"',"'",'\\','.',',',':',';','|','?','(',')','[',']','*','+','-','/','!','#','$','%','^','&','_','{','}','<','>','@','~','`']
fstw=open("stopwords.txt")
for line in fstw:
    x=line.strip().split()
    stw+=x
fstw.close()
# NOTE(review): f is not closed anywhere in this script.
f=open(fname)
# word1: every stripped, lowercased line followed by one space.
word1=""
word2=""
linecount=0
# cout: number of "\n"-only lines seen since the last other line.
cout=0
for lin in f:
    if lin == "\n":
        # Cancels the +1 at the end of this iteration; remembered in cout.
        linecount-=1
        cout+=1
    if lin != '\n':
        # Pending empty lines are added only once a later line follows, so
        # empty lines at the very end of the file are not counted.
        linecount+=cout
        cout=0
    y=lin.strip().lower()
    word1+=y+' '
    charcout=len(word1)
    linecount+=1
alpcount=0
# Replace listed punctuation by spaces; alpcount counts every other character
# of word1, spaces included.
for x in word1:
    if x not in stw1:
        word2+=x
        alpcount+=1
    else:
        word2+=' '
word2=word2.split()

wordcount=len(word2)
# NOTE(review): presumably compensates for the spaces counted above by
# assuming one space per word - confirm.
alpcount=alpcount-wordcount
# Removes linecount from the length of word1, which has one added space per line read.
charcout=charcout-linecount
# word3: the words that are not stop words, sorted.
word3=[]
for x in word2:
    if x not in stw :
        word3.append(x)
word3.sort()
bow=[]
countword=1

#print(len(word2),'\n',stw,'\n',linecount,charcout,word2)
has=[]
bow2=[]
countword=1
# Output stage: print the counters, then the bag-of-words of either the
# sorted words or their sorted hash values.
use_hash=ha not in ["n","N"]
if use_hash:
    # Prompt spelled out like the other prompts of this program.
    m=input("M = ")
print('-'*19)
print("char count =",charcout)
print("alphanumeric count =",alpcount)
print("line count =",linecount)
print("word count =",wordcount)
if use_hash:
    has=sorted(fhash(wo,m) for wo in word3)
    items=has
else:
    items=word3
# items is sorted, so equal entries are adjacent.  Each run starts a fresh
# counter (no count leaks from one pass into another) and an empty or
# one-element list needs no special handling.
result=[]
for item in items:
    if result and result[-1][0]==item:
        result[-1][1]+=1
    else:
        result.append([item,1])
if use_hash:
    bow2=result
else:
    bow=result
print("BoW =",result)
    
    

# 6330366121 (23.40) 209 (2021-03-21 17:03)
#-----------------------------------------------------------------
def char_count (H):
    """Return the number of characters in H."""
    return len(H)
def alphanumeric_count (I):
    """Return how many characters of I are lowercase ASCII letters or digits."""
    A = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return sum(1 for symbol in I if symbol in A)
def word_count (J):
    """Split J into words and return (number of words, list of words).

    Every character other than a lowercase ASCII letter or digit acts as a
    separator.
    """
    alphanum = 'abcdefghijklmnopqrstuvwxyz0123456789'
    cleaned = "".join(symbol if symbol in alphanum else " " for symbol in J)
    Sakura = cleaned.split()
    return len(Sakura),Sakura


def fhash (word,M):
    """Return sum(ord(word[i]) * 37**i) over all positions i, modulo M."""
    return sum(ord(ch)*(37**pos) for pos, ch in enumerate(word)) % M



def BOW_Y (Sasuke,M):
    """Return [hash, count] pairs for the words in Sasuke.

    Each word is hashed once with fhash(word, M); pairs appear in the order
    in which each hash value was first produced.
    """
    counts = {}
    for word in Sasuke:
        key = fhash(word,M)
        counts[key] = counts.get(key, 0) + 1
    return [[key, total] for key, total in counts.items()]

def BOW_N (Sasuke):
    """Return [word, count] pairs for the words in Sasuke, in first-seen order."""
    # Counting by key compares words with words only; the earlier
    # "word in [word, count]" test also compared each word with the count.
    counts = {}
    for word in Sasuke:
        counts[word] = counts.get(word, 0) + 1
    return [[word, total] for word, total in counts.items()]
        


    
    
    
    

#-----------------------------------------------------------------
#file stopwords
# Madara: every lowercased word of stopwords.txt (the file is closed after reading).
with open("stopwords.txt",'r') as stop_words:
    Madara = [word for line in stop_words
              for word in line.strip().lower().split()]



# Read the input text in lowercase.
file_name = open(input("File name = "),"r")
Hokage1 = ''
Hokage2 = ''
z=0
for line in file_name:
    Hokage1 += line.strip("\n").lower() # used for char count and alphanumeric count
    Hokage2 += " "+line.strip("\n").lower() # used for word count (lines stay separated)
    z += 1

file_name.close()

#input
Naruto = input("Use feature hashing ? (y,Y,n,N) ")
# Compare against exact answers: a substring test on 'yYnN' also accepts an
# empty answer, after which nothing at all was printed.
while Naruto not in ('y','Y','n','N') :
       print('Try again.')
       Naruto = input("Use feature hashing ? (y,Y,n,N) ")
use_hash = Naruto in ('y','Y')
if use_hash:
    M = int(input("M = "))
x=char_count(Hokage1)
y=alphanumeric_count(Hokage1)
w,Sakura=word_count(Hokage2)
# D: the words that are not stop words.
D = [word for word in Sakura if word not in Madara]
if use_hash:
    v=BOW_Y(D,M)
    result=v
else:
    u=BOW_N(D)
    result=u
print('-------------------')
print('char count = '+str(x))
# Label written with a space, matching the other three counter labels.
print('alphanumeric count = '+str(y))
print('line count = '+str(z))
print('word count = '+str(w))
print("BoW =",sorted(result))
           
        
       
       






# 6330367821 (26.00) 210 (2021-03-21 23:06)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    total = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return total % M
def lis2(li1, li2):
    """Pair the lists element by element: [[li1[0], li2[0]], [li1[1], li2[1]], ...].

    One pair is produced per element of li1, so li2 must be at least as long.
    """
    return [[li1[pos], li2[pos]] for pos in range(len(li1))]
def first():
    """Print the separator line and the four counters of the file ``file_name``.

    Uses the module-level ``file_name`` and the character list ``q``.  The
    file is read once and closed, instead of being opened four times with
    only the last handle closed.
    """
    print('-------------------')
    with open(file_name, 'r') as f:
        lines = f.readlines()
    stripped = [line.strip() for line in lines]
    print('char count =', sum(len(text) for text in stripped))
    print('alphanumeric count =',
          sum(1 for text in stripped for ch in text if ch in q))
    print('line count =', len(lines))
    # Characters outside q become spaces; lines are kept apart by a space.
    r = ''.join(
        ' ' + ''.join(ch if ch in q else ' ' for ch in text)
        for text in stripped
    )
    print('word count =', len(r.split()))
def feature_n():
    """Print the counters and then the bag-of-words of the plain words.

    Words are the lowercased runs of characters from ``q``; words listed in
    'stopwords.txt' are dropped.  The result is printed as
    'BoW = [[word, count], ...]' sorted by word.  A file with no remaining
    word, or exactly one, is handled like any other.
    """
    first()
    e = []
    with open(file_name, 'r') as c:
        for line in c:
            cleaned = ''.join(ch if ch in q else ' ' for ch in line.strip())
            e += cleaned.lower().split()
    with open('stopwords.txt', 'r') as f:
        line2 = set(f.read().split())
    counts = {}
    for word in e:
        if word not in line2:
            counts[word] = counts.get(word, 0) + 1
    print('BoW =', [[word, counts[word]] for word in sorted(counts)])
def feature_y():
    """Print the counters and then the bag-of-words of the hashed words.

    Words are the lowercased runs of characters from ``q``; words listed in
    'stopwords.txt' are dropped and the rest is hashed with
    fhash(word, int(a)), where ``a`` is the module-level text typed at the
    'M = ' prompt.  The result is printed as 'BoW = [[hash, count], ...]'
    sorted by hash.  A file with no remaining word, or exactly one, is
    handled like any other.
    """
    first()
    e1 = []
    with open(file_name, 'r') as c1:
        for line in c1:
            cleaned = ''.join(ch if ch in q else ' ' for ch in line.strip())
            e1 += cleaned.lower().split()
    with open('stopwords.txt', 'r') as f1:
        line21 = set(f1.read().split())
    counts = {}
    for word in e1:
        if word not in line21:
            key = fhash(word, int(a))
            counts[key] = counts.get(key, 0) + 1
    print('BoW =', [[key, counts[key]] for key in sorted(counts)])
# Characters that belong to a word: ASCII letters (both cases) and digits.
q = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
         'abcdefghijklmnopqrstuvwxyz'
         '0123456789')
# Ask for the file and the hashing choice (repeating until the answer is one
# of y, Y, n, N), then run the matching report.
file_name = input('File name = ')
feature = input('Use feature hashing ? (y,Y,n,N) ')
while feature not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
if feature in ['n', 'N']:
    feature_n()
else:
    # a is read by feature_y() as the modulus M.
    a = input('M = ')
    feature_y()
# 6330370621 (20.00) 211 (2021-03-21 16:19)
file_name = input('File name = ')
# Upper-case every answer, including retries, so that 'y'/'n' typed after a
# wrong answer are accepted as well.
fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
while fh != 'N' and fh != 'Y' :
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
if fh == 'Y' :
    M = int(input('M = '))
    use_fh = True
elif fh == 'N' :
    use_fh = False
# stopwords: every whitespace-separated word of stopwords.txt.
stopwords = []
with open("stopwords.txt", "r") as fstop:
    for line in fstop:
        stopwords += line.split()
print('-------------------')
fin = open(file_name, "r")
def char_count(x):
    """Return, as a string, the total length of the lines in x; a line that
    contains a newline counts one character less."""
    total = sum(len(line) - 1 if "\n" in line else len(line) for line in x)
    return str(total)
# Print the character count, then reopen the file for the next pass.
print("char count =", char_count(fin))
fin.close()
fin = open(file_name, "r")
def alphnum_count(x):
    """Return, as a string, how many ASCII letters and digits the lines in x contain."""
    total = 0
    for line in x:
        total += sum(
            1 for e in line
            if 'A'<=e<='Z' or 'a'<=e<='z' or '0'<=e<='9'
        )
    return str(total)
# Print the alphanumeric count, then reopen the file for the next pass.
print('alphanumeric count =', alphnum_count(fin))
fin.close()
fin = open(file_name, "r")
def line_count(x):
    """Return, as a string, the number of items (lines) in x."""
    return str(sum(1 for _ in x))
# Print the line count, then reopen the file for the next pass.
print('line count =', line_count(fin))
fin.close()
fin = open(file_name, "r")
def blank(t):
    """Return t with every character other than an ASCII letter or digit
    replaced by a space."""
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    return ''.join(e if e in letters else ' ' for e in t)
def word_count(x):
    """Return, as a string, the number of words in the lines of x, where
    blank() decides which characters separate words."""
    total = 0
    for line in x:
        total += len(blank(line).split())
    return str(total)
# Print the word count and close the file.  It is not reopened here: the
# bag-of-words step below opens its own handle, so a handle opened at this
# point would never be used or closed.
print('word count =', word_count(fin))
fin.close()
def remove_stopwords(w):
    """Return the items of w that are not in the module-level stopwords list."""
    return [e for e in w if e not in stopwords]
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    total = sum(ord(ch)*(37**pos) for pos, ch in enumerate(w))
    return total % M
# Build the bag-of-words from the file the user named; the name must not be
# hard-coded, otherwise the counters and the BoW describe different files.
words = []
with open(file_name, "r") as fin:
    for line in fin:
        words += blank(line.lower()).split()
words = remove_stopwords(words)
if not use_fh:
    # Plain words are counted in the order in which they first appear.
    items = words
else:
    # Sort the hash values as integers; sorting their string form would
    # put '10' before '9'.
    items = sorted(fhash(e, M) for e in words)
counts = {}
for e in items:
    counts[e] = counts.get(e, 0) + 1
BoW = [[key, total] for key, total in counts.items()]
print('BoW =', BoW)



# 6330371221 (13.00) 212 (2021-03-22 22:56)
# Characters counted as alphanumeric: ASCII letters (both cases) and digits.
q = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
     'abcdefghijklmnopqrstuvwxyz'
     '0123456789')
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    total = 0
    for pos, ch in enumerate(w):
        total += ord(ch)*(37)**(pos)
    return total%M
file_name = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
while b not in ['y','Y','n','N']:
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')
# Ask for M once the answer is valid, so a first-try 'y' is asked too, and
# keep the value instead of echoing it.
if b in ['y','Y']:
    M = int(input('M = '))
# NOTE(review): no bag-of-words step follows in this script, so the stop
# words are loaded but not used yet.
with open("stopwords.txt","r") as c:
    stop_list = c.read().split()
line = 0
word = 0
characters = 0
alpha = 0
with open(file_name,"r") as a:
    for f in a:
        line += 1
        word += len(f.split())
        # Count the line without its terminator; subtracting one per line
        # was wrong for a last line that has no newline.
        characters += len(f.rstrip('\n'))
        alpha += sum(1 for ch in f if ch in q)
print('-'*19)
# print() already separates its arguments with one space.
print('char count =',characters)
print('alphanumeric count =',alpha)
print('line count =',line)
print('word count =',word)







    
# 6330372921 (26.00) 213 (2021-03-19 13:54)

def fh(w, M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    G = 37
    total = 0
    for exponent, c in enumerate(w):
        total += ord(c) * (G ** exponent)
    return total % M

def remove_stopwords(string):
    """Return ``string`` with every word listed in 'stopwords.txt' removed.

    Words are whitespace-separated; the kept words are joined by single
    spaces.  The stop-word file is closed before returning.
    """
    with open("stopwords.txt") as file:
        stopwords = {word for line in file for word in line.strip().split()}
    return " ".join(word for word in string.split() if word not in stopwords)

def BoW_noHash(string):
    """Return [word, count] pairs for the whitespace-separated words of
    ``string``, in first-occurrence order."""
    bow = []
    for word in string.split():
        for entry in bow:
            if entry[0] == word:
                entry[1] += 1
                break
        else:
            # No existing pair matched: start a new one.
            bow.append([word, 1])
    return bow

def BoW_Hash(string, M):
    """Return [hash, count] pairs for the words of ``string``, where each word
    is replaced by fh(word, M); pairs are in first-occurrence order."""
    bow = []
    for word in string.split():
        key = fh(word, M)
        for entry in bow:
            if entry[0] == key:
                entry[1] += 1
                break
        else:
            # No existing pair matched: start a new one.
            bow.append([key, 1])
    return bow

def process_file(file):
    """Scan an iterable of lines and collect the text statistics.

    Returns [characters, alphanumeric, line_count, word_count, string]:
    characters counts every character of each stripped line, alphanumeric
    those for which str.isalnum() is true, and string holds the lowercased
    words separated by spaces.
    """
    characters = 0
    alphanumeric = 0
    line_count = 0
    word_count = 0
    string = ""
    for line in file:
        line = line.strip()
        # True while the scan is inside a word.
        in_word = False
        for char in line:
            characters += 1
            if char.isalnum():
                alphanumeric += 1
                in_word = True
                string += char.lower()
            elif in_word:
                word_count += 1
                string += " "
                in_word = False
        # A word that runs up to the end of the line has no separator after
        # it, so it has to be counted here.
        if in_word:
            word_count += 1
        string += " "
        line_count += 1
    return [characters, alphanumeric, line_count, word_count, string]


file_name = input("File name = ")
file = open(file_name)
h = input("Use feature hashing ? (y,Y,n,N) ")
hashing = False
M = 0
# Repeat the question until the answer is one of y, Y, n, N.
while True:
    if h == "y" or h == "Y":
        M = int(input("M = "))
        hashing = True
        break
    elif h == "n" or h == "N":
        hashing = False
        break
    else:
        print("Try again.")
        h = input("Use feature hashing ? (y,Y,n,N) ")

# Separator written as an explicit 19 dashes, the length the other programs
# for this assignment print.
print("-" * 19)
count_result = process_file(file)
# The file is fully read at this point; release the handle.
file.close()
print("char count = " + str(count_result[0]))
print("alphanumeric count = " + str(count_result[1]))
print("line count = " + str(count_result[2]))
print("word count = " + str(count_result[3]))
process_string = remove_stopwords(count_result[4])
print("BoW = ", end="")
if hashing:
    print(BoW_Hash(process_string, M))
else:
    print(BoW_noHash(process_string))
# 6330374121 (29.00) 214 (2021-03-18 15:00)
file_name = input('File name = ')
modeHashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
# Compare against exact answers: a substring test on 'ny' also accepts an
# empty answer and 'ny', which leaves M undefined later on.
while modeHashing not in ('n', 'y'):
    print('Try again.')
    modeHashing = input('Use feature hashing ? (y,Y,n,N) ').lower()
if modeHashing == 'y':
    M = int(input('M = '))
print('-------------------')

#input string output list - special characters & whitespace
def list_word(s):
    """Return the words of s, where every character for which str.isalnum()
    is false acts as a separator."""
    cleaned = ''.join(e if e.isalnum() else ' ' for e in s)
    return cleaned.split()
def fHash(w, M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    return sum(ord(ch)*(37**pos) for pos, ch in enumerate(w)) % M

#input string output list --> 'BoW ='
def BoW(s):
    """Return the bag-of-words of the text s.

    Stop words are read from the module-level open file ``f_stopword``.
    With modeHashing == 'n' the result is [word, count] pairs in
    first-occurrence order; otherwise the words are replaced by
    fHash(word, M) and the result is [hash, count] pairs sorted by hash.
    """
    stopword_list = set(f_stopword.read().split())
    counts = {}
    for e in list_word(s.lower()):
        if e not in stopword_list:
            counts[e] = counts.get(e, 0) + 1
    if modeHashing == 'n':
        return [[word, total] for word, total in counts.items()]
    # Accumulate per hash value directly; no M-sized table and no
    # placeholder entries to strip out afterwards.
    hashed = {}
    for word, total in counts.items():
        key = fHash(word, M)
        hashed[key] = hashed.get(key, 0) + total
    return [[key, hashed[key]] for key in sorted(hashed)]
#------------------------------------------------
# Both files stay open until the end of the script; BoW() reads f_stopword.
f_stopword = open('stopwords.txt', 'r')
f_work = open( file_name, 'r')

char_count = 0
alnum_count = 0
line_count = 0
word_count = 0
# word_string accumulates the whole text for the bag-of-words step.
word_string = ''

for line in f_work:
    line_count += 1
    # Characters are counted on the stripped line.
    char_count += len(line.strip())
    for e in line.lower():
        if 'a' <= e <= 'z' or '0' <= e <= '9':
            alnum_count += 1
    word_count += len(list_word(line))
    word_string += line

print('char count =', char_count)
print('alphanumeric count =', alnum_count)
print('line count =', line_count)
print('word count =', word_count)
print('BoW =', BoW(word_string))
    
f_stopword.close()
f_work.close()
# 6330375821 (24.35) 215 (2021-03-22 22:25)
file_name=input('File name = ')
ft=input('Use feature hashing ? (y,Y,n,N) ')
# Compare against exact answers: a substring test on 'yYnN' would also accept
# an empty answer and multi-character answers such as 'yY'.
while ft not in ('y','Y','n','N'):
    print('Try again.')
    ft=input('Use feature hashing ? (y,Y,n,N) ')
if ft in "yY":
    M=int(input("M = "))
print('-------------------')

# Read both files completely and close them right away.
with open(file_name) as source:
    file=source.read().lower().strip('\n')
with open('stopwords.txt') as stop_source:
    file2=stop_source.read().lower()

# Counters (line count starts at 1), word lists and the two word buffers.
Ch_c,Ap_c,L_c,word,word2,A,B =0,0,1,[],[],'',''


def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    total=sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))
    return total%M
def PrBow(word):
    """Print 'BoW =' with the sorted [item, count] pairs of ``word`` and
    return that list."""
    seen=[]
    for e in word:
        if e in seen:
            continue
        seen.append(e)
    Bow=sorted([e,word.count(e)] for e in seen)
    print('BoW =',Bow)
    return Bow
# Single pass over the lowercased text: count characters and lines and cut
# the text into words.  The letter range is 'a'..'z': a range starting at
# 'A' also contains '[', '\', ']', '^', '_' and '`'.
for e in file:
    if e!='\n':
        Ch_c+=1
    else:
        L_c+=1
    if 'a'<=e<='z' or '0'<= e<='9':
        Ap_c+=1
        A+=e
    else:
        # Any other character ends the word collected so far.
        if A!='':
            word.append(A)
            A=''
print('char count =',Ch_c)
print('alphanumeric count =',Ap_c)
print('line count =',L_c)
# Flush the word still being collected when the text ended.
if A!='':
    word.append(A)
word_count=len(word)
print('word count =',word_count)
# Cut the stop-word text into words the same way.
for x in file2:
    if 'a'<=x<='z' or '0'<=x<='9':
        B+=x
    else:
        if B!='':
            word2.append(B)
            B=''
if B!='':
    word2.append(B)


# Remove every occurrence of every stop word from the word list.
for e in word2:
    for i in range(word.count(e)):
        word.remove(e)

# With hashing enabled each remaining word is replaced by its hash value.
if ft in "yY":
    for i in range(len(word)):
        word[i]=fhash(word[i],M)

PrBow(word)
        
        

# 6330376421 (10.90) 216 (2021-03-22 14:00)
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) over all positions i, modulo M."""
    return sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))%M
def fhash_all(a,M):
    """Return the list of fhash(item, M) for every item of a, in order."""
    return [fhash(item,M) for item in a]
def allwords(a):
    """Return the distinct items of a in first-occurrence order."""
    unique=[]
    for item in a:
        if item in unique:
            continue
        unique.append(item)
    return unique
 
def countwords(inside,allwords):
    """Return [item, inside.count(item)] for every item of allwords, in order."""
    return [[item,inside.count(item)] for item in allwords]
#input------------------
file_name = input('File name = ')
while True:
    feture = input("Use feature hashing ? (y,Y,n,N) ")
    if feture in ('Y', 'y', 'N', 'n'):
        # M only matters when hashing is on, so it is only asked for then.
        if feture in ('Y', 'y'):
            M = int(input('M = '))
        break
    print('Try again.')
print('-------------------')
#stopword--------------
# NOTE(review): the file name was 'stopword.txt'; the other loaders in this
# file read 'stopwords.txt', so that name is used here too.
stopword = []
with open('stopwords.txt', 'r') as infile:
    for line in infile:
        stopword += line.lower().split()
#infile-----------------
punctuation = '!@#$%^&*()_+"\'\\/:;-=.?,{}'
inside = ''
charcount = 0
alphanumeric_count = 0
line_count = 0
with open(file_name, 'r') as infile:
    for line in infile:
        line_count += 1
        charcount += len(line.strip())
        d = line.lower().strip()
        # Punctuation becomes a blank so that it separates words; the extra
        # blank keeps the words of consecutive lines apart.
        inside += ''.join(' ' if ch in punctuation else ch for ch in d) + ' '
print('char count =', charcount)
e = inside.split()
# Every character left in a word counts towards the alphanumeric total.
alphanumeric_count = sum(len(token) for token in e)
inside = [token for token in e if token not in stopword]
print('alphanumeric count =', alphanumeric_count)
print('line count =', line_count)
print('word count =', len(e))

if feture == 'Y' or feture == 'y':
    inside = fhash_all(inside, M)

#Output---------------------------------
print('BoW =', sorted(countwords(inside, allwords(inside))))
# 6330377021 (17.90) 217 (2021-03-21 17:18)

def fhash(w, M):
    """Return the base-37 polynomial hash of *w* modulo ``int(M)``.

    *M* may be given as an int or as a numeric string.
    """
    G = 37
    weighted = [ord(ch) * (G ** pos) for pos, ch in enumerate(w)]
    return sum(weighted) % int(M)
def BoW(f1):
    """Collapse runs of equal neighbours in *f1* into ``[value, count]`` pairs.

    *f1* is expected to be sorted so that equal items are adjacent; each run
    produces one pair, in the order the runs appear.  An empty list yields an
    empty result and a single item yields ``[[item, 1]]`` (the index-based
    version raised IndexError and returned ``[]`` for those two inputs).
    """
    new = []
    for item in f1:
        if new and new[-1][0] == item:
            # Same value as the run being built: extend it.
            new[-1][1] += 1
        else:
            new.append([item, 1])
    return new

File_name = input("File name = ")
choose_BoW = input("Use feature hashing ? (y,Y,n,N) ")

# Keep asking until one of the four accepted answers is given.
while choose_BoW not in ("y", "Y", "n", "N"):
    print("Try again.")
    choose_BoW = input("Use feature hashing ? (y,Y,n,N) ")
k = choose_BoW in ("y", "Y")  # True when feature hashing was requested
if k:
    M = int(input("M = "))

print("-------------------")

# Walk the text line by line.  `b` is the text with every non-alphanumeric
# character turned into a blank, plus one blank after each line so that the
# last word of a line cannot fuse with the first word of the next one.
char_count = 0
line_count = 0
b = ""
with open(File_name, "r") as name:
    for line in name:
        if line.endswith("\n"):
            line = line[:-1]
        line_count += 1
        char_count += len(line)
        for ch in line:
            if '0' <= ch <= '9' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z':
                b += ch
            else:
                b += " "
        b += " "
print("char count =", char_count)

c = b.split()
print("alphanumeric count =", len("".join(c)))
print("line count =", line_count)
print("word count =", len(c))

with open("stopwords.txt", "r") as stop_w:
    stop_words = stop_w.read().split()

# BoW() counts runs of equal neighbours, so its input must be sorted.
pre = [w for w in sorted(b.lower().split()) if w not in stop_words]
if k:
    pre = sorted(fhash(w, M) for w in pre)
# A separate name keeps the BoW function itself from being overwritten.
bow_result = BoW(pre)
print("BoW =", bow_result)

# 6330378721 (17.45) 218 (2021-03-21 01:09)

#-------------------------------------------------#
def get_unique(words):
    """Return the distinct items of *words* in order of first appearance."""
    unique_words = []
    for candidate in words:
        if candidate in unique_words:
            continue
        unique_words.append(candidate)
    return unique_words
def BoW0(ness_word):
    """Return ``[word, count]`` pairs for *ness_word*, in first-seen order.

    Whole words are counted.  Counting with ``str.count`` on the joined text
    would also match a word inside longer words ('a' inside 'an') and would
    depend on a module-level string instead of on the argument.
    """
    counts = {}
    for token in ness_word:
        counts[token] = counts.get(token, 0) + 1
    # dicts keep insertion order, so the pairs come out in first-seen order.
    return [[token, total] for token, total in counts.items()]
def fhash(w, M):
    """Return ``sum(w[i] * 37**i) % M`` for a sequence *w* of numbers.

    Unlike a string-based hash, the items of *w* are used as they are (no
    ``ord`` is applied here).
    """
    total = 0
    weight = 1  # 37**position
    for code in w:
        total += code * weight
        weight *= 37
    return total % M
def change_2_num(ness_word):
    """Hash every word of *ness_word* with ``fhash`` and the module-level ``M``.

    Each word is first turned into the list of its character codes, which is
    the form ``fhash`` takes here.
    """
    hashed = []
    for token in ness_word:
        codes = [ord(ch) for ch in token]
        hashed.append(fhash(codes, M))
    return hashed
def BoW1(c):
    """Return ``[hash, count]`` pairs for the hash values in *c*, first-seen order.

    Values are counted as whole items.  Counting digits inside a joined
    string would match 1 inside 12 and inflate the totals.
    """
    counts = {}
    for value in c:
        counts[value] = counts.get(value, 0) + 1
    # int() keeps the first element of each pair an integer, as before.
    return [[int(value), total] for value, total in counts.items()]

#-------------------------------------------------#

file_name0 = input('File name = ')
# `a` glues the stripped lines together and only feeds the character count;
# `stripped_lines` keeps them apart so that the last word of one line cannot
# fuse with the first word of the next.
a = ''
stripped_lines = []
line_count = 0
with open(file_name0, 'r') as file_name:
    for line in file_name:
        stripped = line.strip()
        a += stripped
        stripped_lines.append(stripped)
        line_count += 1
char_count = len(a)
a = a.lower()

# The list used to hold ':' twice; ';' is presumably what the second entry
# was meant to be.
separators = ['/', '.', ',', '"', ':', ';', "'"]
list_file = []
alpha_count = 0
for ch in ' '.join(stripped_lines).lower():
    if ch in separators:
        list_file.append(' ')
    else:
        list_file.append(ch)
        if ch != ' ':
            alpha_count += 1
normal_txt = ''.join(list_file)
list_o_str_words = normal_txt.split()
word_count = len(list_o_str_words)

with open('stopwords.txt', 'r') as file_stp_word:
    b = file_stp_word.read().split()

ness_word = [token for token in list_o_str_words if token not in b]
ness_word_str = ' '.join(ness_word)

# Ask until a valid answer is given, then print the report once.
while True:
    choice = input('Use feature hashing ? (y,Y,n,N) ')
    if choice in ('Y', 'y', 'N', 'n'):
        break
    print('Try again.')
use_hashing = choice in ('Y', 'y')
if use_hashing:
    M = int(input('M = '))

print('-' * 19)
print('char count =', char_count)
print('alphanumeric count =', alpha_count)
print('line count =', line_count)
print('word count =', word_count)
if use_hashing:
    print('BoW =', BoW1(change_2_num(ness_word)))
else:
    print('BoW =', BoW0(ness_word))
# 6330379321 (24.00) 219 (2021-03-21 17:03)

# Characters treated as part of a word; the text is lowercased before it is
# compared against these, so only lowercase letters are listed.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
number = '0123456789'
# Name of the text to analyse, typed by the user.
File_name = input('File name = ')
# Both handles stay open and are iterated by the code further down.
file_name = open(File_name,'r')
stopword_file = open('stopwords.txt','r')
#================================================================
def stopword(list_of_word, stopword_file):
    """Return the items of *list_of_word* that are not stop words.

    *stopword_file* is iterated line by line; every line is stripped,
    lowercased and split on whitespace to collect the stop words.
    """
    stopwords = []
    for line in stopword_file:
        stopwords.extend(line.strip().lower().split())
    return [token for token in list_of_word if token not in stopwords]
    
def easy_stop_word(line):
    """Return *line* with every item not found in ``alphabet + number`` replaced by a blank."""
    allowed = alphabet + number
    return ''.join(ch if ch in allowed else ' ' for ch in line)
def fhash(word, M):
    """Return the base-37 polynomial hash of *word* modulo *M*."""
    total = 0
    for pos, ch in enumerate(word):
        total += ord(ch) * 37 ** pos
    return total % M
def Yes(list_of_word, M):
    """Return the feature-hashed bag-of-words of *list_of_word*.

    Every word is hashed with ``fhash(word, M)``; the result is a list of
    ``[bucket, count]`` pairs in ascending bucket order.  An empty list gives
    ``[]`` and a single word gives one pair (the index-based version raised
    NameError for both because its loop variable was never bound).
    """
    C = sorted(fhash(token, M) for token in list_of_word)
    BoW = []
    for bucket in C:
        if BoW and BoW[-1][0] == bucket:
            # Same bucket as the previous hash: extend the current run.
            BoW[-1][1] += 1
        else:
            BoW.append([bucket, 1])
    return BoW
        
def countword(s):
    """Return ``[item, count]`` pairs for the list *s*, in first-seen order."""
    seen = []
    BoW = []
    for item in s:
        if item in seen:
            continue
        seen.append(item)
        BoW.append([item, s.count(item)])
    return BoW
#================================================================
# Y is 1 when feature hashing was requested, 0 otherwise.
Y = 0
answer = input('Use feature hashing ? (y,Y,n,N) ')
while answer not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    answer = input('Use feature hashing ? (y,Y,n,N) ')
if answer in ('y', 'Y'):
    M = int(input('M = '))
    Y = 1
        
#==================================================================
        
char = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
list_of_word = []
allowed = alphabet + number
for line in file_name:
    line = line.strip().lower()
    char += len(line)
    alphanumeric_count += sum(1 for ch in line if ch in allowed)
    # Everything that is not a letter or digit becomes a blank, then split.
    line_words = easy_stop_word(line).split()
    list_of_word.extend(line_words)
    word_count += len(line_words)
    line_count += 1

list_of_word = stopword(list_of_word, stopword_file)
# Both files have been read completely; release the handles.
file_name.close()
stopword_file.close()

if Y == 0:
    BoW = countword(list_of_word)
elif Y == 1:
    BoW = Yes(list_of_word, M)

#=============================================================================================
print('-------------------')
print('char count =', char)
print('alphanumeric count =', alphanumeric_count)
print('line count =', line_count)
print('word count =', word_count)
# The label is 'BoW =' (capital W), like every other report in this file.
print('BoW =', BoW)

            
 

    
    

# 6330380921 (30.00) 220 (2021-03-21 17:14)
# Collect every whitespace-separated entry of stopwords.txt.
stopwords_list = []
with open('stopwords.txt', "r") as stopword_in:
    for line in stopword_in:
        stopwords_list.extend(line.strip().split())
def fhash(w, M):
    """Return the base-37 polynomial hash of ``str(w).strip()`` modulo *M*."""
    text = str(w).strip()
    G = 37
    return sum(ord(ch) * G ** pos for pos, ch in enumerate(text)) % M
def pack_bow(text, stopwords_list):
    """Return the items of *text* that do not appear in *stopwords_list*."""
    return [item for item in text if item not in stopwords_list]
    
def char(text):
    """Return ``len(text)``, minus one when *text* contains a newline."""
    length = len(text)
    return length - 1 if '\n' in text else length
def alpha(text):
    """Count the ASCII letters and digits in *text*."""
    alnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    return sum(1 for ch in text if ch in alnum)
def word(text):
    """Split *text* into lowercase words made of ASCII letters and digits.

    Every other character acts as a separator.
    """
    alnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    cleaned = ''.join(ch if ch in alnum else ' ' for ch in text)
    return [token.lower() for token in cleaned.split()]
def bow(L_list):
    """Concatenate the items of every element of *L_list* into one flat list.

    Note that a string element contributes its individual characters.
    """
    flat = []
    for chunk in L_list:
        flat.extend(chunk)
    return flat
def no_of_bow(word, allword):
    """For each item of *word*, return how many items of *allword* equal it."""
    return [sum(1 for other in allword if item == other) for item in word]
def remove_rep(word):
    """Return the distinct items of *word*, ordered by their last occurrence.

    *word* is left untouched; the pop-based version emptied the caller's list
    as a side effect.
    """
    result = []
    for index, item in enumerate(word):
        # Keep an item only at its last position in the list.
        if item not in word[index + 1:]:
            result.append(item)
    return result
def add_no_to_bow(word, no):
    """Pair ``word[k]`` with ``no[k]`` for every index of *word*."""
    return [[word[k], no[k]] for k in range(len(word))]
def add_no_to_bow_hash(word, no):
    """Pair ``int(word[k])`` with ``no[k]`` for every index of *word*."""
    return [[int(word[k]), no[k]] for k in range(len(word))]

file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
active_fhash = 0
M_no = 0
# Repeat the question until one of the four accepted answers is given.
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
if fh in ('y', 'Y'):
    active_fhash = 1
    M_no = int(input('M = '))

char_count = 0
alpha_count = 0
line_count = 0
word_count = 0
bow_list = []   # per line: words with the stop words removed
word_list = []  # per line: all words
with open(file_name, "r") as f_in:
    for line in f_in:
        stripped = line.strip()
        line_words = word(stripped)
        char_count += char(line)
        alpha_count += alpha(stripped)
        word_count += len(line_words)
        line_count += 1
        word_list.append(line_words)
        bow_list.append(pack_bow(line_words, stopwords_list))
print('-------------------')
print('char count =', char_count)
print('alphanumeric count =', alpha_count)
print('line count =', line_count)
print('word count =', word_count)

if active_fhash == 0:
    unique_words = remove_rep(bow(bow_list))
    allword = bow(word_list)
    no = no_of_bow(unique_words, allword)
    BoW = sorted(add_no_to_bow(unique_words, no))
elif active_fhash == 1:
    # Keep the hashes as integers.  Turning them into strings and flattening
    # them with bow() would split every multi-digit hash into single digits,
    # which gives wrong buckets as soon as M exceeds 10.
    fh_list = [fhash(w, int(M_no)) for w in bow(bow_list)]
    # remove_rep() gets its own copy in case it consumes its argument.
    wordhash = remove_rep(list(fh_list))
    nohash = no_of_bow(wordhash, fh_list)
    BoW = sorted(add_no_to_bow_hash(wordhash, nohash))
print('BoW =', BoW)
# 6330381521 (20.20) 221 (2021-03-22 23:26)
file_name = input('File name = ')
# The answer is uppercased so that only 'Y' and 'N' need to be recognised.
fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
while fh not in ('N', 'Y'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ').upper()
    
    
if fh == 'N':
    # Feature hashing declined: report the counts and a plain bag-of-words.

    def _alnum_words(text):
        """Split *text* into maximal runs of lowercase letters and digits."""
        cleaned = ''.join(
            ch if 'a' <= ch <= 'z' or '0' <= ch <= '9' else ' ' for ch in text
        )
        return cleaned.split()

    with open(file_name) as text_in:
        file1 = text_in.read().lower().strip('\n')
    with open('stopwords.txt') as stop_in:
        file2 = stop_in.read().lower()

    print('-------------------')

    line_count = file1.count('\n') + 1
    character_count = len(file1) - file1.count('\n')
    alphanumeric_count = sum(
        1 for ch in file1 if 'a' <= ch <= 'z' or '0' <= ch <= '9'
    )

    # Words are runs of letters/digits, the same rule the hashing branch
    # uses, so the word count no longer depends on the hashing choice.
    word1 = _alnum_words(file1)
    word_count = len(word1)

    # Splitting the whole stop-word text also keeps its last word when the
    # file does not end with a newline.
    word_stop = set(_alnum_words(file2))
    word1 = [token for token in word1 if token not in word_stop]

    counts = {}
    for token in word1:
        counts[token] = counts.get(token, 0) + 1
    BoW = sorted([token, total] for token, total in counts.items())

    print('char count = ' + str(character_count))
    print('alphanumeric count = ' + str(alphanumeric_count))
    print('line count = ' + str(line_count))
    print('word count = ' + str(word_count))
    print('BoW =', BoW)
    
    
    
if fh == 'Y':
    # Feature hashing requested: report the counts and a hashed bag-of-words.

    def feature_hashing(word, M_solution):
        """Return the base-37 polynomial hash of *word* modulo *M_solution*."""
        ans = 0
        for position in range(len(word)):
            ans += ord(word[position]) * (37 ** position)
        return ans % M_solution

    def _alnum_words(text):
        """Split *text* into maximal runs of lowercase letters and digits."""
        cleaned = ''.join(
            ch if 'a' <= ch <= 'z' or '0' <= ch <= '9' else ' ' for ch in text
        )
        return cleaned.split()

    M_solution = int(input("M = "))
    print('-------------------')
    with open(file_name) as text_in:
        file1 = text_in.read().lower().strip('\n')
    with open('stopwords.txt') as stop_in:
        file2 = stop_in.read().lower()

    line_count = file1.count('\n') + 1
    character_count = len(file1) - file1.count('\n')
    # The lower bound is '0'; with '0 ' (zero plus blank) the digit '0'
    # itself compared as smaller and was never counted.
    alphanumeric_count = sum(
        1 for ch in file1 if 'a' <= ch <= 'z' or '0' <= ch <= '9'
    )
    word1 = _alnum_words(file1)
    word_count = len(word1)

    print('char count =', character_count)
    print('alphanumeric count =', alphanumeric_count)
    print('line count =', line_count)
    print('word count =', word_count)

    word2 = set(_alnum_words(file2))
    word1 = [
        feature_hashing(token, M_solution)
        for token in word1
        if token not in word2
    ]

    counts = {}
    for bucket in word1:
        counts[bucket] = counts.get(bucket, 0) + 1
    BoW = sorted([bucket, total] for bucket, total in counts.items())

    print('BoW =', BoW)
# 6330382121 (14.00) 222 (2021-03-22 23:23)
def flash(w, M):
    """Return the base-37 polynomial hash of the string *w* modulo *M*."""
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w)) % M
        
def repeat(stop, file_name):
    """Return the bag-of-words of the text in *file_name*, minus stop words.

    Both files are lowercased and split into runs of letters and digits.  The
    result is a list of ``[word, count]`` pairs in order of first appearance,
    for every word that is not listed in the file *stop*.

    Words are counted as whole tokens.  The letter table now includes 'u' and
    'v', and adjacent repeats as well as the last word of the text are
    counted, which the delimiter-based ``str.count`` approach missed.
    """
    allowed = set('abcdefghijklmnopqrstuvwxyz0123456789')

    def _tokenize(text):
        """Split *text* into lowercase alphanumeric words."""
        pieces = []
        for ch in text:
            lowered = ch.lower()
            pieces.append(lowered if lowered in allowed else ' ')
        return ''.join(pieces).split()

    with open(stop, 'r') as stop_file:
        stop_words = set(_tokenize(stop_file.read()))
    with open(file_name, 'r') as text_file:
        tokens = _tokenize(text_file.read())

    counts = {}
    for token in tokens:
        if token not in stop_words:
            counts[token] = counts.get(token, 0) + 1
    # dicts keep insertion order, so the pairs come out in first-seen order.
    return [[token, total] for token, total in counts.items()]
def feature_hashing(BoW, M):
    """Merge a ``[word, count]`` bag-of-words into hash buckets.

    Each word is hashed with ``flash(word, M)`` and its count is added to
    that bucket.  Returns ``[bucket, total]`` pairs for the buckets in
    ``range(M)`` that received at least one occurrence, in ascending order.
    """
    totals = {}
    for entry in BoW:
        bucket = flash(entry[0], M)
        occurrences = entry[1]
        if occurrences > 0:
            totals[bucket] = totals.get(bucket, 0) + occurrences
    return [[bucket, totals[bucket]] for bucket in range(M) if bucket in totals]
               

file_name = input('File name = ')
a = input('Use feature hashing ? (y,Y,n,N) ')
while a not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    a = input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = a in ('y', 'Y')
if use_hashing:
    M = int(input('M = '))
print('-------------------')

with open(file_name, "r") as file:
    texts = file.read()

alnum = 'abcdefghijklmnopqrstuvwxyz0123456789'
newlines = texts.count('\n')
# Every character except the line breaks.
char = len(texts) - newlines
# Only letters and digits; punctuation is not alphanumeric.
alphanumeric = sum(1 for ch in texts if ch.lower() in alnum)
# A trailing newline ends the last line, it does not start a new one.
line = newlines + (1 if texts and not texts.endswith('\n') else 0)
# Words are runs of letters/digits, rather than "blanks plus lines".
words = ''.join(ch if ch.lower() in alnum else ' ' for ch in texts).split()

print('char count =', char)
print('alphanumeric count =', alphanumeric)
print('line count =', line)
print('word count =', len(words))

bag = repeat('stopwords.txt', file_name)
if use_hashing:
    bag = feature_hashing(bag, M)
# The bag is printed with its 'BoW =' label, like the other report lines.
print('BoW =', bag)

        
    

# 6330384421 (30.00) 223 (2021-03-22 20:27)
M = 0
file_name = input('File name = ')
nb = input('Use feature hashing ? (y,Y,n,N) ')
while nb not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    nb = input('Use feature hashing ? (y,Y,n,N) ')
if nb in ('y', 'Y'):
    M = int(input('M = '))
print('-------------------')

# information: the distinct, lowercased stop words in file order.
l = []
with open('stopwords.txt', 'r') as stopwords:
    for line in stopwords:
        l.extend(line.strip().lower().split())
information = []
for entry in l:
    if entry not in information:
        information.append(entry)

charactercount = 0
ab12 = 0          # number of ASCII letters and digits
linecount = 0
L = []            # every word of the text, lowercased
L2 = ''
with open(file_name, 'r') as openfile:
    for line in openfile:
        linecount += 1
        pieces = []
        for i in line:
            if i != '\n':
                charactercount += 1
            if '0' <= i <= '9' or 'A' <= i <= 'Z' or 'a' <= i <= 'z':
                ab12 += 1
                pieces.append(i)
            else:
                pieces.append(' ')
        L2 = ''.join(pieces)
        L.extend(L2.strip().lower().split())
        L2 = ''

# information2: all words; information3: distinct words in first-seen order.
information2 = []
information3 = []
for token in L:
    information2.append(token)
    if token not in information3:
        information3.append(token)
wordcount = len(information2)
def fhash(w, M):
    """Return the base-37 polynomial hash of the string *w* modulo *M*."""
    weighted = (ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return sum(weighted) % M
# Drop the stop words from the distinct-word list and from the full word list.
information4 = [token for token in information3 if token not in information]
information5 = [token for token in information2 if token not in information]

bo = []     # the items that get counted (hashes or plain words)
bo2 = []    # distinct items of bo, in first-seen order
t = []      # resulting [item, count] pairs
total = 0

if nb == 'y' or nb == 'Y':
    bo = [fhash(token, M) for token in information5]
elif nb == 'n' or nb == 'N':
    bo = list(information5)

for item in bo:
    if item not in bo2:
        bo2.append(item)
for item in bo2:
    t.append([item, sum(1 for other in bo if item == other)])

print('char count =', charactercount)
print('alphanumeric count =', ab12)
print('line count =', linecount)
print('word count =', wordcount)
print('BoW =', t)
# 6330387321 (21.40) 224 (2021-03-22 19:38)
def fhash(w, M):
    """Return the base-37 polynomial hash of the string *w* modulo *M*."""
    G = 37
    return sum(ord(ch) * G ** pos for pos, ch in enumerate(w)) % M
def alphanumeric_count(w):
    """Count the ASCII letters and digits in *w*."""
    alnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    return sum(1 for ch in w if ch in alnum)
def word_count(W):
    """Count the words of *W*, a word being a run of ASCII letters and digits."""
    alnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    cleaned = ''.join(ch if ch in alnum else ' ' for ch in W)
    return len(cleaned.split())
def BoW(w, u, m):
    """Build the bag-of-words of the text *w*.

    Stop words are read from ``stopwords.txt``.  *w* is lowercased and split
    into runs of ASCII letters and digits; stop words are dropped and the
    rest is sorted.  When *u* is ``'y'`` or ``'Y'`` each word is replaced by
    ``fhash(word, m)`` and the hashes are sorted again.  Returns
    ``[item, count]`` pairs in that sorted order.
    """
    alnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    with open("stopwords.txt", "r") as f:
        f1 = f.read().replace('\n', ' ').split()

    cleaned = ''.join(ch if ch in alnum else ' ' for ch in w.lower())
    items = sorted(token for token in cleaned.split() if token not in f1)
    if u in ('y', 'Y'):
        items = sorted(fhash(token, m) for token in items)

    keys = []
    totals = []
    for item in items:
        if item in keys:
            totals[keys.index(item)] += 1
        else:
            keys.append(item)
            totals.append(1)
    return [[key, total] for key, total in zip(keys, totals)]
                
            

file_name = input('File name = ')
ufh = input('Use feature hashing ? (y,Y,n,N) ')
while ufh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
# M defaults to 1 when hashing is off.
M = int(input('M = ')) if ufh in ('y', 'Y') else 1

# Read all lines, remember how many there were, then drop the line breaks.
fn = open(file_name, "r")
nf = fn.readlines()
lc = len(nf)
fn.close()
nf = "".join(nf).replace('\n', '')

cc = len(nf)
ac = alphanumeric_count(nf)
wc = word_count(nf)
b = BoW(nf, ufh, M)

print('-------------------')
for label, value in (
    ('char count = ', cc),
    ('alphanumeric count = ', ac),
    ('line count = ', lc),
    ('word count = ', wc),
    ('BoW = ', b),
):
    print(label + str(value))
# 6330388021 (22.95) 225 (2021-03-22 20:38)
def count_line(file_name):
    """Return the number of lines in the file *file_name*."""
    with open(file_name) as fn:
        return sum(1 for _ in fn)
def fn2st(file_name):
    """Return the content of *file_name* with every newline replaced by a blank."""
    with open(file_name) as fn:
        content = fn.read()
    return content.replace('\n', ' ')
def replace_punctuation(file_name):
    """Return the text of *file_name* (via ``fn2st``) with punctuation blanked.

    The characters ``" ' / \\ , . : ; ( ) [ ] { }`` are each replaced by one blank.
    """
    punctuation = "\"\'/\\,.:;()[]{}"
    return "".join(" " if e in punctuation else e for e in fn2st(file_name))
def remove_stopwords(file_name, stop_words):
    """Return the lowercased words of *file_name* that are not stop words.

    *stop_words* is the name of the stop-word file; its whitespace-separated
    entries are compared against each lowercased word of the text.
    """
    tokens = replace_punctuation(file_name).split()
    stop_list = fn2st(stop_words).split()
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered not in stop_list:
            kept.append(lowered)
    return kept
def BoW(t):
    """Return ``[item, count]`` pairs for *t*, in order of first appearance."""
    pairs = []
    for item in t:
        pair = [item, sum(1 for other in t if item == other)]
        if pair not in pairs:
            pairs.append(pair)
    return pairs
def count_char(file_name):
    """Return the total length of all lines of *file_name* after ``strip()``."""
    with open(file_name) as fn:
        return sum(len(line.strip()) for line in fn)
def fhash(w, M):
    """Return the base-37 polynomial hash of *w* modulo ``int(M)``.

    *M* may be an int or a numeric string.
    """
    total = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w))
    return total % int(M)
def fhash_all(t, M):
    """Return ``fhash(w, M)`` for every item *w* of *t*, in order."""
    return [fhash(w, M) for w in t]
def count_alb(file_name):
    """Count the non-blank characters left after ``replace_punctuation``."""
    return sum(1 for ch in replace_punctuation(file_name) if ch != ' ')
def count_words(file_name):
    """Count the whitespace-separated words left after ``replace_punctuation``."""
    return len(replace_punctuation(file_name).split())
#------------------------------------------------
file_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = fh in ('y', 'Y')
if use_hashing:
    # Kept as typed; fhash() converts it with int().
    M = input('M = ')
fn = remove_stopwords(file_name, 'stopwords.txt')
if use_hashing:
    fn = fhash_all(fn, M)
print('-------------------')
# Each figure is computed right before its line is printed.
for label, counter in (
    ('char count =', count_char),
    ('alphanumeric count =', count_alb),
    ('line count =', count_line),
    ('word count =', count_words),
):
    print(label, counter(file_name))
bow = BoW(fn)
print('BoW =', bow)

    
    
    
    

# 6330389621 (22.15) 226 (2021-03-22 19:45)

#------------------------------------------------------------------#
def blank1(t1):
    """Return *t1* lowercased, without blanks, newlines or ``" ' / \\ , . : ;``."""
    dropped = "\"\'/\\,.:;\n "
    return "".join(k for k in t1.lower() if k not in dropped)
def blank2(t2):
    """Return *t2* lowercased with every newline removed."""
    return t2.lower().replace("\n", "")
def blank3(t3):
    """Return *t3* lowercased with newlines and ``" ' / \\ , . : ;`` turned into blanks."""
    separators = "\"\'/\\,.:;\n"
    return "".join(" " if m in separators else m for m in t3.lower())
def bow(dt, sw):
    """Return the sorted bag-of-words of *dt* without the items listed in *sw*.

    The result is a list of ``[item, count]`` pairs in ascending order.
    """
    kept = sorted(item for item in dt if item not in sw)
    message = []
    for item in kept:
        if message and message[-1][0] == item:
            message[-1][1] += 1
        else:
            message.append([item, 1])
    return message
def delete(dt, sw):
    """Return the distinct items of *dt* that are not in *sw*, in first-seen order."""
    kept = []
    for item in dt:
        if item in sw or item in kept:
            continue
        kept.append(item)
    return kept
def fahash(w, M, G=37):
    """Return the polynomial hash of the string *w* in base *G*, modulo *M*.

    *G* defaults to 37.  It is a parameter because the function used to read
    a module-level ``G`` that is only assigned in one branch of the script,
    so calling it anywhere else raised NameError.
    """
    calc = 0
    for position, ch in enumerate(w):
        calc += ord(ch) * (G ** position)
    return calc % M
def Bow(a):
    """Return ``[item, count]`` pairs for the items of *a*, in ascending order."""
    message = []
    for item in sorted(a):
        if message and message[-1][0] == item:
            message[-1][1] += 1
        else:
            message.append([item, 1])
    return message

    
#------------------------------------------------------------------#

# Read the whole text up front so the handle is closed before the prompts.
with open(input("File name = "), "r") as file_name:
    lines = file_name.readlines()

fhash = input("Use feature hashing ? (y,Y,n,N) ")
# Compare against the four accepted answers: a substring test on
# "y,Y,n,N" would also accept ',' and ''.
while fhash not in ("y", "Y", "n", "N"):
    print("Try again.")
    fhash = input("Use feature hashing ? (y,Y,n,N) ")
use_hashing = fhash in ("y", "Y")
G = 37  # hash base, kept as a module-level name
if use_hashing:
    M = int(input("M = "))

char = ""
alphanumeric = ""
word = []
count_line = 0
x = ""
for line in lines:
    char += blank2(line.lower())
    alphanumeric += blank1(line.lower())
    count_line += 1
    word += line.split()
    x += blank3(line)
data = x.split()

with open("stopwords.txt", "r") as fn:
    stop_word = "".join(blank3(line) for line in fn).split()

if use_hashing:
    # Hash every remaining occurrence.  De-duplicating the words first would
    # count distinct words per bucket instead of occurrences.
    kept = [token for token in data if token not in stop_word]
    show = Bow([fahash(token, M) for token in kept])
else:
    show = bow(data, stop_word)

print("-------------------")
# print() already puts one blank after the label.
print("char count =", len(char))
print("alphanumeric count =", len(alphanumeric))
print("line count =", count_line)
print("word count =", len(word))
print("BoW =", show)







# 6330391821 (21.90) 227 (2021-03-21 21:01)
def alphanumeric_count(x):
    """Return the number of ASCII letters and digits in the string *x*.

    The three ranges are tested separately: a single "A".."z" comparison
    would also count the six characters that sit between "Z" and "a" in
    ASCII (square brackets, backslash, caret, underscore and backtick).
    """
    c = 0
    for ch in x:
        if "0" <= ch <= "9" or "A" <= ch <= "Z" or "a" <= ch <= "z":
            c += 1
    return c
def char_count(x):
    """Return how many characters of *x* are not newline characters."""
    return sum(1 for ch in x if ch != "\n")
def line_count(file_name):
    """Return the number of lines in *file_name*, ignoring trailing blank lines.

    A trailing line is dropped only when it consists of a lone newline
    character; blank lines in the middle of the file are still counted.
    """
    # `with` guarantees the handle is released even if reading fails.
    with open(file_name,'r') as a:
        z = a.readlines()
    # Trim the blank tail in place instead of building a reversed copy.
    while z and z[-1] == "\n":
        z.pop()
    return len(z)
def split_word(x):
    """Split *x* into its maximal runs of ASCII letters and digits.

    Every other character, including the newline, acts as a separator, so
    words on neighbouring lines are never glued together.  Returns a list of
    words in order of appearance (empty when *x* contains no letter or digit).
    """
    chars = []
    for ch in x:
        # Separate range checks: "A" <= ch <= "z" would also accept
        # [ \ ] ^ _ and the backtick.
        if "0" <= ch <= "9" or "A" <= ch <= "Z" or "a" <= ch <= "z":
            chars.append(ch)
        else:
            chars.append(" ")
    # Only spaces separate the words now; split() also drops the empty pieces.
    return "".join(chars).split()
def word_count(x):
    """Return the number of words that split_word() extracts from *x*."""
    return len(split_word(x))
def list_word_count(x):
    """Count how often each item occurs in the list *x*.

    Returns a list of [item, count] pairs ordered by first appearance.
    Items must be hashable (the callers pass strings or integers).
    """
    # A dict keeps insertion order, so one pass replaces the previous
    # "scan the whole list once per distinct item" double loop.
    counts = {}
    for item in x:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]
def Bag_of_word(x):
    """Return the bag of words of the text *x* with stop words removed.

    The stop words are read from "stopword.txt" and tokenised with
    split_word(), as is *x*.  The result is the [word, count] list produced
    by list_word_count().
    """
    # NOTE(review): other scripts in this file read "stopwords.txt" - confirm
    # that "stopword.txt" is the intended file name.
    stop = set()
    with open("stopword.txt",'r') as b:
        for line in b:
            stop.update(split_word(line))
    kept = [w for w in split_word(x) if w not in stop]
    return list_word_count(kept)
def feature_flashing(x,m):
    """Return the bag of hashed features of the text *x*.

    Stop words (read from "stopword.txt") are removed, every remaining word
    w is hashed as sum(ord(w[i]) * 37**i) % m, and the hash values are
    counted with list_word_count().

    *m* may be an int or a numeric string; it is converted with int().
    Raises ZeroDivisionError when m is 0.
    """
    # NOTE(review): other scripts in this file read "stopwords.txt" - confirm
    # that "stopword.txt" is the intended file name.
    stop = set()
    with open("stopword.txt",'r') as b:
        for line in b:
            stop.update(split_word(line))
    kept = [w for w in split_word(x) if w not in stop]
    m = int(m)
    hashed = []
    for w in kept:
        c = 0
        for i, ch in enumerate(w):
            c += ord(ch)*(37**i)
        hashed.append(c % m)
    return list_word_count(hashed)
#__________________________________________________
# Interactive driver: ask for the file and the hashing choice, print the
# statistics, then print either the plain or the hashed bag of words.
file_name = input("File name = ")
feature = input("Use feature hashing ? (y,Y,n,N) ")
while feature not in ['y','Y','n','N']:
    print("Try again.")
    feature = input("Use feature hashing ? (y,Y,n,N) ")
# M defaults to 1 so that feature_flashing() below can be called even when
# hashing was not requested; when entered it stays a string (feature_flashing
# converts it with int()).
M = 1
if feature in ['y','Y']:
    M = input("M = ")
print("-------------------")
a = open(file_name,'r')
char = 0
alphanumeric = 0
word = 0
# y accumulates the lower-cased text with every newline replaced by a space.
y = ""
for line in a:
    line = line.lower()
    for e in range(len(line)):
        if line[e] == "\n":
            y += " "
        else:
            y += line[e]
    char += char_count(line)
    alphanumeric += alphanumeric_count(line)
    word += word_count(line)
print("char count = "+str(char))
print("alphanumeric count = "+str(alphanumeric))
print("line count = "+str(line_count(file_name)))
print("word count = "+str(word))
# Both bags are computed regardless of the user's choice; only one is printed.
x1 = Bag_of_word(y)
x1.sort()
x2 = feature_flashing(y,M)
x2.sort()
a.close()
if feature in ['n','N']:
    print("Bow = "+str(x1))
else:
    print("Bow = "+str(x2))
# 6330392421 (20.50) 228 (2021-03-22 23:05)


# input from user (file_name, feature, (M?))
file_name = input('File name = ')
feature = input('Use feature hashing ? (y,Y,n,N) ')
# Re-prompt until one of the four accepted answers is given.
while feature not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
# M is only requested (and therefore only defined) when hashing was chosen.
# It is kept as the raw input string; fhash() converts it with int().
if feature in ['y', 'Y']:
    M = input('M = ')


# finding list of stop words
# NOTE(review): other scripts in this file read 'stopwords.txt' - confirm
# that 'stopword.txt' is the intended file name.
stop_words = []
with open('stopword.txt', 'r') as stopwin:
    for line in stopwin:
        stop_words.extend(line.strip().split())

# 
def blank(t):
    """Return *t* with separator punctuation turned into spaces.

    The double quote, single quote, slash, backslash, comma, period, colon
    and semicolon are each replaced by one space; the result is then stripped
    of leading and trailing whitespace.
    """
    separators = '\"\'/\\,.:;'
    pieces = [' ' if c in separators else c for c in t]
    return ''.join(pieces).strip()



# read info from file_name
# Single pass over the input file that accumulates the four statistics and
# collects the lower-cased words for the bag of words built below.
fin = open(file_name,'r')
char_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
# Lower-cased words of the file, stop words still included.
list_of_lower_mee_stopwords = []
for line in fin:
    list_of_words = blank(line).split()
    # count all characters of the line (surrounding whitespace stripped)
    char_count += len(line.strip())
    # count lines
    line_count += 1
    # count the characters that belong to words
    for e in list_of_words:
        list_of_lower_mee_stopwords.append(e.strip().lower())
        alphanumeric_count += len(e)
    # count words
    word_count += len(list_of_words)
print('-------------------')
print('char count = ' + str(char_count))
print('alphanumeric count = '+ str(alphanumeric_count))
print('line count = ' + str(line_count))
print('word count = ' + str(word_count))

# Drop the stop words.
list_of_screened_words = []
for e in list_of_lower_mee_stopwords:
    if e not in stop_words:
        list_of_screened_words.append(e)



# Parallel lists: BoW_words_pre[i] is a distinct word (in order of first
# appearance) and counter[i] is its number of occurrences.
BoW_words_pre = []
counter = []
for e in list_of_screened_words:
    if e not in BoW_words_pre:
        BoW_words_pre.append(e)
        counter.append(1)
    else:
        i = BoW_words_pre.index(e)
        counter[i] += 1

# Bag of Words no feature
# Zip the two parallel lists into [word, count] pairs (not sorted).
BoW_words_no_feature = []
for i in range(len(BoW_words_pre)):
    BoW_words_no_feature.append([BoW_words_pre[i],counter[i]])

        
# Bag of Words with feature hashing
BoW_words_fhash = []
def fhash(w,M):
    """Return the base-37 polynomial hash of the string *w* modulo int(M).

    The hash is sum(ord(w[i]) * 37**i) over all positions i.  *M* may be an
    int or a numeric string.
    """
    base = 37
    total = sum(ord(ch) * base**pos for pos, ch in enumerate(w))
    return total % int(M)


# Build the hashed bag of words (only when hashing was requested) and print
# whichever bag matches the user's choice.
if feature in ['y', 'Y']:
    # Hash every screened word; duplicates are kept so they can be counted.
    for e in list_of_screened_words:
        BoW_words_fhash.append(fhash(e,M))

    bow_fhash_final = []
    c = 0
    for e in BoW_words_fhash:
        # Count how many hashed words equal e (a full scan per element).
        for i in range(len(BoW_words_fhash)):
            if e == BoW_words_fhash[i]:
                c += 1
        # The same [hash, count] pair is produced once per occurrence;
        # keep only the first copy.
        if [e,c] not in bow_fhash_final:
            bow_fhash_final.append([e,c])
        c = 0
    bow_fhash_final.sort()


if feature in ['y', 'Y']:
        print('BoW = ' + str(bow_fhash_final))
if feature in ['n', 'N']:
        print('BoW = '+ str(BoW_words_no_feature))

# fin was opened by the counting section above.
fin.close()
# 6330395321 (30.00) 229 (2021-03-22 23:39)

#----------------------------------------------------------------------------
def Allcount(Filename):
    """Print the text statistics of an open file and return its content words.

    *Filename* is an iterable of lines (the open input file).  Prints the
    character count (newlines excluded), the count of lower-cased characters
    in a-z / 0-9, the line count and the word count, then returns the list of
    words that are not in the module-level `stopwords` list.  Words are
    obtained by passing each lower-cased line through words() and splitting
    on whitespace.
    """
    alnum_chars = 'abcdefghijklmnopqrstuvwxyz1234567890'
    n_lines = 0
    n_alnum = 0
    n_chars = 0
    tokens = []
    for raw in Filename:
        n_chars += len(raw.strip('\n'))
        lowered = raw.lower()
        n_lines += 1
        n_alnum += sum(1 for ch in lowered if ch in alnum_chars)
        tokens.extend(words(lowered).split())

    sentence = [tok for tok in tokens if tok not in stopwords]

    print('char count =', n_chars)
    print('alphanumeric count =', n_alnum)
    print('line count =', n_lines)
    print('word count =', len(tokens))
    return sentence

#----------------------------------------------------------------------------
def flash(word,M):
    """Return the feature hash of *word*: sum(ord(word[i]) * 37**i) % M."""
    total = 0
    for pos, ch in enumerate(word):
        total += ord(ch) * 37 ** pos
    return total % M

#----------------------------------------------------------------------------
def words(line):
    """Return *line* with every character outside a-z / 0-9 replaced by a space.

    Upper-case letters are not in the accepted set, so callers are expected
    to lower-case the text first.
    """
    keep = set('abcdefghijklmnopqrstuvwxyz1234567890')
    return ''.join(ch if ch in keep else ' ' for ch in line)

#----------------------------------------------------------------------------

# Stop words: every whitespace-separated token of stopwords.txt.
stopwords = []
st = open('stopwords.txt', 'r')
for line in st:
    for word in line.split():
        stopwords.append(word)
st.close()

file_name = input('File name = ')
Filename = open(file_name, 'r')
fh = input('Use feature hashing ? (y,Y,n,N) ')

# Compare against the four accepted answers explicitly.  A substring test
# against 'y,Y,n,N' would also accept '', ',' or 'y,Y'.
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')

if fh.lower() == 'y' :
    # Hashed bag of words: count the words per hash bucket.
    N = input('M = ')
    M = int(N)
    print('-------------------')
    sentence = Allcount(Filename)
    nword = []
    bow = []
    repeat = []
    for word in sentence:
        # Hash once per word instead of once per comparison.
        hashed = flash(word,M)
        if hashed not in repeat:
            repeat.append(hashed)
            bow.append([hashed,1])
        else:
            for w in bow:
                if hashed == w[0]:
                    w[1] += 1
    arrange = sorted(bow)
    print('BoW =',arrange)

else:
    # Plain bag of words: count the occurrences of each word.
    # NOTE(review): unlike the hashing branch, no '-------------------'
    # separator is printed here - confirm that this is intended.
    repeat = []
    bow = []
    sentence = Allcount(Filename)
    for word in sentence:
        if word in repeat:
            for t in bow:
                if t[0] == word:
                    t[1] += 1
        else:
            bow.append([word,1])
            repeat.append(word)
    arrangeb = sorted(bow)
    print('BoW =',arrangeb)

Filename.close()


# 6330396021 (18.00) 230 (2021-03-22 21:21)

def list_only_word(str_data):
    """Return *str_data* reduced to lower-case words separated by spaces.

    Every character that is not an ASCII letter or digit becomes a space;
    the result is lower-cased and stripped at both ends.  Runs of separators
    are not collapsed.
    """
    allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    cleaned = "".join(ch if ch in allowed else " " for ch in str_data)
    return cleaned.lower().strip()
def char_count(file_name):
    """Return the total length of all lines of *file_name* after strip().

    strip() removes the newline and any other leading or trailing whitespace
    of each line before it is measured.
    """
    # `with` closes the file even if reading raises.
    with open(file_name,"r") as fin:
        return sum(len(line.strip()) for line in fin)
def alphanumeric_count(file_name):
    """Return the number of ASCII letters and digits contained in *file_name*."""
    words = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    # `with` closes the file even if reading raises; the characters are
    # counted while streaming instead of concatenating the whole file first.
    with open(file_name,"r") as fin:
        return sum(1 for line in fin for ch in line if ch in words)
def line_count(file_name): #fin = open("sample.txt","r")
    """Return the number of lines in *file_name* (0 for an empty file)."""
    # Counting every line directly avoids the former "lines after the first
    # one, plus 1" formula, which reported 1 line for an empty file.
    with open(file_name,"r") as fin:
        return sum(1 for _ in fin)
def word_count(file_name):
    """Return the number of words in *file_name*.

    The text is normalised with list_only_word() and split on whitespace.
    """
    # The file used to be left open; `with` closes it.
    with open(file_name,"r") as fin:
        text = fin.read()
    return len(list_only_word(text).split())

def normal_BoW(file_name):
    """Return the bag of words of *file_name* as a sorted list of [word, count].

    Words are extracted with list_only_word() (lower-cased, letters and
    digits only).  Stop words are read from the module-level file object
    `fin2` and removed before counting.
    """
    with open(file_name,"r") as fin1:      # previously never closed
        str_words = fin1.read()

    # fin2 is shared with the rest of the script: rewind it so that this
    # function still sees the stop words if the file was already read.
    fin2.seek(0)
    # The result of strip().lower() used to be thrown away, so the stop
    # words were matched case-sensitively against lower-cased words.
    list_stopwords = set(fin2.read().strip().lower().split())

    counts = {}
    for q in list_only_word(str_words).split():
        if q not in list_stopwords :
            counts[q] = counts.get(q, 0) + 1
    return [[word, counts[word]] for word in sorted(counts)]
def fhash(w,M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % M."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M
def num_BoW(file_name,M):
    """Return the hashed bag of words of *file_name*.

    Words are extracted with list_only_word(), stop words (read from the
    module-level file object `fin2`) are removed, and every remaining word is
    replaced by fhash(word, M).  The result is a list of [hash, count] pairs
    sorted by hash value.
    """
    with open(file_name,"r") as fin1:      # previously never closed
        str_words = fin1.read()

    # fin2 is shared with the rest of the script: rewind it so that this
    # function still sees the stop words if the file was already read.
    fin2.seek(0)
    # The result of strip().lower() used to be thrown away, so the stop
    # words were matched case-sensitively against lower-cased words.
    list_stopwords = set(fin2.read().strip().lower().split())

    counts = {}
    for q in list_only_word(str_words).split():
        if q not in list_stopwords :
            hashed = fhash(q,M)
            counts[hashed] = counts.get(hashed, 0) + 1
    return [[hashed, counts[hashed]] for hashed in sorted(counts)]

# Interactive driver.  fin2 is the stop-word file read by normal_BoW() and
# num_BoW(); it must be open before either of them is called.
fin2 = open("stopwords.txt","r")
file_name = input("File name = ")
yes_no = input("Use feature hashing ? (y,Y,n,N) ")

# Compare against the four accepted answers explicitly.  A substring test
# against "nNyY" would also accept "" and multi-letter answers such as "nN".
while yes_no not in ("n", "N", "y", "Y") :
    print("Try again.")
    yes_no = input("Use feature hashing ? (y,Y,n,N) ")

use_hashing = yes_no in ("y", "Y")
if use_hashing :
    # M is requested before the separator line is printed.
    M = int(input("M = "))
print("-------------------")
# NOTE(review): the labels below contain underscores ("alphanumeric_count =")
# while other scripts in this file print "alphanumeric count =" - confirm the
# expected output format.
print("char count =",char_count(file_name))
print("alphanumeric_count =",alphanumeric_count(file_name))
print("line_count =",line_count(file_name))
print("word_count =",word_count(file_name))
if use_hashing :
    print("BoW =",num_BoW(file_name,M))
else :
    print("BoW =",normal_BoW(file_name))
fin2.close()

    

# 6330397621 (21.40) 231 (2021-03-21 21:13)

# fhash function     
def fhash(string,M):
    """Return the feature hash of *string*, or *string* itself when M is 0.

    M == 0 means "hashing disabled": the word is passed through unchanged.
    Otherwise the result is sum(ord(string[i]) * 37**i) % M.
    """
    if M == 0:
        return string
    total = 0
    for position, ch in enumerate(string):
        total += ord(ch) * 37 ** position
    return total % M

#Calculate parameter and Bag of word function
def BoW(file_name,M):
    """Print the text statistics and the bag of words of *file_name*.

    Prints the character count (newlines excluded), the number of letters
    and digits, the line count, the word count and the sorted bag of words.
    Words are maximal runs of alphanumeric characters of the lower-cased
    text; words found in the module-level `stop_words` are skipped and the
    remaining ones are mapped through fhash(word, M) before being counted.
    Returns None.
    """
    with open(file_name) as f:
        lines = f.readlines()
    line = len(lines)
    lowered = "".join(lines).lower()
    flat = lowered.replace('\n','')
    char = len(flat)
    alp = sum(1 for c in flat if c.isalpha() or c.isdigit())
    # Tokenise the text that still contains its newlines: a newline is not
    # alphanumeric, so it separates words instead of gluing the last word of
    # a line to the first word of the next.  split() without an argument
    # returns [] for empty text, so an empty file has 0 words, not 1.
    wordlist = "".join(c if c.isalnum() else " " for c in lowered).split()
    words = len(wordlist)

    # One pass with a dict replaces a list.count() call per word.
    counts = {}
    for w in wordlist:
        if w not in stop_words:
            key = fhash(w,M)
            counts[key] = counts.get(key, 0) + 1
    bag = sorted([key, n] for key, n in counts.items())

    print('-------------------')
    print('char count =',char)
    print('alphanumeric count =',alp)
    print('line count =',line)
    print('word count =',words)
    print('BoW =',bag)


#Open stop words
# stop_words ends up as the list of tokens of stopwords.txt, split on single
# spaces after newlines were turned into spaces (so it may contain empty
# strings where two separators are adjacent).
with open('stopwords.txt') as f:
    stop_words = f.read()
    stop_words = stop_words.replace('\n',' ')
    stop_words = stop_words.split(' ')

# User input
file_name = input("File name = ")

# Ask until a valid answer is given.  M = 0 is the "no hashing" marker
# understood by fhash().
# NOTE(review): this prompt starts with a lower-case "use" while other scripts
# in this file print "Use feature hashing ? ..." - confirm the expected text.
while True:
    use_M = input('use feature hashing ? (y,Y,n,N) ')
    if use_M in ['y','Y']:
        M = int(input('M = '))
        break
    elif use_M in ['n','N']:
        M=0
        break
    else:
        print('Try again.')

BoW(file_name,M)
    



    






# 6330398221 (30.00) 232 (2021-03-21 19:16)

def flash(w,M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % M."""
    base = 37
    total = sum(ord(ch) * base ** power for power, ch in enumerate(w))
    return total % M
#-----------------------------------
# Interactive driver: reads the options, prints the four statistics and
# finally the (plain or hashed) bag of words.
file_name=input('File name = ')
use=input('Use feature hashing ? (y,Y,n,N) ')
while use!='Y' and use!='y' and use!='N' and use!='n':
    print('Try again.')
    use=input('Use feature hashing ? (y,Y,n,N) ')
# M is only defined when hashing was requested.
if use=='Y' or use=='y':
    M=int(input('M = '))

#-----------------------------------
# Stop words: every whitespace-separated token of stopwords.txt.
stop=open('stopwords.txt','r')
stopwords=[]

for e in stop:
    s=e.split()
    stopwords+=s
stop.close()

#-----------------------------------

# Read the file once: word1 is the full text, word_2 the list of raw lines.
# `word` (each line without its last character) is not used further in this
# script.
file=open(file_name,'r')
word=''
word1=''
word_2=[]
for x in file:

    word+=x[:-1:]
    word1+=x
    word_2.append(x)
# n_word=len(word)+1
n_word=0
# Drop trailing lines that are empty or a lone newline; the loop runs once
# per original line and removes at most one line per pass.
for l in range(len(word_2)):
    if word_2[-1]=='' or word_2[-1]=='\n':
        word_2=word_2[:-1:]
# Character count: line lengths without the terminating newline.
for ch in word_2:
    if ch[-1]=='\n':
        n_word+=len(ch)-1
    else:
        n_word+=len(ch)
print('-'*19)
print('char count = '+str(n_word))
#-----------------------------------

# Tokenise: a word is a maximal run of characters from `collect` in the
# lower-cased text.
word1=word1.lower()
collect='0123456789abcdefghijklmnopqrstuvwxyz'
list_word=[]
word_1=''
for x1 in word1:
    if x1 in collect:
        word_1+=x1
    elif word_1!='':
        list_word.append(word_1)
        word_1=''
# Flush the last word when the text does not end with a separator.
if word_1!='':
    list_word.append(word_1)
#-----------------------------------
# Alphanumeric count = total length of all words.
alphabet_count=0
for x2 in list_word:
    alphabet_count+=len(x2)
print('alphanumeric count = '+str(alphabet_count))
#-----------------------------------



# Line count excludes the trailing blank lines removed above.
line_count=len(word_2)
print('line count = '+str(line_count))
#-----------------------------------

word_count=len(list_word)
print('word count = '+str(word_count))
#-----------------------------------
# Remove the stop words.
word_cut=[]
for w in list_word:
    if w not in stopwords:
        word_cut.append(w)

#-----------------------------------
# Bag of words: sort the items, then count runs of equal neighbours.
BoW=[]
if use=='Y' or use=='y':
    BoW_y=[]
    for By in word_cut:
        BoW_y.append(flash(By,M))
    BoW_y.sort()
    ny=1
    if word_cut==[]:
        BoW=[]
    else:
        for Byes in range(1,len(BoW_y)):
            if BoW_y[Byes]!=BoW_y[Byes-1]:
                # The run of the previous value ended: emit it.
                BoW.append([BoW_y[Byes-1],ny])
                ny=1
            else:
                ny+=1
        # Emit the final run.
        BoW.append([BoW_y[-1],ny])

else:
    nn=1
    word_cut.sort()
    if word_cut==[]:
        BoW=[]
    else:
        for Bno in range(1,len(word_cut)):
            if word_cut[Bno]!=word_cut[Bno-1]:
                BoW.append([word_cut[Bno-1],nn])
                nn=1
            else:
                nn+=1
        BoW.append([word_cut[-1],nn])
file.close()
print('BoW = '+str(BoW))
#-----------------------------------


# 6330399921 (22.34) 233 (2021-03-22 01:23)

def file_to_lowerstr(file):
    """Join the lines of *file* into one lower-case string.

    *file* is an iterable of non-empty lines.  A trailing newline of a line
    is replaced by a single space.  Returns [text, number_of_lines].
    """
    pieces = []
    n_lines = 0
    for line in file:
        if line[-1] == '\n':
            pieces.append(line[:-1] + ' ')
        else:
            pieces.append(line)
        n_lines += 1
    return [''.join(pieces).lower(), n_lines]
def del_punc(txt):
    """Return the list of words of *txt* with ASCII punctuation removed.

    Every character of string.punctuation is replaced by a space and the
    result is split on whitespace.  The previous hand-written character list
    omitted the comma, period, vertical bar, tilde and backtick, so a token
    such as "hello," kept its punctuation.
    """
    import string  # local import: only this function needs it

    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # split() never yields empty strings, so no further clean-up is needed.
    return txt.translate(table).split()
def del_stp(lis,stp):
    """Return a new list holding the items of *lis* that are not in *stp*.

    The order of the remaining items is preserved and *lis* is not modified.
    """
    return [item for item in lis if item not in stp]
def count_wrd(lis):
    """Count the occurrences of every item of *lis*.

    Returns a list of [item, count] pairs in order of first appearance.
    Items must be hashable (the caller passes words).
    """
    # A dict keeps insertion order and replaces the former linear
    # `in` / `index` look-ups made for every word.
    counts = {}
    for item in lis:
        counts[item] = counts.get(item, 0) + 1
    return [[item, n] for item, n in counts.items()]
def fhash(wrd,m):
    """Return sum(ord(wrd[i]) * 37**i) modulo int(m).

    *m* may be an int or a numeric string.
    """
    total = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(wrd))
    return total % int(m)
def act_fh(lis,m):
    """Convert a word-count list into a hash-count list.

    *lis* holds [word, count] pairs.  Each word is replaced by
    fhash(word, m) and the counts of words sharing a hash value are added
    up.  Returns [hash, count] pairs in order of first appearance.
    """
    merged = {}
    for entry in lis:
        bucket = fhash(entry[0], m)
        merged[bucket] = merged.get(bucket, 0) + entry[1]
    return [[bucket, total] for bucket, total in merged.items()]
def main():
    """Interactive entry point: print the text statistics and bag of words.

    Reads the file name and the hashing choice from standard input; the stop
    words come from 'stopwords.txt'.
    """
    file_name=input('File name = ')
    # Both files are fully consumed here and closed even if reading fails.
    with open(file_name,'r') as file, open('stopwords.txt','r') as stp:
        z=file_to_lowerstr(file)
        stp_lis=file_to_lowerstr(stp)[0].split()
    d=z[1]          # number of lines
    a=z[0]          # lower-cased text, newlines replaced by spaces
    b=del_punc(a)   # list of words
    x=''.join(b)    # all word characters, used for the alphanumeric count
    # del_stp() returns a new list, so b can be reused instead of calling
    # del_punc() a second time on the same text.
    file_lis=del_stp(b,stp_lis)
    Bow=count_wrd(file_lis)
    while True:
        check=input('Use feature hashing ? (y,Y,n,N) ')
        # Compare against the accepted answers explicitly: a substring test
        # such as `check in 'yY'` is also true for '' and for 'yY'.
        if check in ('y','Y'):
            m=input('M = ')
            print('-------------------')
            Bow=act_fh(Bow,m)
            break
        elif check in ('n','N'):
            print('-------------------')
            break
        else:
            print('Try again')


    # NOTE(review): `a` already contains one space per newline, so
    # len(a)+d-1 looks larger than the number of characters in the file -
    # confirm the intended formula.
    print('char count = ',len(a)+d-1)
    print('alphanumeric count = ',len(x))
    print('line count = ',d)
    print('word count = ',len(b))
    print('BoW = ',Bow)

main()
# 6330400821 (19.65) 234 (2021-03-21 12:40)
# Interactive driver written as one `while True:` loop.  The "y" branch and
# the "n" branch each print the four statistics and the bag of words and then
# leave the loop; any other answer prints "Try again." and prompts again.
file_name = input("File name = ")
choice = input("Use feature hashing ? (y,Y,n,N) ")
sum_len = 0
while True :
 if choice == "y" or choice == "Y" :
    M_putin = input("M = ")
    print("-------------------")
    # stop_word_list is ONE string: all non-whitespace characters of
    # stopwords.txt concatenated.  The membership tests against it further
    # down are therefore substring tests, not whole-word tests.
    # The handle stop_word is not closed in this block.
    stop_word = open("stopwords.txt","r")
    stop_word_list = ''
    for i in stop_word :
        i.strip()   # result discarded
        for e in i :
          e.strip()   # result discarded
          stop_word_list += e.strip()
    # Character count: sum of the stripped line lengths.
    my_file = open(file_name,"r")
    for i in my_file :
        sum_len += len(i.strip())
    my_file.close()
    print("char count = " + str(sum_len))
    #---------------------------------------
    # Alphanumeric count: letters (case-insensitive) and digits.
    my_file = open(file_name,"r")
    alphanumeric_count = 0
    for e in my_file :
        for i in e.strip() :
          if i.lower() in "qwertyuiopasdfghjklzxcvbnm" :
             alphanumeric_count += 1
          if i in "0123456789" :
             alphanumeric_count += 1
          else :
             continue
    my_file.close()
    print("alphanumeric count = " + str(alphanumeric_count))
    #--------------------------------------------------
    my_file = open(file_name,"r")
    line_count = 0
    for i in my_file :
        line_count += 1
    print("line count = " + str(line_count))
    my_file.close()
    #--------------------------------------------------
    # Word extraction: walks every line character by character and looks one
    # character ahead to decide where a word ends.
    my_file = open(file_name,"r")
    component = ''
    list_ti = []
    for e in my_file :
        i = 0
        while i < len(e.strip()) :
            # Not yet at the last character of the stripped line.
            if i - len(e.strip()) != -1 :
               if e[i].lower() in "qwertyuiopasdfghjklzxcvbnm0123456789" and e[i+1].lower() in "qwertyuiopasdfghjklzxcvbnm0123456789":
                component += e[i]
               # NOTE(review): this test does not check e[i] itself, so a
               # non-alphanumeric character followed by another one is also
               # appended as a "word" - confirm.
               if e[i+1].lower() not in "qwertyuiopasdfghjklzxcvbnm0123456789" :
                component += e[i]
                list_ti.append(component)
                component = ''
            # Last character of the stripped line.
            if  i - len(e.strip()) == -1 :
                # NOTE(review): a single character is compared with the whole
                # 36-character string, which is never equal; `in` was
                # probably intended.
                if e[i] == "qwertyuiopasdfghjklzxcvbnm0123456789" :
                  component += e[i]
                  list_ti.append(component)
                  component = ''
                  break
            i += 1
    print("word count = " + str(len(list_ti)))
    my_file.close()
    #--------------------------------------------------------
    my_file = open(file_name,"r")
    G = 37
    su_m = 0
    # Helper returning the code point of a single character.
    def flash(string):
            z = ord(string)
            return z
    # Helper reducing the accumulated sum modulo M (M arrives as a string).
    def hlash(su_m, M_putin) :
            T = su_m % int(M_putin)
            return T
    # Not called anywhere in this block.
    def sortgunti(one,li_st) :
            i = 0
            e = 0
            number = 0
            while i < len(li_st) :
                if li_st.find(one,i) != -1 :
                    e += 1
                    i += 1
            return e
    # Returns [unique, z] where z is derived from how often `unique` is found
    # in `words`.  The list is modified while counting: the value is removed
    # once per pass and appended again when no copy is left.
    def get_unique(unique, words):
        z = 1
        for i in range(len(words)):
            words.remove(unique)
            if unique not in words:
                words.append(unique)
            else:
                z += 1

        return [unique, z]


    # Hash every word that is not found in the stop-word string:
    # sum(ord(ch) * G**p) % M.
    for_1word = []
    for i in list_ti :
        p = 0
        if i.lower() not in stop_word_list :
          for e in i :
            su_m += flash(e)*(G**p)
            p += 1
          for_1word.append(hlash(su_m, M_putin))
          su_m = 0

    # Rebinding i has no effect on the list; the list is sorted once per
    # element.
    for i in for_1word :
        i = int(i)
        for_1word.sort()

    i = 0
    # `set` shadows the built-in of the same name from here on.
    set = []
    while i < len(for_1word) :
        for_1word.sort()
        unique = for_1word[i]
        # get_unique() is called twice per value: once for the pair that is
        # stored and once for the step width.
        set.append(get_unique(unique,for_1word))
        i += get_unique(unique,for_1word)[1]
    set.sort()
    print("BoW = " + str(set))
    my_file.close()
    break
 if choice == "n" or choice == "N" :
    # Same statistics as in the branch above, without hashing.
    print("-------------------")
    # stop_word_list is ONE string (see the note in the "y" branch).
    stop_word = open("stopwords.txt", "r")
    stop_word_list = ''
    for i in stop_word:
        i.strip()   # result discarded
        for e in i:
            e.strip()   # result discarded
            stop_word_list += e.strip()
    my_file = open(file_name, "r")
    for i in my_file:
        sum_len += len(i.strip())
    my_file.close()
    print("char count = " + str(sum_len))
    # ---------------------------------------
    my_file = open(file_name, "r")
    alphanumeric_count = 0
    for e in my_file:
        for i in e.strip():
            if i.lower() in "qwertyuiopasdfghjklzxcvbnm":
                alphanumeric_count += 1
            if i in "0123456789":
                alphanumeric_count += 1
            else:
                continue
    my_file.close()
    print("alphanumeric count = " + str(alphanumeric_count))
    # --------------------------------------------------
    my_file = open(file_name, "r")
    line_count = 0
    for i in my_file:
        line_count += 1
    print("line count = " + str(line_count))
    my_file.close()
    # --------------------------------------------------
    # Word extraction, same look-ahead scheme as in the "y" branch (and the
    # same review notes apply).
    my_file = open(file_name, "r")
    component = ''
    list_ti = []
    for e in my_file:
        i = 0
        while i < len(e.strip()):
            if i - len(e.strip()) != -1:
                if e[i].lower() in "qwertyuiopasdfghjklzxcvbnm0123456789" and e[
                    i + 1].lower() in "qwertyuiopasdfghjklzxcvbnm0123456789":
                    component += e[i]
                if e[i + 1].lower() not in "qwertyuiopasdfghjklzxcvbnm0123456789":
                    component += e[i]
                    list_ti.append(component)
                    component = ''
            if i - len(e.strip()) == -1:
                # NOTE(review): compares one character with the whole string;
                # never equal.
                if e[i] == "qwertyuiopasdfghjklzxcvbnm0123456789":
                    component += e[i]
                    list_ti.append(component)
                    component = ''
                    break
            i += 1
    print("word count = " + str(len(list_ti)))
    my_file.close()
    # --------------------------------------------------------
    my_file = open(file_name, "r")
    G = 37
    su_m = 0


    # flash, hlash and sortgunti are defined but not called in this branch.
    def flash(string):
        z = ord(string)
        return z


    def hlash(su_m, M_putin):
        T = su_m % int(M_putin)
        return T


    def sortgunti(one, li_st):
        i = 0
        e = 0
        number = 0
        while i < len(li_st):
            if li_st.find(one, i) != -1:
                e += 1
                i += 1
        return e


    # See the description of get_unique() in the "y" branch.
    def get_unique(unique, words):
        z = 1
        for i in range(len(words)):
            words.remove(unique)
            if unique not in words:
                words.append(unique)
            else:
                z += 1

        return [unique, z]

    i = 0
    # `set` shadows the built-in of the same name from here on.
    set = []
    while i < len(list_ti):
      list_ti.sort()
      if list_ti[i].lower() not in stop_word_list :
         # NOTE(review): the lower-cased word is searched in list_ti, which
         # holds the words in their original case - confirm that
         # words.remove() cannot fail for capitalised words.
         unique = list_ti[i].lower()
         set.append(get_unique(unique, list_ti))
         i += get_unique(unique, list_ti)[1]
      else :
          i += 1
    print("BoW = " + str(set))
    my_file.close()
    break
 else :
    # Reached for every answer other than n/N (the "y" branch leaves the loop
    # through its own break before getting here).
    print("Try again.")
    choice = input("Use feature hashing ? (y,Y,n,N) ")





        














# 6330401421 (22.00) 235 (2021-03-22 23:28)

# -------------------------------------------------------
def inp():
    """Read the run options from standard input.

    Returns (file_name, choice, M) where choice is 'y' or 'n' (lower-cased)
    and M is the number of hash buckets; M stays 1 when hashing is not
    requested.
    """
    M = 1
    file_name = input("File name = ")
    enable_fhash = input("Use feature hashing ? (y,Y,n,N) ")
    # Compare against the accepted answers explicitly: a substring test
    # against 'yn' would also accept '' and 'yn'.
    while enable_fhash.lower() not in ('y', 'n'):
        print("Try again.")
        enable_fhash = input("Use feature hashing ? (y,Y,n,N) ")
    if enable_fhash.lower() == 'y':
        M = int(input("M = "))
    return file_name, enable_fhash.lower(), M
# -------------------------------------------------------
def fhash(w,M):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % M."""
    G = 37
    return sum(ord(ch) * (G**i) for i, ch in enumerate(w)) % M
# -------------------------------------------------------
def stopwords():
    """Return the lower-cased, whitespace-separated tokens of 'stopwords.txt'."""
    words = []
    # `with` closes the file even if reading raises.
    with open('stopwords.txt','r') as file:
        for line in file:
            words.extend(word.lower() for word in line.strip().split())
    return words
# -------------------------------------------------------
def filereader(fname,fhash,M):
    """Read *fname* and return its statistics and words.

    Returns (char_count, alphanumeric_count, line_count, word_count, words).
    Characters are counted on each line after strip(); a word is a maximal
    run of ASCII letters and digits and is stored lower-cased.  The *fhash*
    and *M* parameters are accepted for the caller's convenience but are not
    used.
    """
    charcount = 0; alnucount = 0; linecount = 0
    words = []
    with open(fname) as file:
        for line in file:
            linecount += 1
            word = ""
            # strip() already removed the newline, so no newline correction
            # is needed while counting characters.
            for ch in line.strip():
                charcount += 1
                if ('a' <= ch <= 'z') or ('A' <= ch <= 'Z') or ('0' <= ch <= '9'):
                    alnucount += 1
                    word += ch
                elif word:
                    words.append(word.lower())
                    word = ""
            # Flush the word that runs up to the end of the line; it used to
            # be lost unless the line ended with a non-alphanumeric character.
            if word:
                words.append(word.lower())
    return charcount, alnucount, linecount, len(words), words
# -------------------------------------------------------
def bow(words, stopwords, M, fh):
    """Return the sorted bag of words of *words*.

    words     -- list of (lower-cased) words
    stopwords -- collection of words to ignore
    M         -- number of hash buckets, used only when fh == 'y'
    fh        -- 'y' to count fhash(word, M) values, 'n' to count the words

    Returns a sorted list of [key, count] pairs; an empty list for any other
    value of *fh*.
    """
    if fh not in ('y', 'n'):
        return []
    stop = set(stopwords)
    # One counting pass replaces a list.count() call per element.
    counts = {}
    for word in words:
        if word in stop:
            continue
        key = fhash(word,M) if fh == 'y' else word
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, n] for key, n in counts.items())
# -------------------------------------------------------
# Script entry: read the options, gather the statistics and print the report.
fname, fhsh, M = inp()
# The second argument is the fhash function object itself.
cc, ac, lc, wc, words = filereader(fname,fhash,M)
print('-' * 19)
print('char count =',cc)
print('alphanumeric count =',ac)
print('line count =',lc)
print('word count =',wc)
print('BoW =',bow(words, stopwords(), M, fhsh))
# 6330402021 (23.13) 236 (2021-03-21 23:25)

def fhash(w,m):
    """Return the feature hash of *w*: sum(ord(w[i]) * 37**i) % m."""
    total = 0
    for position, ch in enumerate(w):
        total += ord(ch) * (37 ** position)
    return total % m
#-------------------------------------------------------------------------------------
# Interactive driver: reads the options, prints the statistics and the
# (plain or hashed) bag of words.
file_name = input("File name = ")

hashing = input("Use feature hashing ? (y,Y,n,N) ")
while hashing not in ["Y","y","n","N"]:
    print("Try again.")
    hashing = input("Use feature hashing ? (y,Y,n,N) ")
# m is only defined when hashing was requested.
if hashing in ["Y","y"]:
    m = int(input("M = "))
print("-------------------")

# Stop words: maximal runs of letters in the stop-word file, lower-cased.
# NOTE(review): other scripts in this file read "stopwords.txt" - confirm
# that "stopword.txt" is the intended file name.
stopword = open("stopword.txt","r")
sw = []
for i in stopword:
    word_char1 = ""
    for a in i:
        if a.isalpha() == True :
            word_char1 += a.lower()
        elif word_char1 != "" :
            sw.append(word_char1)
            word_char1 = ""
        else:
            word_char1 = ""
# Flush the word left over from the last line (file not ending in a newline).
if word_char1.isalpha() == True :
    sw.append(word_char1)
stopword.close()

line = 0
char = 0
alnum = 0

file = open(file_name,"r")

# Every line is assumed to end with a newline (len(i)-1); the correction
# after the loop adds 1 back when the very last character is not a newline.
for i in file:
    line += 1
    char += len(i)-1
    for a in i:
        if a.isalnum() == True :
            alnum += 1
        h = a
# h holds the last character read; it is unbound when the file is empty.
if h != "\n" :
    char += 1
file.close()
print("char count = ",char)
print("alphanumeric count = ",alnum)
print("line count = ",line)
file = open(file_name,"r")

# Second pass: words are maximal runs of alphanumeric characters, lower-cased.
word_char = []
for i in file :
    word_char1 = ""
    for a in i:
        if a.isalnum() == True :
            word_char1 += a.lower()
        elif word_char1 != "" :
            word_char.append(word_char1)
            word_char1 = ""
        else:
            word_char1 = ""
# Flush the word left over from the last line.
if word_char1.isalnum() == True :
    word_char.append(word_char1)

file.close()

word = len(word_char)
print("word count = ",word)

# Remove the stop words.
word_clear = []
for i in word_char:
    if i not in sw :
        word_clear.append(i)
# Count every word with a full scan and keep each [word, count] pair once;
# the pairs stay in order of first appearance (no sort).
BoW = []
for i in word_clear:
    bow_count = 0
    for a in range(len(word_clear)):
        if i == word_clear[a]:
            bow_count += 1
    if [i,bow_count] not in BoW :
        BoW.append([i,bow_count])

if hashing in "Nn":
    print("BoW = ",BoW)
elif hashing in "Yy":
    # Replace every word by its hash, then add up the counts of the words
    # that share a hash value.
    BoW_fhash = []
    for i in BoW :
        BoW_fhash.append([fhash(i[0],m),i[1]])
    BoW_fhash_clear = []
    for i in BoW_fhash:
        bow_fhash_count = 0
        for a in range(len(BoW_fhash)):
            if i[0] == BoW_fhash[a][0]:
                bow_fhash_count += BoW_fhash[a][1]
        if [i[0],bow_fhash_count] not in BoW_fhash_clear :
            BoW_fhash_clear.append([i[0],bow_fhash_count])

    print("BoW = ",BoW_fhash_clear)
# 6330403721 (25.00) 237 (2021-03-20 19:15)

#---------------------------------------------------------------
def file_to_calw(file_name):
    """Print the text statistics of *file_name* and return its words.

    *file_name* is an iterable of strings (the caller passes an open file).
    The text is lower-cased; every character that is not an ASCII letter or
    digit acts as a word separator.

    Prints the character count (newlines excluded), the alphanumeric count,
    the line count (number of newlines plus one) and the word count, then
    returns the list of words.
    """
    raw = ''.join(file_name).lower()
    newline_total = raw.count('\n')

    separated = ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in raw
    )
    tokens = separated.split()

    print('char count =', len(raw) - newline_total)
    print('alphanumeric count =', len(''.join(tokens)))
    print('line count =', newline_total + 1)
    print('word count =', len(tokens))

    return tokens
def _stopwords(text):
    
    stop = open('stopwords.txt', 'r')
    stop_str = ''
    for i in stop:
        stop_str += i
    stop_list = stop_str.split()
    fi_text = []
    for i in range(len(text)):
        fi_text.append(text[i])
    for i in range(len(text)):
        if text[i] in stop_list:
            fi_text.remove(text[i])

    return(fi_text)
def get_unique(text):
    """Return the distinct items of *text* in ascending order.

    *text* is sorted in place, exactly as before, so callers that rely on the
    list being sorted afterwards keep working.
    """
    text.sort()

    unique_text = []
    for word in text:
        # Compare with the last item kept rather than with text[i-1]: at
        # i == 0 that index wrapped round to the final element, so a list
        # whose first and last items were equal (e.g. a single word) came
        # back empty.
        if not unique_text or unique_text[-1] != word:
            unique_text.append(word)

    return unique_text
def bow_1(unique_text, fi_text):
    """Build the bag of words as ``[word, count]`` pairs.

    For every entry of *unique_text*, in order, the pair holds the entry and
    the number of times it occurs in *fi_text*.
    """
    return [[term, fi_text.count(term)] for term in unique_text]
def fhash(w, m):
    """Return the feature hash of word *w* in the range of modulus *m*.

    The hash is the sum of ``ord(char) * 37**position`` over the characters
    of *w*, reduced modulo *m*.
    """
    total = 0
    for position, char in enumerate(w):
        total += ord(char) * 37 ** position
    return total % m
def bow_fhash(bow):
    """Fold a ``[word, count]`` bag of words into hash buckets.

    Each word is hashed with ``fhash(word, m)``, where ``m`` is the
    module-level bucket count, and the counts of words sharing a bucket are
    added up. Returns ``[bucket, total]`` pairs sorted by bucket.

    An empty *bow* yields an empty list for every ``m``.
    """
    totals = {}
    for word, count in bow:
        bucket = fhash(word, m)
        totals[bucket] = totals.get(bucket, 0) + count

    # Grouping through a dict avoids comparing the first entry with the last
    # one (index -1), which raised IndexError whenever all words shared one
    # bucket and m was not 1.
    return [[bucket, totals[bucket]] for bucket in sorted(totals)]

    
#---------------------------------------------------------------

# Interactive entry point: ask for the file and the hashing choice, then print
# the statistics followed by the bag of words.
file = input("File name = ")
yn = input("Use feature hashing ? (y,Y,n,N) ")
while yn not in ('y', 'Y', 'n', 'N'):
    print("Try again.")
    yn = input("Use feature hashing ? (y,Y,n,N) ")

use_hashing = yn in ('y', 'Y')
if use_hashing:
    # bow_fhash reads this module-level value.
    m = int(input('M = '))

print('-------------------')
# The file is only needed while file_to_calw reads it; close it right after.
with open(file, 'r') as file_name:
    text = file_to_calw(file_name)
fi_text = _stopwords(text)
unique_text = get_unique(fi_text)
bow1 = bow_1(unique_text, fi_text)

if use_hashing:
    # Same 'BoW =' label as the non-hashed output, which this branch lacked.
    print('BoW =', bow_fhash(bow1))
else:
    print('BoW =', bow1)
# 6330404321 (23.00) 238 (2021-03-22 00:48)

# Stop words: every whitespace-separated token of stopwords.txt.
with open("stopwords.txt", 'r') as stop_file:
    stop_words = stop_file.read().split()

num_lines = 0
num_words = 0
num_chars = 0
num_alpha_numeric = 0
filename = input("File name = ")
converted_words = []
with open(filename, 'r') as f:
    for line in f:
        line = line.strip('\n')
        num_lines += 1
        num_chars += len(line)
        num_alpha_numeric += sum(char.isalnum() for char in line)
        # Anything that is not alphanumeric separates words.
        new_line = "".join(char if char.isalnum() else " " for char in line)
        # Lower-case the words so that "The" and "the" are filtered and
        # counted as the same word.
        # NOTE(review): assumes stopwords.txt holds lower-case words - confirm.
        words = new_line.lower().split()
        converted_words.extend(words)
        num_words += len(words)

stop_set = set(stop_words)
bag_of_words = [w for w in converted_words if w not in stop_set]

while True:
    do_hash = input("Use feature hashing ? (y,Y,n,N) ")
    if do_hash == 'y' or do_hash == 'Y':
        m = int(input("M = "))
        # Replace every word by sum(ord(c) * 37**i) mod m.
        bag_of_words = [
            sum(ord(c) * (37 ** i) for i, c in enumerate(w)) % m
            for w in bag_of_words
        ]
        break
    elif do_hash == 'n' or do_hash == 'N':
        break
    else:
        print("Try again.")

# Count every distinct word (or hash value) in a single pass.
totals = {}
for w in bag_of_words:
    totals[w] = totals.get(w, 0) + 1
bow_count = sorted([w, n] for w, n in totals.items())

print("char count =", num_chars)
print("alphanumeric count =", num_alpha_numeric)
print("line count =", num_lines)
print("word count =", num_words)
print("BoW =", bow_count)
# 6330405021 (29.00) 239 (2021-03-22 22:17)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def fhash(x,y):
    """Return sum(ord(x[i]) * 37**i) over the characters of x, modulo y."""
    return sum(ord(ch)*37**pos for pos,ch in enumerate(x))%y
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Ask for the input file and open it together with the stop-word list.
# Both handles stay open until the end of the script, where they are closed.
file_name=input('File name = ')
f1=open(file_name,'r',encoding='utf-8')
sw=open('stopwords.txt','r')
# swl collects every whitespace-separated token of stopwords.txt.
swl=[]
for line in sw:
    swl+=line.strip().split()
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Keep asking until the answer is exactly one of y, Y, n or N.
while True:
    use=input('Use feature hashing ? (y,Y,n,N) ')
    # Compare against whole answers: a substring test on 'yYnN' also accepted
    # inputs such as 'yY' or 'Yn'.
    if use in ('y','Y','n','N'):
        break
    print('Try again.')
# Character count, alphanumeric count, raw lines, word count, filtered words.
cc,ac,l,wc,a4w=0,0,[],0,[]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Single pass over the input: gather the statistics and the filtered word list.
for line in f1:
    # The character count excludes the line's trailing newline, if present.
    cc+=len(line)-int(line[-1]=='\n')
    l.append(line)
    s,re='',''  # NOTE(review): s is never used afterwards
    # Keep ASCII letters (lower-cased), digits and spaces; anything else
    # becomes a space so that it separates words.
    for i in line:
        if i in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ':
            re+=i.lower()
        else:
            re+=' '
    ac+=len(''.join(re.split()))
    wc+=len(re.split())
    # Words of this line that are not stop words.
    a4w+=[m for m in re.split() if m not in swl]
# Drop trailing lines that consist of a lone newline so they do not count
# towards the line total.
for i in range(len(l)):
    if l[-1]=='\n':
        l=l[:-1]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# L1 holds the distinct keys (words or hash values) in order of first
# appearance; L2 holds the matching counts at the same index.
L1=[]
L2=[]
if use in 'yY':
    M=int(input('M = '))
    for i in a4w:
        if fhash(i,M) not in L1:
            L1.append(fhash(i,M))
            L2+=[1]
        else:
            L2[L1.index(fhash(i,M))]+=1
else:
    for i in a4w:
        if i not in L1:
            L1.append(i)
            L2+=[1]
        else:
            L2[L1.index(i)]+=1
BoW=[[L1[i],L2[i]] for i in range(len(L1))]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Report; the bag of words is printed in sorted order.
print('-'*19)
print('char count =',cc)
print('alphanumeric count =',ac)
print('line count =',len(l))
print('word count =',wc)
print('BoW =',sorted(BoW))
f1.close()
sw.close()

# 6330406621 (21.40) 240 (2021-03-22 01:28)

# Interactive bag-of-words report with optional feature hashing.
a = input("File name = ")
b = input("Use feature hashing ? (y,Y,n,N) ")

# Characters that count as alphanumeric once a line has been lower-cased.
alnum_chars = "1234567890abcdefghijklmnopqrstuvwxyz"

# Stop words: every whitespace-separated token of stopwords.txt.
sw = []
with open("stopwords.txt", "r") as read:
    for line in read:
        sw += line.split()

while b not in ("n", "N", "y", "Y"):
    print("Try again.")
    b = input("Use feature hashing ? (y,Y,n,N) ")

use_hashing = b in ("y", "Y")
if use_hashing:
    M = int(input("M = "))

ccc = 0  # characters, counted per line after stripping surrounding whitespace
c2 = 0   # lines
x2 = 0   # alphanumeric characters
cleaned_lines = []
with open(a, "r") as file_name:
    for line in file_name:
        ccc += len(line.strip())
        c2 += 1
        cleaned = ""
        for ch in line.lower().strip():
            if ch in alnum_chars:
                cleaned += ch
                x2 += 1
            else:
                cleaned += " "
        cleaned_lines.append(cleaned)

# Join the lines with a space: concatenating the stripped lines directly
# fused the last word of one line with the first word of the next.
wn = " ".join(cleaned_lines).split()
stop_set = set(sw)
wf = [word for word in wn if word not in stop_set]

if use_hashing:
    # Feature hash of a word: sum(ord(char) * 37**position) mod M.
    keys = [
        sum(ord(ch) * 37 ** t for t, ch in enumerate(word)) % M
        for word in wf
    ]
else:
    keys = wf

# Count each key once; the dict keeps the order of first appearance.
totals = {}
for key in keys:
    totals[key] = totals.get(key, 0) + 1
xl = [[key, n] for key, n in totals.items()]

print("char count = " + str(ccc))
print("alphanumeric count = " + str(x2))
print("line count = " + str(c2))
print("word count = " + str(len(wn)))
print("BoW =", xl)
# 6330407221 (30.00) 241 (2021-03-21 21:14)
# Interactive bag-of-words report with optional feature hashing. The two
# answers share one pipeline and differ only in how the result is printed.

def _is_alnum(ch):
    """Return True for an ASCII letter or digit."""
    return ('a' <= ch <= 'z') or ('A' <= ch <= 'Z') or ('0' <= ch <= '9')


def fhash(a, M):
    """Return sum(ord(a[i]) * 37**i) over the characters of a, modulo M."""
    total = 0
    for i, ch in enumerate(a):
        total += ord(ch) * (37 ** i)
    return total % M


# NOTE(review): the prompt reads 'File.name' - probably meant 'File name';
# left unchanged because it is program output.
file_name = input('File.name = ')
# Read the file once, straight away, so a bad path still fails before the
# hashing question is asked.
with open(file_name, 'r') as d:
    lines = d.readlines()

usehashing = input('Use feature hashing ? (y,Y,n,N) ')
while usehashing not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    usehashing = input('Use feature hashing ? (y,Y,n,N) ')

use_hash = usehashing in ('y', 'Y')
if use_hash:
    Numberofbow = int(input('M = '))

# Stop words: every whitespace-separated token of stopwords.txt.
with open('stopwords.txt', 'r') as c:
    set_2 = c.read().split()
print('-------------------')

# Whitespace-separated tokens of the whole text.
set_4 = [token for text_line in lines for token in text_line.split()]

# character_count: a line's newline is not counted.
charrr_count = sum(
    len(text_line) - 1 if '\n' in text_line else len(text_line)
    for text_line in lines
)
print('char count =', charrr_count)

# number and eng_character
count = sum(1 for token in set_4 for ch in token if _is_alnum(ch))
print('alphanumeric count', '=', count)

# line_count
line_count = len(lines)
print('line count', '=', line_count)

# wordcount: non-alphanumeric characters inside a token split it further.
p = []
for token in set_4:
    p.extend(''.join(ch if _is_alnum(ch) else ' ' for ch in token).split())
wordcount = len(p)
print('word count', '=', wordcount)

# bow: lower-case, drop stop words, count each word, in sorted order.
stop_set = set(set_2)
check_2 = sorted(w.lower() for w in p if w.lower() not in stop_set)
word_totals = {}
for w in check_2:
    word_totals[w] = word_totals.get(w, 0) + 1
bow = [[w, n] for w, n in word_totals.items()]

if use_hash:
    # feature hashing: add up the counts of words sharing a hash value.
    hash_totals = {}
    for w, n in bow:
        key = fhash(w, Numberofbow)
        hash_totals[key] = hash_totals.get(key, 0) + n
    use = [[key, hash_totals[key]] for key in sorted(hash_totals)]
    print('BoW', '=', use)
else:
    print('BoW =', bow)
    


        
    


    
    




# 6330408921 (30.00) 242 (2021-03-21 19:29)

def BoW(list_str):
    """Return ``[item, count]`` pairs for the items of *list_str*.

    *list_str* is sorted in place first, so the pairs come out in ascending
    order of item.
    """
    list_str.sort()

    pairs = []
    for item in list_str:
        # After sorting, equal items are adjacent: extend the current run or
        # start a new one.
        if pairs and pairs[-1][0] == item:
            pairs[-1][1] += 1
        else:
            pairs.append([item, 1])
    return pairs
def fhash(w, M):
    """Return sum(ord(char) * 37**position) over *w*, modulo ``int(M)``.

    *M* may be an int or a numeric string.
    """
    weighted = [ord(char) * (37 ** position) for position, char in enumerate(w)]
    return sum(weighted) % int(M)

# Read the input file and the stop-word list, then print the statistics and
# the bag of words (optionally feature-hashed).
file_name = input('File name = ')
file = open(file_name, 'r')
st = open('stopwords.txt', 'r')
StrFile = ''
StopFile = ''
char_count = 0
c = []
for line in file:
    # Count every character except newlines.
    for s in line:
        if s != '\n':
            char_count += 1
    c.append(line)
    # Lower-cased, stripped lines are joined with a space so that words on
    # adjacent lines stay separate.
    StrFile += line.lower().strip()+' '
StrFileNoEtc = ''
# Discard trailing lines that are only a newline before counting the lines.
for i in range(len(c)):
    if c[-1] == '\n':
        c = c[:-1]
line_count = len(c)
# Keep digits, lower-case ASCII letters and spaces; anything else becomes a
# space so that it separates words.
for s in StrFile:
    if '0' <= s <= '9' or 'a' <= s <= 'z' or s == ' ':
        StrFileNoEtc += s
    else:
        StrFileNoEtc += ' '

StrFileNoStop = []
for line in st:
    StopFile += line.strip()+' '
# From here on StopFile and StrFileNoEtc are lists of words.
StopFile = StopFile.split()
StrFileNoEtc = StrFileNoEtc.split()
# StrFileNoStop: the words of the text that are not stop words.
for i in range(len(StrFileNoEtc)):
    if not StrFileNoEtc[i] in StopFile:
        StrFileNoStop.append(StrFileNoEtc[i])

# The answer is lower-cased, so only 'y' and 'n' need checking.
x = input('Use feature hashing ? (y,Y,n,N) ').lower()
while x != 'y' and x != 'n':
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ').lower()

if x == 'y':
    # m stays a string here; fhash converts it with int().
    m = input('M = ')
    fh = []
    for s in StrFileNoStop:
        fh.append(fhash(s, m))
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', len(''.join(StrFileNoEtc)))
    print('line count =', line_count)
    print('word count =', len(StrFileNoEtc))
    print('BoW =', BoW(fh))

else:
    print('-------------------')
    print('char count =', char_count)
    print('alphanumeric count =', len(''.join(StrFileNoEtc)))
    print('line count =', line_count)
    print('word count =', len(StrFileNoEtc))
    print('BoW =', BoW(StrFileNoStop))
st.close()
file.close()
# 6330409521 (0.00) 243 (2021-03-22 22:08)
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) over the characters of w, modulo int(M)."""
    base = 37
    total = sum(ord(ch)*(base**idx) for idx, ch in enumerate(w))
    return total % int(M)
# Ask for the input file, then keep asking for the hashing mode until the
# answer is one of y, Y, n, N.
file_name = input('File name = ')
while True:
    # NOTE(review): the prompt says 'User feature hashing' and has no trailing
    # space - possibly meant 'Use feature hashing ? (y,Y,n,N) '; confirm.
    mode = input('User feature hashing ? (y,Y,n,N)')
    if mode not in ['y','Y','n','N']:
        print('Try again')
    elif mode == 'y' or mode == 'Y':
        # M is kept as a string; fhash converts it with int().
        M = input('M = ')
        break
    elif mode == 'n' or mode == 'N':
        # -1 is a placeholder; M is only passed to fhash when hashing is on.
        M = -1
        break
    
    
print('--------------------')

# Stop words: lower-cased, whitespace-separated tokens of stopwords.txt.
with open('stopwords.txt','r') as fle:
    stw2 = fle.read().lower().split()

y = 0   # characters, counted per line after stripping surrounding whitespace
p = 0   # characters inside words that are not in the punctuation list below
q = 0   # lines
v = []  # lower-cased words of the text

# One pass over the user's file. The word pass used to open the literal name
# 'file_name.txt' instead of the file the user entered, and the line count
# was overwritten with a hard-coded 4.
with open(file_name,"r") as fle:
    for line in fle:
        q += 1
        y = y + len(line.strip())
        for o in line.strip().split():
            for u in o:
                if u not in ['\"','\'',',','.','|','/',';',':']:
                    p += 1
        # NOTE(review): only punctuation at the two ends of the LINE is
        # stripped, so words keep attached punctuation such as 'word,'.
        k = line.lower()
        k = k.strip('\n')
        k = k.strip(',')
        k = k.strip('"')
        k = k.strip('.').split()
        v.extend(k)

# copy1 keeps every word for the word count; v keeps only non-stop words.
copy1 = v.copy()
v = [w for w in v if w not in stw2]

bow = []
checker = []
def count( data, element ):
    """Return how many items of *data* compare equal to *element*."""
    return sum(1 for item in data if item == element)

# Build the bag of words in order of first appearance. `checker` records the
# keys already emitted so that each key is counted only once.
if mode == 'y' or mode =='Y':
    # Hash every remaining word; the hash values are stored as strings.
    fhash_value = []
    for i in range(len(v)):
        g = fhash(str(v[i]),M)
        fhash_value.append(str(g))
    for i in range(len(fhash_value)):
        semibow = []
        if fhash_value[i] in checker:
            pass
        else:
            amount = count(fhash_value,str(fhash_value[i]))
            checker.append(str(fhash_value[i]))
            semibow.append(str(fhash_value[i]))
            semibow.append(amount)
            bow.append(semibow)
        
else:    
    # No hashing: count the words themselves.
    for i in range(len(v)):
        semibow = []
        if v[i] in checker:
                 pass
        else:
            amount = count(v,str(v[i]))
            checker.append(str(v[i]))
            semibow.append(str(v[i]))
            semibow.append(amount)
            bow.append(semibow)
    
    
# Report. The word count uses copy1, the word list saved before the stop
# words were removed from v.
print('char count = '+str(y))
print('alphanumeric count = '+str(p))
print('line count = '+str(q))
print('word count = '+str(len(copy1)))
print('BoW = '+ str(bow))

fle.close()
# 6330410021 (22.58) 244 (2021-03-21 22:55)
def fhash(w,m):
    """Return sum(ord(w[i]) * 37**i) over the characters of w, modulo m."""
    return sum(ord(ch)*37**k for k,ch in enumerate(w))%m
    
def yakword(x):
    """Read file *x* and return ``[words, chars, alnum, word_count, lines]``.

    * words      - every word of the file, as produced by ``clear`` + split
    * chars      - number of characters, newlines excluded
    * alnum      - total length of all words
    * word_count - ``len(words)``
    * lines      - number of lines
    """
    e=[]
    allc=0
    alphac=0
    nline=0
    # `with` closes the file, which the old code never did.
    with open(x,'r') as r:
        for l in r:
            nline+=1
            # Only subtract the newline when the line really ends in one; a
            # final line without a newline used to be counted one short.
            allc+=len(l)-(1 if l.endswith('\n') else 0)
            for w in clear(l).split():
                e.append(w)
                # clear() turns every separator into a space, so each
                # character left in a word counts.
                alphac+=len(w)
    wn=len(e)
    return [e,allc,alphac,wn,nline]
           
def clear(x):
    """Return *x* lower-cased with every separator character turned into a space.

    The separators are the punctuation marks listed below plus newline and
    space; all other characters are kept.
    """
    separators='"\'\\/,-=-+.#$%^&*()[]{}:;<>?|\n '
    return ''.join(' ' if ch in separators else ch for ch in x.lower())
    
def bow1(word):
    """Return ``[word, count]`` pairs for the non-stop words in *word*.

    Stop words are the words of ``stopwords.txt`` as parsed by ``yakword``.
    The pairs appear in order of first occurrence.
    """
    stop_list=yakword('stopwords.txt')[0]
    kept=[]
    for token in word:
        if token in kept or token in stop_list:
            continue
        kept.append(token)
    return [[token,word.count(token)] for token in kept]
def bow2(word,m):
    """Return the feature-hashed bag of words for *word* with modulus *m*.

    The ``[word, count]`` pairs from ``bow1`` are hashed with ``fhash``; the
    counts of words that share a hash value are added together. The result is
    a list of ``[hash, total]`` pairs sorted by hash.
    """
    totals={}
    for term,freq in bow1(word):
        key=fhash(term,m)
        totals[key]=totals.get(key,0)+freq
    merged=[[key,total] for key,total in totals.items()]
    merged.sort()
    return merged
def main():
    """Run the interactive report.

    Asks for the file name and whether to use feature hashing (plus M when
    it is used), then prints the character, alphanumeric, line and word
    counts followed by the bag of words.
    """
    yn =['y','Y','n','N']
    x=input('File name = ')
    y=input('Use feature hashing ? (y,Y,n,N) ')
    while not y in yn:
        print('Try again.')
        y=input('Use feature hashing ? (y,Y,n,N) ')
    use_hashing = y in ('y','Y')
    if use_hashing:
        m=int(input('M = '))
    # yakword returns [words, chars, alnum, word_count, lines].
    e=yakword(x)
    print('-------------------')
    print('char count =',e[1])
    print('alphanumeric count =',e[2])
    print('line count =',e[4])
    print('word count =',e[3])
    if use_hashing:
        # Was 'Bow =': use the same label as the non-hashed branch.
        print('BoW =',bow2(e[0],m))
    else:
        print('BoW =',bow1(e[0]))
# Start the interactive report as soon as this code is executed.
main()        
    
    

# 6330411721 (27.00) 245 (2021-03-21 16:15)

def makelist(file):
    """Return the words of the file at path *file*, as cleaned by ``lowerb``."""
    return lowerb(file).split()
    
def lowerb(file):
    """Return the text of the file at path *file*, cleaned for word splitting.

    ASCII letters and digits are kept; every other character, including
    newlines, becomes a space. The result is lower-cased.
    """
    # `with` closes the file, which the old code never did.
    with open(file,'r') as infile:
        text = infile.read()

    cleaned = ''.join(
        e if ('A' <= e <= 'Z' or 'a' <= e <= 'z' or '0' <= e <= '9') else ' '
        for e in text
    )
    return cleaned.lower()
def removestop(words,stop):
    """Return the items of *words* that do not occur in *stop*, in order."""
    return [token for token in words if token not in stop]
def charc(file):
    """Return the number of characters in *file*, newlines excluded.

    *file* is an iterable of lines, such as an open text file.
    """
    charC = 0
    for line in file:
        # Subtract the newline only when the line really ends in one. The old
        # code assumed the last line never did and over-counted by one for
        # files that end with a newline.
        if line.endswith('\n'):
            charC += len(line) - 1
        else:
            charC += len(line)
    return charC
def alph(words):
    """Return the total number of characters across all strings in *words*."""
    return len(''.join(words))
def line(file):
    """Return the number of items produced by iterating *file* (its lines)."""
    return sum(1 for _ in file)
def wordc(words):
    """Return the number of words in *words*."""
    return len(words)
    
def bow(data):
    """Return ``[word, count]`` pairs for *data* in order of first occurrence."""
    tally = {}
    for token in data:
        tally[token] = tally.get(token, 0) + 1
    return [[token, seen] for token, seen in tally.items()]
def fhash(data,M):
    """Return the feature-hashed bag of words for the word list *data*.

    Each word is hashed as sum(ord(char) * 37**position) mod ``int(M)``. The
    result is a list of ``[hash, count]`` pairs sorted in ascending order.
    """
    modulus = int(M)
    tally = {}
    for word in data:
        key = sum(ord(ch)*(37**pos) for pos, ch in enumerate(word)) % modulus
        tally[key] = tally.get(key, 0) + 1
    return sorted([key, seen] for key, seen in tally.items())

#---------------------------------------------------------------------------------

# Interactive entry point: ask for the file and the hashing choice, then print
# the statistics and the bag of words.
filename = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against whole answers: the substring test on 'yYnN' accepted '' and
# 'yY', after which neither branch ran and nothing was printed.
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')

data = makelist(filename)
stop = makelist('stopwords.txt')
dataNoStop = removestop(data,stop)

use_hashing = fh in ('y', 'Y')
if use_hashing:
    # Kept as a string; fhash converts it with int().
    m = input('M = ')

print('-------------------')
# charc and line each consume the file, so it is opened once for each.
with open(filename,'r') as filen:
    print('char count =', charc(filen))
print('alphanumeric count =',alph(data))
with open(filename,'r') as filen:
    print('line count =',line(filen))
print('word count =',wordc(data))
if use_hashing:
    print('BoW =',fhash(dataNoStop,m))
else:
    print('BoW =',bow(dataNoStop))
    
# 6330412321 (30.00) 246 (2021-03-22 21:13)
# Path of the text file to analyse; asked once, before the hashing question.
file_name = input('File name = ')
def char_count(a):
    """Return the number of characters in the file at path *a*.

    Each line is stripped of leading and trailing whitespace (which removes
    the newline) before its length is added.
    """
    # Open the path that was passed in; the parameter used to be ignored in
    # favour of the global file_name.
    with open(a,'r') as handle:
        return sum(len(line.strip()) for line in handle)
    
def alphanumeric_count(a):
    """Return the number of ASCII letters and digits in the file at path *a*."""
    # Open the path that was passed in; the parameter used to be ignored in
    # favour of the global file_name.
    with open(a,'r') as handle:
        return sum(
            1
            for line in handle
            for e in line
            if '0' <= e <= '9' or 'A' <= e <= 'Z' or 'a' <= e <= 'z'
        )
def line_count(a):
    """Return the number of lines in the file at path *a*."""
    # Open the path that was passed in; the parameter used to be ignored in
    # favour of the global file_name.
    with open(a,'r') as handle:
        return sum(1 for _ in handle)
def word_count(a):
    """Return the number of words in the file at path *a*.

    A word is a maximal run of ASCII letters and digits; every other
    character, including the newline, separates words. An empty file has
    zero words.
    """
    word_chars = ('abcdefghijklmnopqrstuvwxyz'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  '0123456789')
    # Open the path that was passed in; the parameter used to be ignored in
    # favour of the global file_name.
    with open(a,'r') as handle:
        text = handle.read()
    # Splitting the whole text at once also covers the empty file, where the
    # per-line split never ran and the result variable was left unbound.
    cleaned = ''.join(e if e in word_chars else ' ' for e in text)
    return len(cleaned.split())
def BoW(a):
    """Return the bag of words of the file at path *a*.

    The text is lower-cased and split into runs of ASCII letters and digits.
    Words that also occur in ``stopwords.txt`` (tokenised the same way) are
    dropped. The result is a list of ``[word, count]`` pairs in order of
    first occurrence.
    """
    word_chars = 'abcdefghijklmnopqrstuvwxyz0123456789'

    def _tokens(path):
        # Lower-case the file and split it on everything outside word_chars.
        with open(path, "r") as handle:
            text = handle.read().lower()
        return ''.join(e if e in word_chars else ' ' for e in text).split()

    stop = set(_tokens("stopwords.txt"))
    # Read the path that was passed in; the parameter used to be ignored in
    # favour of the global file_name.
    tally = {}
    for word in _tokens(a):
        if word not in stop:
            tally[word] = tally.get(word, 0) + 1
    return [[word, seen] for word, seen in tally.items()]
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) over the characters of w, modulo M."""
    total = 0
    for position, letter in enumerate(w):
        total += int(ord(letter)*((37)**position))
    return total%M
def new_bow(a):
    """Return the feature-hashed bag of words for the file at path *a*.

    The ``[word, count]`` pairs from ``BoW(a)`` are hashed with
    ``fhash(word, M)``, where ``M`` is the module-level modulus; counts of
    words sharing a hash value are added together. The result is a list of
    ``[hash, total]`` pairs sorted in ascending order.
    """
    totals = {}
    for word, seen in BoW(a):
        key = fhash(word,M)
        totals[key] = totals.get(key, 0) + seen
    merged = [[key, total] for key, total in totals.items()]
    merged.sort()
    return merged
            


# Ask whether to hash, then print the four statistics and the bag of words.
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')

hashing_on = fh in ('y', 'Y')
if hashing_on:
    # new_bow reads this module-level value.
    M = int(input('M = '))

print('-------------------')
print('char count = '+str(char_count(file_name)))
print('alphanumeric count = '+str(alphanumeric_count(file_name)))
print('line count = '+str(line_count(file_name)))
print('word count = '+str(word_count(file_name)))
if hashing_on:
    print('BoW = '+str(new_bow(file_name)))
else:
    print('BoW = '+str(BoW(file_name)))

    
    



# 6330413021 (30.00) 247 (2021-03-21 22:09)
'''def senstrip(sentence) :
    a = ''
    for e in sentence :
        if e.isalnum():
            a += e
        else :
            a += ' '
    return a'''
def senstrip(sentence) :
    """Return *sentence* reduced to digits, lower-case ASCII letters and spaces.

    Digits and lower-case letters are kept, newlines are dropped, and every
    other character is replaced by a space.
    """
    pieces = []
    for ch in sentence :
        if ch == '\n' :
            continue
        keep = '0' <= ch <= '9' or 'a' <= ch <= 'z'
        pieces.append(ch if keep else ' ')
    return ''.join(pieces)

def read_file(file):
    """Return the lines of the file at path *file*, lower-cased and cleaned.

    Each line is lower-cased and passed through ``senstrip``.
    """
    with open(file) as f:
        return [senstrip(row.lower()) for row in f]
def fhash(words,m) :
    """Return sum(ord(words[i]) * 37**i) over the characters of words, modulo m."""
    return sum(ord(ch)*37**(pos) for pos, ch in enumerate(words)) % m
def spstrip(word):
    """Return *word* with every non-alphanumeric character removed."""
    return ''.join(ch for ch in word if ch.isalnum())

# Interactive entry point: read the file, drop the stop words and print the
# statistics and the bag of words (optionally feature-hashed).
file_name = input('File name = ')
hashornot = input('Use feature hashing ? (y,Y,n,N) ')
while hashornot not in ['Y','y','N','n'] :
   print('Try again.')
   hashornot = input('Use feature hashing ? (y,Y,n,N) ')
stopwords = ' '.join(read_file('stopwords.txt')).split() + ['']
datalist = read_file(file_name)
chain = ' '.join(datalist).split()
words = [e for e in chain if e not in stopwords]
bow = []
if hashornot in ['Y','y'] :
    m = int(input('M = '))
    # Count the hash values that actually occur. Scanning every value in
    # range(m) rescans all the words once per bucket, which is impractical
    # for a large M.
    tally = {}
    for x in words :
        key = fhash(x,m)
        tally[key] = tally.get(key, 0) + 1
    bow = [[key, tally[key]] for key in sorted(tally)]

elif hashornot in ['N','n'] :
    words.sort()
    # Equal words are adjacent after sorting: record each run, then jump
    # over it.
    ind = 0
    while ind != len(words) :
        bow.append([words[ind],words.count(words[ind])])
        ind += words.count(words[ind])
alphacount = len(''.join(chain))
linecount = len(datalist)
charcount = len(''.join(datalist))
wordcount = len(chain)
print('-------------------')
print('char count =',charcount)
print('alphanumeric count =',alphacount)
print('line count =',linecount)
print('word count =',wordcount)
print('BoW =',sorted(bow))

# 6330415221 (27.65) 248 (2021-03-22 23:37)

# Character classes used by remove(): upper-case letters, lower-case letters
# and digits.
Alp = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
alp = 'abcdefghijklmnopqrstuvwxyz'
num = '1234567890'
def remove(t) : # used to count English letters and digits
    """Return *t* with every character that is not an ASCII letter or digit removed."""
    return ''.join(ch for ch in t if ch in Alp or ch in alp or ch in num)
def remove_punc(t) : # used to count words
    """Return the words of *t* after turning punctuation into spaces.

    Every ASCII punctuation character is replaced by a space and the result
    is split on whitespace.
    """
    # '#' was the one ASCII punctuation mark missing from this set, so a word
    # such as 'a#b' was not split.
    punctuation = "\"'#/\\().,;:!$%&*+-<=>?@[]^_`{|}~"
    ex = ''.join(' ' if ch in punctuation else ch for ch in t)
    return ex.split()

def fhash(w,M) :
    """Return sum(ord(w[i]) * 37**i) over the characters of w, modulo int(M)."""
    base = 37
    total = sum(ord(ch)*(base**pos) for pos, ch in enumerate(w))
    return total % int(M)
    
def find(s, t) :
    """Return a list with every item of *s* that equals *t*."""
    return [item for item in s if item == t]


file_name = input('File name = ')
choose = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against whole answers: the substring test on 'yYnN' accepted '' and
# 'yY', after which no bag of words was printed at all.
while choose not in ('y', 'Y', 'n', 'N') :
    print('Try again.')
    choose = input('Use feature hashing ? (y,Y,n,N) ')

if choose == 'y' or choose == 'Y' :
    # Kept as a string; fhash converts it with int().
    M = input('M = ')
print('-------------------')

# Per-file statistics, accumulated line by line.
char_count = 0
alp_count = 0
word_count = 0
line_count = 0
infile = open(file_name, "r")
# splitlines() drops the line terminators, so they never count as characters.
temp = infile.read().splitlines()

for line in temp : # accumulate characters, alphanumerics, words and lines
    char_count += len(line)
    alp_count += len(remove(line))
    word_count += len(remove_punc(line))
    line_count += 1
infile.close()

print('char count =', char_count)
print('alphanumeric count =', alp_count)
print('line count =', line_count)
print('word count =', word_count)

# Stop words: every whitespace-separated token of stopwords.txt.
infile2 = open("stopwords.txt", "r")
sw = []
while True :
    line = infile2.readline()
    # readline() returns '' only at end of file.
    if line == '' :
        break
    else :
        sw += line.split()
        
infile2.close()

# text: all lower-cased words of the file; text2: the same without stop words.
text = ' '.join(temp)
text = remove_punc(text.lower())
text2 = []
for i in range(len(text)) :
    if not text[i] in sw :
        text2.append(text[i])

# Without hashing: [word, frequency] pairs in order of first appearance.
if choose == 'n' or choose == 'N' :
    BoW = []
    words = []
    for i in range(len(text2)) :
        if not text2[i] in words :
            words.append(text2[i])
            frequency = text2.count(text2[i])
            BoW.append([text2[i], frequency])
    print('BoW =', BoW)

# With hashing: [hash, frequency] pairs sorted in ascending order.
if choose == 'y' or choose == 'Y' :
    fhash_num = []
    for i in range(len(text2)) :
        fhash_num.append(fhash(text2[i],M))

    p = []
    f = []
    k = []
    BoW_Y = []
    # p collects, once per distinct hash value, the list of all its
    # occurrences as returned by find().
    for i in range(len(fhash_num)) : 
        u = find(fhash_num, fhash_num[i])
        if u not in p :
            p.append(u)
        else :
            pass
    
    # The length of each group is the frequency; its first item is the hash.
    for i in range(len(p)) :
        a = p[i]
        f.append(len(a))
        k.append(a[0])

    for i in range(len(k)) :
        BoW_Y.append([k[i], f[i]])
    
    BoW_Y.sort()
    print('BoW =', BoW_Y)
# 6330416921 (3.00) 249 (2021-03-21 23:28)
#----------------------------------------------------- 
def line_count(a) :
    """Return the number of lines in the file at path ``a``, as a string."""
    with open(a, 'r') as handle:
        total = sum(1 for _ in handle)
    return str(total)
#----------------------------------------------------- 
def char_count(a):
    """Return the character count of the file at path ``a``, as a string.

    Each line is stripped of leading/trailing whitespace (including the
    newline) before its characters are counted.  Every line of the file is
    scanned; a blank line contributes zero instead of ending the scan early.
    """
    with open(a, 'r') as fin:
        return str(sum(len(line.strip()) for line in fin))
#-----------------------------------------------------
def alp_count(a):
    """Return the number of ASCII letters and digits in file ``a``, as a string.

    The file named by ``a`` is read (previously a hard-coded 'sample.txt' was
    opened regardless of the argument), and blank lines no longer stop the scan.
    """
    with open(a, 'r') as fin:
        total = sum(
            1
            for line in fin
            for ch in line
            if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'
        )
    return str(total)
#-----------------------------------------------------
def remove_punc(t):
    """Return ``t`` with every character that is not an ASCII letter or digit
    replaced by a single space."""
    return "".join(
        ch if ('a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9') else ' '
        for ch in t
    )
#-----------------------------------------------------
def word_count(a):
    """Return the number of words in the file at path ``a``, as a string.

    A word is a whitespace-separated token left after remove_punc() has
    replaced non-alphanumeric characters by spaces.  Every line is counted;
    a blank line no longer ends the scan early.
    """
    with open(a, 'r') as fin:
        return str(sum(len(remove_punc(line).split()) for line in fin))
#-----------------------------------------------------
def remove_stopword(t):
    """Return the items of ``t`` that are not listed in 'stopwords.txt'.

    The stop-word file is read from the current directory on every call and
    is closed again before returning (the handle used to be leaked).
    """
    with open('stopwords.txt', 'r') as fin:
        # Whitespace-separated tokens across all lines; a set gives O(1) lookup.
        stopword = set(fin.read().split())
    return [e for e in t if e not in stopword]
#-----------------------------------------
def bownothash(a):
    """Return the sorted bag of words ``[[word, count], ...]`` for file ``a``.

    Each line is lower-cased, punctuation is replaced by spaces via
    remove_punc(), and stop words are dropped via remove_stopword().  The
    whole file is processed; a blank line no longer ends the scan early.
    """
    counts = {}
    with open(a, 'r') as fin:
        for line in fin:
            tokens = remove_punc(line.lower().strip()).split()
            for word in remove_stopword(tokens):
                counts[word] = counts.get(word, 0) + 1
    # Words are unique keys, so sorting the pairs orders them by word.
    return sorted([word, n] for word, n in counts.items())
 
#-----------------------------------------------------
def fhash(a,m):
    """Return sum(ord(a[i]) * 37**i) modulo ``int(m)``."""
    power = 1
    total = 0
    for ch in a:
        total += ord(ch) * power
        power *= 37
    return total % int(m)
#-----------------------------------------
def bowhash(a,m):
    """Return the sorted hashed bag of words ``[[bucket, count], ...]`` for file ``a``.

    Words are extracted as in bownothash() (lower-cased, punctuation replaced
    by spaces, stop words removed) and each is mapped to ``fhash(word, m)``.
    The whole file is processed; a blank line no longer ends the scan early.
    """
    counts = {}
    with open(a, 'r') as fin:
        for line in fin:
            tokens = remove_punc(line.lower().strip()).split()
            for word in remove_stopword(tokens):
                bucket = fhash(word, m)
                counts[bucket] = counts.get(bucket, 0) + 1
    return sorted([bucket, n] for bucket, n in counts.items())
  
#-----------------------------------------------------
a = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
# Keep asking until the answer is exactly one of y / Y / n / N.
while b not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')

hashing = b in ('y', 'Y')
if hashing:
    # M stays a string here; fhash() converts it with int().
    m = input('M = ')

# The statistics report is the same for both modes; only the BoW line differs.
print ('-------------------')
print ('char count = '+char_count(a))
print ('alphanumeric count = '+alp_count(a))
print ('line count = '+line_count(a))
print ('word count = '+word_count(a))
if hashing:
    print ('BoW =',bowhash(a,m))
else:
    print ('BoW =',bownothash(a))

# 6330417521 (30.00) 250 (2021-03-22 16:14)

def fhash(text, M):
    """Return sum(ord(text[i]) * 37**i) modulo ``M``."""
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(text)) % M
        
# Interactive driver: read a text file, print its statistics and its bag of
# words (optionally feature-hashed into M buckets).
file_name = input('File name = ')
ch = input('Use feature hashing ? (y,Y,n,N) ')
while ch not in ['y','Y','n','N']:
    print('Try again.')
    ch = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = ch in ('y', 'Y')
if use_hash:
    m = int(input('M = '))

alnum = 'abcdefghijklmnopqrstuvwxyz1234567890'
cc = 0  # characters, newlines excluded
ac = 0  # ASCII letters and digits (after lower-casing)
lc = 0  # lines
pieces = []
with open(file_name, 'r') as f:
    for e in f:
        # Count lines by iteration so that an empty file reports 0 and a
        # trailing newline does not add a phantom extra line.
        lc += 1
        for k in e.lower():
            if k != '\n':
                cc += 1
            if k in alnum:
                ac += 1
                pieces.append(k)
            else:
                # Anything non-alphanumeric (newline included) separates words.
                pieces.append(' ')
f_u = ''.join(pieces)

with open('stopwords.txt', 'r') as st_f:
    fd = st_f.read().split()

li_f = f_u.split()
li_fu = [e for e in li_f if e not in fd]

print('-------------------')
print('char count =', cc)
print('alphanumeric count =', ac)
print('line count =', lc)
print('word count =', len(li_f))

# Tally in first-appearance order (dicts preserve insertion order).
counts = {}
for e in li_fu:
    key = fhash(e, m) if use_hash else e
    counts[key] = counts.get(key, 0) + 1
b = [[key, n] for key, n in counts.items()]
if use_hash:
    # Only the hashed bag is sorted; the plain bag keeps first-appearance order.
    b.sort()
print('BoW =', b)
# 6330418121 (20.88) 251 (2021-03-22 18:23)

def remove_punc(t):
    """Return ``t`` with the characters " ' / \\ ( ) . , ; : deleted."""
    dropped = "\"\'/\\().,;:"
    return "".join(ch for ch in t if ch not in dropped)
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    total = 0
    # Horner evaluation from the last character back to the first.
    for ch in reversed(w):
        total = total * 37 + ord(ch)
    return total % M
def word_count(word, wordslist):
    """Return how many items of ``wordslist`` are equal to ``word``."""
    return sum(1 for candidate in wordslist if candidate == word)
def cut_words(words, stopwords):
    """Lower-case ``words`` in place and return the ones not in ``stopwords``.

    Note that the caller's list is modified: every entry is replaced by its
    lower-cased form.
    """
    for pos, item in enumerate(words):
        words[pos] = item.lower()
    return [item for item in words if item not in stopwords]
def Bow(wordslist):
    """Sort ``wordslist`` in place and return ``[[item, count], ...]`` for each
    distinct item, in sorted order."""
    wordslist.sort()
    distinct = []
    for item in wordslist:
        if item not in distinct:
            distinct.append(item)
    return [[item, word_count(item, wordslist)] for item in distinct]
    
# Interactive driver: read a text file, print its statistics and its bag of
# words (optionally feature-hashed into M buckets).
file_name = input("File name = ")
feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()
# Compare against the two valid answers; a substring test on "yn" would also
# accept "" and "yn".
while feature_hashing not in ("y", "n"):
    print("Try again.")
    feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()

if feature_hashing == "y":
    M = int(input("M = "))

with open("stopwords.txt","r") as s_file:
    stopwords = s_file.read().split()

alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
character_count = 0
alphabet_count = 0
line_count = 0
words = []
with open(file_name,"r") as file:
    for l in file:
        character_count += len(l.strip())
        line_count += 1
        w = ""
        for c in l:
            if c in alphabet:
                w += c
                alphabet_count += 1
            elif w != "":
                words.append(w)
                w = ""
        # Flush a word that runs to the end of the line; without this the last
        # word of a file lacking a trailing newline was lost.
        if w != "":
            words.append(w)

# cut_words() lower-cases `words` in place and drops the stop words.
final = cut_words(words, stopwords)
if feature_hashing == "y":
    final = [fhash(item, M) for item in final]
bow = Bow(final)

print("-------------------")
print("char count =",character_count)
print("alphanumeric count =",alphabet_count)
print("line count =",line_count)
print("word count =",len(words))
print("BoW =",bow)



# 6330420321 (21.40) 252 (2021-03-22 23:33)

def fhash(w,M) :
  """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
  base = 37
  weight = 1
  acc = 0
  for ch in w :
    acc += ord(ch)*weight
    weight *= base
  return acc % M
def get_stopwords(stopwordss) :
  """Return every whitespace-separated token of the file ``stopwordss``,
  lower-cased, in file order."""
  with open(stopwordss,'r') as handle :
    return [token.lower() for row in handle for token in row.split()]
def char_count(filen) :
  """Print the character, alphanumeric and line counts of file ``filen``.

  Whitespace-only lines are ignored for the character and alphanumeric
  counts but still contribute to the line count.  Nothing is returned.
  """
  with open(filen,'r') as handle :
    raw_lines = handle.readlines()

  # Keep only lines with visible content, without their trailing newline.
  kept = [row.strip('\n') for row in raw_lines if not row.isspace()]
  total_chars = sum(len(row) for row in kept)
  total_alnum = sum(1 for row in kept for ch in row if ch.isalnum())

  print('char count =',total_chars)
  print('alphanumeric count =',total_alnum)
  print('line count =',len(raw_lines))
def word_count(filen,BowFlag,M) :
  """Print the word count and the bag of words of file ``filen``.

  Every non-alphanumeric character becomes a space and letters are
  lower-cased.  Lines are joined with a space so that the last word of one
  line cannot fuse with the first word of the next.  ``BowFlag`` selects the
  hashed bag (BoWwhash with ``M`` buckets) instead of the plain one (BoW).
  Nothing is returned.
  """
  with open(filen,'r') as readfil :
    rows = [row.strip('\n') for row in readfil if not row.isspace()]

  file_str = ' '.join(
    ''.join(c.lower() if c.isalnum() else ' ' for c in row) for row in rows
  )

  if BowFlag :
    BOww = BoWwhash(file_str,M)
  else :
    BOww = BoW(file_str)

  print('word count =',len(file_str.split()))
  print('BoW =',BOww)
def BoW(sentence) :
  """Return the sorted ``[[word, count], ...]`` list for the words of
  ``sentence`` that are not listed in 'stopwords.txt'."""
  stopword = get_stopwords('stopwords.txt')
  tokens = sentence.split()
  kept = [tok for tok in tokens if tok not in stopword]

  bag = []
  seen = []
  for tok in kept :
    if tok not in seen :
      seen.append(tok)
      bag.append([tok,tokens.count(tok)])
  return sorted(bag)
def BoWwhash(sentence,M) :
  """Return the sorted ``[[bucket, count], ...]`` list obtained by hashing
  each non-stop word of ``sentence`` with ``fhash(word, M)``."""
  stopword = get_stopwords('stopwords.txt')
  buckets = [fhash(tok,M) for tok in sentence.split() if tok not in stopword]

  bag = []
  seen = []
  for bucket in buckets :
    if bucket not in seen :
      seen.append(bucket)
      bag.append([bucket,buckets.count(bucket)])
  return sorted(bag)


def main() :
  """Prompt for the file name and hashing mode, then print the report."""
  file_name = input('File name = ')

  # Ask until the (case-insensitive) answer is 'y' or 'n'.
  choice = input('Use feature hashing ? (y,Y,n,N) ').lower()
  while choice not in ('y','n') :
    print('Try again.')
    choice = input('Use feature hashing ? (y,Y,n,N) ').lower()

  use_hash = choice == 'y'
  M = int(input('M = ')) if use_hash else None

  print('-'*19)
  char_count(file_name)
  word_count(file_name,use_hash,M)
    


#------------------------------------------

# Script entry point: run the interactive prompt as soon as the file executes.
main()
# 6330422621 (14.60) 253 (2021-03-21 20:05)

def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    terms = [ord(ch) * 37 ** pos for pos, ch in enumerate(w)]
    return sum(terms) % M
def freq1(str1):
    """Return a printable bag of words such as ``[['a', 2], ['b', 1]]``.

    Items appear in order of first occurrence, each quoted with single
    quotes and followed by its number of occurrences in ``str1``.  An empty
    input yields ``"[]"`` (it used to raise UnboundLocalError).
    """
    distinct = []
    for item in str1:
        if item not in distinct:
            distinct.append(item)
    entries = ["['" + str(item) + "', " + str(str1.count(item)) + "]"
               for item in distinct]
    return "[" + ", ".join(entries) + "]"
def freq(str1):
    """Return a printable frequency list such as ``[[1, 2], [0, 1]]``.

    Items appear unquoted, in order of first occurrence, each followed by its
    number of occurrences in ``str1``.  An empty input yields ``"[]"`` (it
    used to raise UnboundLocalError).
    """
    distinct = []
    for item in str1:
        if item not in distinct:
            distinct.append(item)
    entries = ["[" + str(item) + ", " + str(str1.count(item)) + "]"
               for item in distinct]
    return "[" + ", ".join(entries) + "]"
def remove(word):
    """Return ``word`` with each of " ' / \\ ( ) . , ; : replaced by a space."""
    separators = "\"\'/\\().,;:"
    return "".join(" " if ch in separators else ch for ch in word)
def choose(word):
    """Clean each item of ``word`` with remove(), strip it, and return all of
    them concatenated, each followed by one space."""
    return "".join(remove(item).strip() + " " for item in word)
def for_M(M):
    """Return ``str(fhash(word, M))`` for every word of the module-level list
    ``strink``, in order."""
    return [str(fhash(token, M)) for token in strink]
# Interactive driver: read a text file, print its statistics and its bag of
# words (optionally feature-hashed into M buckets).
file = input("File name = " )

# Whitespace-separated stop words, in file order.
with open("stopwords.txt","r") as sp:
    stop = sp.read().split()

allword = []        # every lower-cased whitespace-separated token of the file
allwordcount = ""   # the same tokens, each followed by a single space
c = 0               # number of lines
with open(file,"r") as fn:
    for line in fn:
        allmix = line.lower().strip().split()
        allword.extend(allmix)
        allwordcount += "".join(token + " " for token in allmix)
        c += 1

alphacount = sum(len(remove(token).strip()) for token in allword)
word = [token for token in allword if token not in stop]
strink = choose(word).split()
# NOTE(review): this estimate assumes words are separated by exactly one
# space (one trailing space per line is subtracted) — confirm against the spec.
count = len(allwordcount)-c

while True:
    hon = input("Use feature hashing ? (y,Y,n,N) " )
    if hon in ("n", "N", "y", "Y"):
        break
    print("Try again.")

use_hash = hon in ("y", "Y")
if use_hash:
    M = int(input("M = "))

# Labels carry no trailing space: print() already inserts one separator.
print("-------------------")
print("char count =", count)
print("alphanumeric count =", alphacount)
print("line count =", c)
print("word count =", len(allword))
if use_hash:
    print("BoW =", freq(for_M(M)))
else:
    print("BoW =", freq1(strink))

# 6330423221 (19.95) 254 (2021-03-22 22:57)



# Read the file name (without a prompt) and echo it back.
file_name= str(input())
print("File name ="," "+file_name)
# One independent handle per statistic; each is consumed once further down.
file_name1=open(file_name,"r")
file_name2=open(file_name,"r")
file_name3=open(file_name,"r")
file_name4=open(file_name,"r")
file_name5=open(file_name,"r")
file_name6=open(file_name,"r")



# `x` holds the lower-cased stop words as a list, so that `word in x` is an
# exact-word test.  (They used to be glued into one long string, which made
# it a substring test that also dropped unrelated short words.)
with open("stopwords.txt","r") as file_delete:
    x = file_delete.read().lower().split()
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    weight = 1
    total = 0
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % M
def line_count(n):
    """Return the number of items (lines) produced by iterating ``n``."""
    return sum(1 for _ in n)
    





print("Use feature hashing ? (y,Y,n,N)")
fh=input()
# Compare against the four valid answers; a substring test on "YyNn" would
# also accept "" and strings such as "yN".
while fh not in ("Y","y","N","n"):
    print("Try again.")
    fh=input()
if fh in "Yy":
    # The bucket count is read without a prompt and echoed back.
    M=int(input())
    print("M =",M)
 
def char_count(n):
    """Return the total number of characters over all lines of ``n``
    (newline characters included)."""
    return sum(1 for line in n for _ in line)
# Rebind file_name1 to a fresh handle on the input file.
# NOTE(review): the handle previously bound to file_name1 is never closed.
file_name1=open(file_name,"r")
def alphanumeric_count(n):
    """Return how many ASCII letters and digits occur in the lines of ``n``."""
    total = 0
    for line in n:
        compact = "".join(line.split())
        total += sum(
            1 for ch in compact
            if "0" <= ch <= "9" or "a" <= ch <= "z" or "A" <= ch <= "Z"
        )
    return total
# Rebind file_name1 once more to a fresh handle on the input file.
# NOTE(review): the handle previously bound to file_name1 is never closed.
file_name1=open(file_name,"r")
def word_count(n):
    """Return the number of words in the lines of ``n``.

    The listed punctuation characters are treated as separators; a word is a
    whitespace-delimited token of what remains.
    """
    separators = "\\/\"\'!@#$%^&*()_-+=|{[}]:;<,>.?*"
    total = 0
    for line in n:
        cleaned = "".join(" " if ch in separators else ch for ch in line)
        total += len(cleaned.split())
    return total
def bow (fh,n):
    """Return the bag of words for the lines of ``n``.

    Punctuation is replaced by spaces, the text is lower-cased, and tokens
    found in the module-level stop-word collection ``x`` are dropped.

    * ``fh`` in "Nn": ``[[word, count], ...]`` in order of first appearance.
    * ``fh`` in "Yy": ``[[bucket, count], ...]`` sorted by bucket, where
      bucket is ``fhash(word, M)`` with the module-level ``M``.  Only buckets
      that actually occur are listed (zero-count buckets used to be emitted
      for every value in 0..M-1).

    Any other ``fh`` returns None.
    """
    if fh not in "Nn" and fh not in "Yy":
        return None

    separators = "\\/\"\'!@#$%^&*()_-+=|{[}]:;<,>.?*"
    text = "".join(" " if ch in separators else ch for line in n for ch in line)
    kept = [token for token in text.lower().split() if token not in x]

    if fh in "Nn":
        counts = {}
        for token in kept:
            counts[token] = counts.get(token, 0) + 1
        # dicts keep insertion order, i.e. order of first appearance.
        return [[token, total] for token, total in counts.items()]

    counts = {}
    for token in kept:
        bucket = fhash(token, M)
        counts[bucket] = counts.get(bucket, 0) + 1
    return [[bucket, counts[bucket]] for bucket in sorted(counts)]
 
            
        
    
    

print("-------------------")
# char_count() includes newline characters, so subtract exactly the number of
# newlines in the file.  (Subtracting the line count is off by one when the
# last line has no trailing newline.)
print("char count =",char_count(file_name1)-file_name6.read().count("\n"))
print("alphanumeric count =",alphanumeric_count(file_name2))
print("line count =",line_count(file_name5))
print("word count =",word_count(file_name3))
print("BoW =",bow (fh,file_name4))

# Every handle has been consumed; release them.
for handle in (file_name1, file_name2, file_name3,
               file_name4, file_name5, file_name6):
    handle.close()

# 6330424921 (16.87) 255 (2021-03-22 22:00)

def removepunc(t):
    """Return ``t`` with the characters ( ) - [ ] " ' ; : > < . deleted."""
    dropped = ('(', ')', '-', '[', ']', '"', "'", ';', ':', '>', '<', '.')
    return ''.join(ch for ch in t if ch not in dropped)
def fhashword(w,M):
    """Return ``[[bucket, count], ...]`` sorted by bucket for the words in ``w``.

    Each word is hashed as sum(ord(word[i]) * 37**i) modulo ``int(M)``.  An
    empty ``w`` yields ``[]`` (it used to raise ValueError from ``min([])``).
    """
    modulus = int(M)
    counts = {}
    for word in w:
        bucket = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(word)) % modulus
        counts[bucket] = counts.get(bucket, 0) + 1
    return [[bucket, counts[bucket]] for bucket in sorted(counts)]

# Interactive driver: read a text file, print its statistics and its bag of
# words (optionally feature-hashed into M buckets).
file_name = input('File name = ' )
feature_hashing = input('Use feature hashing ? (y,Y,n,N) ').lower()

# Accept exactly 'y' or 'n'.  (The former `... and len(...) != 1` test let any
# single character through, after which nothing was printed at all.)
while feature_hashing not in ('y', 'n'):
    print("Try again.")
    feature_hashing = input('Use feature hashing ? (y,Y,n,N) ').lower()

if feature_hashing == 'y':
    m = input('M = ')
print('-------------------')

# Lines of the input file without their newline characters.
with open(file_name,'r') as filebow:
    box = [row.replace('\n','') for row in filebow]

# Stop words as a list, so the membership test below matches whole words.
# Both modes read 'stopwords.txt'; the 'n' mode used to open 'stopword.txt'.
with open('stopwords.txt','r') as file:
    lstopword = file.read().split()

# removepunc() leaves commas in place, so they are turned into separators here.
reallinebow = removepunc(' '.join(box)).lower().replace(',', ' ').split()
finalbow2 = [w for w in reallinebow if w not in lstopword]

# Plain bag of words: [word, count] in order of first appearance.
finallast = []
seen = []
for w in finalbow2:
    if w not in seen:
        seen.append(w)
        finallast.append([w, finalbow2.count(w)])

wordcount = sum(len(row.split()) for row in box)
alphabetcount = sum(
    1 for row in box for ch in row
    if ch in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
)
charcount = sum(len(row) for row in box)

print('char count =',charcount)
print('alphanumeric count =',alphabetcount)
print('line count =',len(box))
print('word count =',wordcount)
if feature_hashing == 'y':
    # Guard the empty case so no hashing is attempted when no words remain.
    print('BoW =',fhashword(finalbow2,m) if finalbow2 else [])
else:
    print('BoW =',finallast)







# 6330425521 (30.00) 256 (2021-03-22 23:11)

def fhash(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo ``int(M)``."""
    modulus = int(M)
    total = 0
    # Horner evaluation from the last character back to the first.
    for ch in reversed(word):
        total = total * 37 + ord(ch)
    return total % modulus
def BoW(clause):
    """Return ``[[token, count], ...]`` for the whitespace-separated tokens of
    ``clause``, sorted by token."""
    tally = []
    for token in sorted(clause.split()):
        # Equal tokens are adjacent after sorting, so extend the current run.
        if tally and tally[-1][0] == token:
            tally[-1][1] += 1
        else:
            tally.append([token, 1])
    return tally
def cut_symbol(clause):
    """Lower-case ``clause``, turn every character outside a-z / 0-9 into a
    space, and collapse the result to single-space-separated words."""
    mapped = [
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in clause.lower()
    ]
    return ' '.join(''.join(mapped).split())
def clear_all(clause, list_of_stopwords):
    """Return the words of ``cut_symbol(clause)`` that are not in
    ``list_of_stopwords``, joined by single spaces."""
    kept = [
        token for token in cut_symbol(clause).split()
        if token not in list_of_stopwords
    ]
    return ' '.join(kept)
    
# Interactive driver: read a text file, print its statistics and its bag of
# words (optionally feature-hashed into M buckets).
file_name = input('File name = ')

# Read the file once; every statistic is derived from this text.
with open(file_name, 'r') as open_file:
    file = open_file.read()
read_file = file.replace('\n', '')
# Lines as file iteration would count them: one per newline, plus a final
# unterminated line if there is one.
n_line_file = file.count('\n') + (1 if file and not file.endswith('\n') else 0)

with open('stopwords.txt', 'r') as stopwords:
    list_of_stopwords = stopwords.read().split()

cut_file = cut_symbol(file)
clear_file = clear_all(file, list_of_stopwords)

n_char = len(read_file)
n_alphanumeric = len(''.join(cut_file.split()))
n_word = len(cut_file.split())

FH = input('Use feature hashing ? (y,Y,n,N) ')
while FH not in ['y','Y','n','N']:
    print('Try again.')
    FH = input('Use feature hashing ? (y,Y,n,N) ')
if FH in ['y','Y']:
    M = int(input('M = '))

print('-------------------')
print('char count =', n_char)
print('alphanumeric count =', n_alphanumeric)
print('line count =', n_line_file)
print('word count =', n_word)

if FH in ['y','Y']:
    hashed = ' '.join(str(fhash(word, M)) for word in clear_file.split())
    # BoW() orders its tokens as strings ('10' < '2'); re-sort numerically
    # once the buckets are integers again.
    bow = sorted([int(fh), n] for fh, n in BoW(hashed))
else:
    bow = BoW(clear_file)

print('BoW =', bow)
# 6330426121 (30.00) 257 (2021-03-22 18:01)

def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo ``M``."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w)) % M
def to_alpha(s):
    """Return how many characters of ``s`` are letters a-z (either case) or
    digits 0-9."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return sum(1 for ch in s if ch.lower() in allowed)
def check_word(s):
    """Split ``s`` into maximal runs of letters/digits and return them in
    order, with their original casing."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    found = []
    current = []
    for ch in s:
        if ch.lower() in allowed:
            current.append(ch)
            continue
        # A separator ends the run in progress, if there is one.
        if current:
            found.append(''.join(current))
            current = []
    if current:
        found.append(''.join(current))
    return found

# Interactive driver: read a text file, print its statistics and its bag of
# words (optionally feature-hashed into M buckets).
file_name = input('File name = ')
checkfh = input('Use feature hashing ? (y,Y,n,N) ')
while checkfh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    checkfh = input('Use feature hashing ? (y,Y,n,N) ')
# From here on checkfh is a bool: True when hashing was requested.
checkfh = checkfh in ('y', 'Y')
if checkfh:
    M = int(input('M = '))
print('-------------------')

with open('stopwords.txt', 'r') as stopword:
    stw = [e.lower() for e in stopword.read().split()]

count_line = 0
count_c = 0
count_alp = 0
count_word = 0
counts = {}  # word or bucket -> frequency, in first-appearance order
with open(file_name, 'r') as t:
    for line in t:
        count_line += 1
        # Exclude the line terminator itself; the former `lines - 1`
        # adjustment over-counted when the file ended with a newline.
        count_c += len(line.rstrip('\n'))
        count_alp += to_alpha(line)
        tokens = check_word(line)
        count_word += len(tokens)
        for e in tokens:
            e = e.lower()
            if e in stw:
                continue
            key = fhash(e, M) if checkfh else e
            counts[key] = counts.get(key, 0) + 1

BoW = [[key, n] for key, n in counts.items()]
if checkfh:
    # Only the hashed bag is sorted; the plain bag keeps first-appearance order.
    BoW.sort()

print('char count =',count_c)
print('alphanumeric count =',count_alp)
print('line count =',count_line)
print('word count =',count_word)
print('BoW =',BoW)



        

# 6330427821 (24.80) 258 (2021-03-22 14:04)
def fhash(w, M):
    """Return sum(ord(w[i]) * G**i) modulo ``M``, with G = 37."""
    G = 37
    weight = 1
    total = 0
    for ch in w:
        total += ord(ch) * weight
        weight *= G
    return total % M

def stop_words():
    """Return the lower-cased, whitespace-separated tokens of 'stopwords.txt'.

    The file is read from the current directory and closed before returning
    (the handle used to be leaked).
    """
    with open("stopwords.txt", "r") as f:
        return [token for line in f for token in line.lower().split()]

def BOW(f, stop_words):
    """Return ``(text, word_count)`` for the lines of ``f``.

    The characters ? : ! / ; , . " are deleted from every line.  ``text`` is
    the lower-cased non-stop words joined by single spaces; ``word_count`` is
    the number of whitespace-separated tokens before stop-word removal.
    """
    dropped = '?:!/;,."'
    cleaned = [
        ''.join(ch for ch in raw.rstrip('\n') if ch not in dropped)
        for raw in f
    ]
    total_words = sum(len(row.split()) for row in cleaned)
    kept = [
        token.lower()
        for row in cleaned
        for token in row.split()
        if token.lower() not in stop_words
    ]
    return ' '.join(kept), total_words

def isNot_fhash(bag_of_words):
    """Return the sorted ``[[word, count], ...]`` list for the
    whitespace-separated words of ``bag_of_words``."""
    tokens = bag_of_words.split()
    distinct = []
    for token in tokens:
        if token not in distinct:
            distinct.append(token)
    return sorted([token, tokens.count(token)] for token in distinct)

def is_fhash(bag_of_words, M):
    """Return the sorted ``[[bucket, count], ...]`` list obtained by hashing
    each whitespace-separated word of ``bag_of_words`` with ``fhash(word, M)``."""
    buckets = []
    for token in bag_of_words.split():
        buckets.append(fhash(token, M))
    distinct = []
    for bucket in buckets:
        if bucket not in distinct:
            distinct.append(bucket)
    return sorted([bucket, buckets.count(bucket)] for bucket in distinct)

def letter_count(read_file):
    """Return ``(char_count, alphanumeric_count, line_count)`` for the lines
    of ``read_file``.

    Trailing newline characters are excluded from the character count, and
    every non-empty item counts as one line.
    """
    chars = 0
    alnum = 0
    rows = 0
    for row in read_file:
        body = row.rstrip('\n')
        alnum += sum(1 for ch in body if ch.isalpha() or ch.isdigit())
        if row:
            rows += 1
        chars += len(body)
    return chars, alnum, rows

def _print(file, M=-1):
    """Print the statistics report and the bag of words for ``file``.

    ``M`` is the number of hash buckets; the default ``-1`` means "no feature
    hashing".  Both passes over the file use a context manager so the handles
    are closed, and both modes print the same ``BoW =`` label.
    """
    with open(file, "r") as handle:
        length_char, alphanumeric_count, line_count = letter_count(handle)
    print("-------------------")
    # No trailing space in the labels: print() already inserts one separator.
    print("char count =", length_char)
    print("alphanumeric count =", alphanumeric_count)
    print("line count =", line_count)
    with open(file, "r") as handle:
        bag_of_words, len_BOW = BOW(handle, stop_words())
    print("word count =", len_BOW)
    if M != -1:
        print("BoW =", is_fhash(bag_of_words, M))
    else:
        print("BoW =", isNot_fhash(bag_of_words))


# Script entry point: ask for the file and the hashing mode, then print the
# report via _print().
if __name__ == '__main__':
    try:
        file = input('File name = ')
        while True:
            # NOTE(review): this prompt has no trailing space after the
            # closing parenthesis — confirm against the expected output.
            choice = input("Use feature hashing ? (y,Y,n,N)")
            if choice not in ('y', 'Y', 'N', 'n'):
                print("Try again.")
            else:
                # -1 is the "no hashing" sentinel understood by _print().
                M = -1
                if choice in ('y', 'Y'):
                    M = input("M = ")
                _print(file, int(M))
                # Wait for one more line of input before exiting.
                input()
                break
    # NOTE(review): BaseException also covers KeyboardInterrupt/SystemExit and
    # the error is discarded without any message — confirm this is intended.
    except BaseException:
        input()

# 6330428421 (30.00) 259 (2021-03-22 00:37)
def cha_count(line):
    """Return the number of characters in ``line`` that are not newlines."""
    return sum(1 for ch in line if ch != '\n')
#--------------------------------
def alnum_count(line):
    """Return how many characters of ``line`` satisfy ``str.isalnum()``."""
    return sum(1 for ch in line if ch.isalnum())
#--------------------------------
def word_list(line):
    """Split ``line`` into lower-case alphanumeric words.

    Every non-alphanumeric character acts as a separator.
    """
    normalised = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in line)
    return normalised.split()
#--------------------------------
def BoW(s):
    """Fold the items of ``s`` into the module-level ``words``/``freq`` lists.

    ``words`` holds each distinct item once; ``freq`` holds the matching
    occurrence count at the same index.
    """
    for token in s:
        if token in words:
            freq[words.index(token)] += 1
        else:
            words.append(token)
            freq.append(1)
#-------------------------------------
def fhash(word,M):
    """Polynomial hash of ``word`` (base 37), reduced modulo ``int(M)``.

    ``M`` may be an int or a numeric string.
    """
    total = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(word))
    return total % int(M)
#------------------------------------
def remove_stop_words(s,stop_words):
    """Return a new list with the items of ``s`` that are not in ``stop_words``."""
    return [item for item in s if item not in stop_words]
#----------------------------------------
# Script entry: gather the inputs, open both files, then report statistics.
file_name = input('File name = ')+'.txt'
Fstop_words = open('stopwords.txt','r')
fin = open(file_name,'r')
# Keep asking until the answer is y/Y/n/N; M is requested only for "yes".
use_feature = input('Use feature hashing ? (y,Y,n,N) ')
while use_feature not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    use_feature = input('Use feature hashing ? (y,Y,n,N) ')
if use_feature in ('y', 'Y'):
    M = input('M = ')
print('-'*19)
# Running totals and the shared accumulators (words/freq are filled by BoW()).
ch_count = 0
alp_num_count = 0
line_count = 0
word_count = 0
fhashed = []
stop_words = []
words = []
freq = []
bow = []
#---------------
for line in Fstop_words:
    stop_words.extend(word_list(line))
for line in fin:
    line_count += 1
    ch_count += cha_count(line)
    alp_num_count += alnum_count(line)
    word_count += len(word_list(line))
    fhashed = remove_stop_words(word_list(line), stop_words)
    if use_feature not in ('n', 'N'):
        # Hashing mode: count hash buckets instead of the words themselves.
        fhashed = [fhash(token, M) for token in fhashed]
    BoW(fhashed)
# Pair every distinct key with its frequency, ordered by key.
bow = sorted([key, times] for key, times in zip(words, freq))
print('char count =',ch_count)
print('alphanumeric count =',alp_num_count)
print('line count =',line_count)
print('word count =',word_count)
print('BoW =',bow)
fin.close()
Fstop_words.close()
# 6330429021 (21.70) 260 (2021-03-20 23:27)
# Ask for the input file and the hashing choice up front; the choice is
# validated further down, after the file has been analysed.
n = input('File name = ')
dh = input('Use feature hashing ? (y,Y,n,N) ')
# Characters that are dropped from the text before counting / splitting.
removed_chars = '.\'\"\\:;/,'
stop_w = []
new_file = []
neww_file = ''
# Stop words: every whitespace-separated token of stopwords.txt.
with open('stopwords.txt', 'r') as stop_words:
    for e in stop_words:
        stop_w.extend(e.split())
#----------------------------------------------
# new_file: lower-cased whitespace tokens of the input that are not stop words.
with open(n,'r') as file_name:
    for r in file_name:
        for t in r.split():
            t = t.lower()
            if t not in stop_w:
                new_file.append(t)
#----------------------------------------------
# c counts the lines; neww_file collects every non-whitespace character that
# is not in removed_chars (its length is later printed as the alphanumeric
# count).
c = 0
with open(n,'r') as file_name:
    for r in file_name:
        c += 1
        for t in r.split():
            for u in t:
                if u not in removed_chars:
                    neww_file += u
#-----------------------------------------------
# word_count: the text with removed_chars deleted, split into words
# (a space is inserted before each line so lines never fuse together).
word_count = ''
with open(n,'r') as file_name:
    for r in file_name:
        word_count += ' '
        for t in r:
            if t not in removed_chars:
                word_count += t
word_count = word_count.strip().split()
#----------------------------------------------
# new2_file: the kept tokens with removed_chars deleted, re-split into words.
# new3_file: the same characters concatenated without any separator.
new2_file = ''
new3_file = ''
for s in new_file:
    new2_file += ' '
    for u in s:
        if u not in removed_chars:
            new2_file += u
            new3_file += u
new2_file = new2_file.strip().split()
#-----------------------------------------------
# new22_file: [word, occurrences] for each distinct word, in first-seen order.
lis = []
count = []
new22_file = []
for s in new2_file:
    if s not in lis:
        lis.append(s)
        count.append(new2_file.count(s))
for r in range(len(lis)):
    new22_file.append([lis[r],count[r]])

#-------------------------------------------------
# new_file_name: all lines stripped of surrounding whitespace and joined;
# its length is later printed as the character count.
new_file_name = ''
with open(n,'r') as file_name:
    for l in file_name:
        new_file_name += l.strip()
#------------------------------------------------
def fhash(w,M):
    """Base-37 polynomial hash of ``w`` modulo ``int(M)``.

    ``M`` may be given as an int or as a numeric string.
    """
    weighted = (ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return sum(weighted) % int(M)
#------------------------------------------------
# Validate the hashing choice.  Membership is tested against a tuple of the
# four accepted answers: a substring test against 'YynN' would also accept
# the empty string and multi-letter inputs such as 'Yy'.
while dh not in ('Y', 'y', 'n', 'N'):
    print('Try again.')
    dh = input('Use feature hashing ? (y,Y,n,N) ')
if dh in ('Y', 'y'):
    M = input('M = ')
    # Hash every kept word, then count how often each bucket occurs
    # (buckets are collected in first-seen order and sorted for printing).
    f_new2 = [fhash(s, M) for s in new2_file]
    lis2 = []
    count2 = []
    for h in f_new2:
        if h not in lis2:
            lis2.append(h)
            count2.append(f_new2.count(h))
    new222_file = [[lis2[r], count2[r]] for r in range(len(lis2))]
    result = sorted(new222_file)
else:
    result = sorted(new22_file)
# The report layout is the same for both modes, separator line included.
print('-------------------')
print('char count =',len(new_file_name))
print('alphanumeric count =',len(neww_file))
print('line count =',c)
print('word count =',len(word_count))
print('BoW =',result)
# 6330430621 (19.27) 261 (2021-03-22 23:52)

# Read the whole input file, lower-cased, into k1.
file_name = open(input('File name = '),'r')
k1 = ''.join(row.lower() for row in file_name)
# k2 mirrors k1 with every character outside a-z / 0-9 replaced by a space.
k2 = ''.join(
    ch if ch in 'abcdefghijklmnopqrstuvwxyz0123456789' else ' '
    for ch in k1
)
kk = k2.split()

stop = open('stopword.txt','r')

# Join the non-blank lines of the stop-word file and split them into words.
bb = ''.join(row + ' ' for row in stop if row != '\n')
bbb = bb.split()
# c keeps the words of the text that are not stop words.
c = [token for token in kk if token not in bbb]

alpha = 0
line = 0
word = len(kk)
def fhash(x,M):
    """Return the base-37 polynomial hash of ``x`` reduced modulo ``M``."""
    total = 0
    for position, symbol in enumerate(x):
        total += ord(symbol) * 37 ** position
    return total % M

# Line count: one per newline, plus a final line that has no terminating
# newline (counting newlines alone would miss it).
line = k1.count('\n')
if k1 and not k1.endswith('\n'):
    line += 1
# Character count excludes the newline characters themselves.
cha = k1.count('\n')
char = len(k1) - cha
# k2 holds a space wherever k1 had a character outside a-z / 0-9, so the
# alphanumeric count is everything in k2 that is not a space.
hon = k2.count(' ')
alpha = len(k2) - hon

b = input('Use feature hashing ? (y,Y,n,N) ')
while b not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')
if b in ('y', 'Y'):
    M = int(input('M = '))
    # Hashing mode counts hash buckets instead of the words themselves.
    keys = [fhash(w, M) for w in c]
else:
    keys = c
print('-------------------')
print('char count = ' + str(char))
print('alphanumeric count = '+str(alpha))
print('line count = '+str(line))
print('word count = '+str(word))
# Count every distinct key over the whole list (not only adjacent repeats)
# and list the pairs in ascending key order.
lis = [[key, keys.count(key)] for key in sorted(set(keys))]
print('BoW =',lis)

file_name.close()
stop.close()
# 6330431221 (15.00) 262 (2021-03-22 23:11)
# Ask for the file to analyse, then load the lower-cased stop-word list.
file_name = input("File name = ")
with open("stopwords.txt", "r") as k:
    y = k.read().lower().split()
stopword = y
def line_count(file_name):
    """Return the number of lines obtained by iterating over the file."""
    with open(file_name, 'r') as handle:
        return sum(1 for row in handle if row)
def char_count(file_name):
    """Return the number of characters in the file, not counting newlines.

    One character is discounted for every line that contains a newline.
    """
    total = 0
    with open(file_name, 'r') as handle:
        for row in handle:
            total += len(row) - 1 if '\n' in row else len(row)
    return total
def alphanum_count(file_name):
    """Return how many characters of the file are a-z or 0-9 after lower-casing."""
    allowed = 'abcdefghijklmnopqrstuvwxyz1234567890'
    with open(file_name, 'r') as handle:
        return sum(1 for row in handle for ch in row.lower() if ch in allowed)
def word(file_name):
    """Return the lower-cased words of the file.

    A word is a maximal run of a-z / 0-9 characters; every other character
    acts as a separator.
    """
    with open(file_name, 'r') as handle:
        text = handle.read().lower()
    allowed = 'abcdefghijklmnopqrstuvwxyz1234567890'
    spaced = ''.join(ch if ch in allowed else ' ' for ch in text)
    return spaced.split()
def word_count(file_name):
    """Return the number of words that ``word(file_name)`` extracts."""
    return len(word(file_name))
        
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w)) % M
def cutword(n):
    """Return the items of ``n`` that are not in ``stopword``, sorted ascending."""
    return sorted(item for item in n if item not in stopword)
    
    
    
def BOW(f,M):
    """Build the bag-of-words from the module-level word list ``pp``.

    Stop words are removed with ``cutword`` first.  When ``f`` is ``'y'``
    the words are hashed with ``fhash(word, M)`` and the result is a list of
    ``[hash, occurrences]`` pairs in ascending hash order.  Otherwise the
    result is a list of ``[word, occurrences]`` pairs in the (sorted) order
    produced by ``cutword``.
    """
    kept = cutword(pp)
    if f == 'y':
        hashed = [fhash(item, M) for item in kept]
        return [[bucket, hashed.count(bucket)] for bucket in sorted(set(hashed))]
    distinct = []
    for item in kept:
        if item not in distinct:
            distinct.append(item)
    return [[item, kept.count(item)] for item in distinct]
        
        
        
        
        

# Prompt until the answer is y/Y/n/N, then print the report once.
while True:
    f = input("Use feature hashing ? (y,Y,n,N) ")
    g = f.lower()
    if g not in ('y', 'n'):
        print("Try again.")
        continue
    # M is only needed for hashing; the non-hashing branch of BOW() does
    # not use it, so None is passed there.
    M = int(input("M = ")) if g == 'y' else None
    print('-------------------')
    print('char count = ', char_count(file_name))
    print('alphanumeric count = ', alphanum_count(file_name))
    print('line count = ', line_count(file_name))
    print('word count = ', word_count(file_name))
    pp = word(file_name)
    # The result is stored under its own name so the BOW function is not
    # overwritten.
    YYY = BOW(g, M)
    print("BoW = ", YYY)
    break

    



   


# 6330432921 (18.90) 263 (2021-03-22 12:29)
def read_txt():
    """Prompt for a file name and return that file's entire contents."""
    with open(input("File name = ")) as handle:
        return handle.read()
def read_stop():
    """Return the whitespace-separated tokens of ``stopword.txt`` as a list."""
    with open("stopword.txt") as handle:
        return handle.read().split()
# Load the input text and the stop-word list once, at module level; the
# prompt loop passes ``txt`` to the counting functions and Bow/Bow2 read
# ``stop`` directly.
txt = read_txt()
stop = read_stop()
def fhash(txt,M):
    """Return the base-37 polynomial hash of ``txt`` modulo ``M``."""
    value = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(txt))
    return value % M
def line_count(txt):
    """Return the number of lines in ``txt``.

    A trailing newline terminates the last line rather than starting a new,
    empty one, so ``"a\\nb\\n"`` and ``"a\\nb"`` both count as two lines and
    the empty string counts as zero.
    """
    line = txt.count("\n")
    if txt and not txt.endswith("\n"):
        # The final line has no terminating newline; count it as well.
        line += 1
    return line
def char_count(txt):
    """Return the number of characters in ``txt`` that are not newlines."""
    return sum(1 for ch in txt if ch != "\n")
    
def alphanumeric_count(txt):
    """Return how many characters of ``txt`` are ASCII letters or digits.

    The letter test uses the two explicit ranges A-Z and a-z; a single
    65..122 code-point range would also count the six punctuation
    characters that sit between 'Z' and 'a'.
    """
    count = 0
    for ch in txt:
        if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9':
            count += 1
    return count
def word_count(txt):
    """Return the number of words in ``txt``.

    A word is a maximal run of ASCII letters or digits.  Every other
    character (punctuation, whitespace, newlines) separates words, and
    consecutive separators never produce empty words.  Empty text yields 0.
    """
    spaced = ''.join(
        ch if ('a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9') else ' '
        for ch in txt
    )
    return len(spaced.split())
def Bow(txt):
    """Return the bag-of-words of ``txt`` as sorted ``[word, count]`` pairs.

    Words are maximal runs of ASCII letters/digits, lower-cased; every other
    character is a separator.  Words found in the module-level ``stop`` list
    are left out.
    """
    cleaned = ''.join(
        ch.lower() if ('a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9') else ' '
        for ch in txt
    )
    realtxt = [word for word in cleaned.split() if word not in stop]
    end = []
    for word in realtxt:
        pair = [word, realtxt.count(word)]
        if pair not in end:
            end.append(pair)
    # Report the pairs in ascending word order.
    end.sort()
    return end
def Bow2(txt,M):
    """Return the hashed bag-of-words of ``txt`` as sorted ``[hash, count]`` pairs.

    Words are maximal runs of ASCII letters/digits, lower-cased; every other
    character is a separator.  Words found in the module-level ``stop`` list
    are left out, and each remaining word is replaced by ``fhash(word, M)``
    before counting.
    """
    cleaned = ''.join(
        ch.lower() if ('a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9') else ' '
        for ch in txt
    )
    realtxt = [word for word in cleaned.split() if word not in stop]
    vbow = [fhash(word, M) for word in realtxt]
    end = []
    for bucket in vbow:
        pair = [bucket, vbow.count(bucket)]
        if pair not in end:
            end.append(pair)
    # Report the pairs in ascending hash order.
    end.sort()
    return end

# Prompt until the answer is y/Y/n/N, then print the whole report at once.
while True:
    feature = str(input("Use feature hashing ? (y,Y,n,N) "))
    if feature.lower() not in ('y', 'n'):
        print("Try again.")
        continue
    if feature.lower() == 'y':
        M = int(input("M = "))
    # All values are computed before anything is printed.
    report = [
        '-------------------',
        f'char count = {char_count(txt)}',
        f'alphanumeric count = {alphanumeric_count(txt)}',
        f'line count = {line_count(txt)}',
        f'word count = {word_count(txt)}',
        f'Bow = {Bow2(txt,M) if feature.lower() == "y" else Bow(txt)}',
    ]
    print('\n'.join(report))
    break
# 6330433521 (22.99) 264 (2021-03-22 16:52)

def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    f = 0
    for position, symbol in enumerate(w):
        f += ord(symbol) * 37 ** position
    return f % M
def alphabet(t):
    """Return how many characters of ``t`` are a-z or 0-9 after lower-casing."""
    return sum(
        1 for e in t
        if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789'
    )
def chword(t):
    """Split ``t`` into words made of letters a-z (any case) and digits.

    Every other character ends the current word.  The original casing of
    each word is kept.  A word that runs up to the very end of ``t`` (for
    instance on a last line without a trailing newline) is included too.
    """
    x = []
    w = ''
    for e in t:
        if e.lower() in 'abcdefghijklmnopqrstuvwxyz0123456789':
            w += e
        elif w != '':
            x.append(w)
            w = ''
    # Flush the word still being built when the text ends without a separator.
    if w != '':
        x.append(w)
    return x
    
file_name = input('File name = ')
# Prompt until the answer is y/Y/n/N; afterwards ``feature`` is a bool.
feature = input('Use feature hashing ? (y,Y,n,N) ')
while feature not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    feature = input('Use feature hashing ? (y,Y,n,N) ')
if feature in ('y', 'Y'):
    M = int(input('M = '))
    feature = True
else:
    feature = False
print('-------------------')

# Lower-cased stop words; the file is closed as soon as it has been read.
stop = []
with open('stopwords.txt', 'r') as stopword:
    for line in stopword:
        stop.extend(e.lower() for e in line.strip().split())

chcount = 0
alcount = 0
lincount = 0
wcount = 0
# wbow collects one key per kept word: the lower-cased word itself, or its
# hash bucket when feature hashing is on.
wbow = []
with open(file_name, 'r') as fle:
    for line in fle:
        lincount += 1
        # Newlines are excluded per line, whether or not the last line
        # ends with one.
        chcount += len(line) - line.count('\n')
        alcount += alphabet(line)
        found = chword(line)
        wcount += len(found)
        for e in found:
            if e.lower() not in stop:
                wbow.append(fhash(e.lower(), M) if feature else e.lower())

# [key, occurrences] pairs in ascending key order, for both modes.
bow = sorted([key, wbow.count(key)] for key in set(wbow))

print('char count = ', chcount)
print('alphanumeric count = ', alcount)
print('line count = ', lincount)
print('word count = ', wcount)
print('BoW = ', bow)
                  
# 6330434121 (21.55) 265 (2021-03-22 17:39)

def flash(w,m):
    """Return the base-37 polynomial hash of ``w`` modulo ``int(m)``.

    ``m`` may be an int or a numeric string.
    """
    total = 0
    for position, symbol in enumerate(w):
        total += ord(symbol) * (37 ** position)
    return total % int(m)

file_name = input("File name = ")
# Re-prompt until the answer is one of y/Y/n/N.
feature = input("Use feature hashing ? (y,Y,n,N) ")
while feature not in ("y", "Y", "n", "N"):
    print("Try again.")
    feature = input("Use feature hashing ? (y,Y,n,N) ")
# m stays 0 when hashing is off; otherwise it holds the text typed for M
# (flash() converts it to int).
m = 0
if feature in ("y", "Y"):
    m = input("M = ")

stopword = []
with open("stopwords.txt", "r") as stopwords:
    for line in stopwords:
        stopword.extend(line.strip().split())

file = []
line_count = 0
char_count = 0
alphanumeric_count = 0

with open(file_name, "r", encoding="utf-8") as files:
    for line in files:
        line_count += 1
        char_count += len(line.strip("\n"))
        # Replace everything outside a-z / 0-9 with a space, then split the
        # line into words.
        newword = ""
        for ch in line.strip().lower():
            if ch in "abcdefghijklmnopqrstuvwxyz0123456789":
                newword += ch
                alphanumeric_count += 1
            else:
                newword += " "
        file += newword.split()

word_count = len(file)
new_file = sorted(w for w in file if w not in stopword)

# Keys to count: the words themselves, or their hash buckets.
if feature in ("y", "Y"):
    keys = sorted(flash(w, m) for w in new_file)
else:
    keys = new_file

# Run-length count over the sorted keys.  A fresh pair is started whenever
# the key changes, so the last group is always recorded and an empty input
# simply yields an empty list.
bow = []
for key in keys:
    if bow and bow[-1][0] == key:
        bow[-1][1] += 1
    else:
        bow.append([key, 1])

print("-------------------")
print("char count = ",char_count)
print("alphanumeric count = ",alphanumeric_count)
print("line count = ",line_count)
print("word count = ",word_count)
print("BoW = ",bow)
    
    





        
        

# 6330435821 (30.00) 266 (2021-03-21 21:13)

def count( data, element ):
    """Return how many items of ``data`` compare equal to ``element``."""
    return sum(1 for item in data if item == element)

def BoW(list):
    """Return ``[word, occurrences]`` pairs for the distinct items of ``list``.

    Pairs appear in the order in which each word is first seen.
    """
    pairs = []
    seen = []
    for word in list:
        if word in seen:
            continue
        seen.append(word)
        pairs.append([word, count(list, word)])
    return pairs

def fhash(w,M):
    """Return the polynomial hash of ``w`` (base G = 37) modulo ``M``."""
    G = 37
    return sum(ord(ch) * (G ** pos) for pos, ch in enumerate(w)) % M

def BoW_w_fhash(list,M):
    """Return ``[hash, occurrences]`` pairs for the words of ``list``.

    Each word is replaced by ``fhash(word, M)``; pairs appear in the order
    in which each hash value is first seen.
    """
    hashed = [fhash(word, M) for word in list]
    pairs = []
    seen = []
    for bucket in hashed:
        if bucket in seen:
            continue
        seen.append(bucket)
        pairs.append([bucket, count(hashed, bucket)])
    return pairs


# Every whitespace-separated token of stopwords.txt is a stop word.
with open('stopwords.txt','r') as stopwords_in: #3
    stopwords_list = [token for row in stopwords_in for token in row.split()]


file_name = input('File name = ') #1
words_list = []
char_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
allow_letters = 'abcdefghijklmnopqrstuvwxyz0123456789'
with open(file_name,'r') as fin:
    for line in fin:
        line_count += 1
        lowered = line.lower()
        # Every character except the newline counts towards the total.
        char_count += len(lowered) - lowered.count('\n')
        alphanumeric_count += sum(1 for letter in lowered if letter in allow_letters)
        # Words are the runs of allowed characters; everything else separates.
        spaced = ''.join(letter if letter in allow_letters else ' ' for letter in lowered)
        found = spaced.split()
        word_count += len(found)
        # Only words that are not stop words go into the bag-of-words input.
        words_list.extend(word for word in found if word not in stopwords_list)


# Prompt until the answer is one of y/Y/n/N.
while True: #2
    mode = input('Use feature hashing ? (y,Y,n,N) ')
    if mode in ['y','Y','n','N']:
        break
    print('Try again.')
# Note: the result is stored under the name BoW, replacing the function.
if mode in ('Y', 'y'):
    M = int(input('M = '))
    BoW = BoW_w_fhash(words_list,M) #4
else:
    BoW = BoW(words_list)
BoW.sort()


for report_line in (
    ('-------------------',),
    ('char count =', char_count),
    ('alphanumeric count =', alphanumeric_count),
    ('line count =', line_count),
    ('word count =', word_count),
    ('BoW =', BoW),
):
    print(*report_line)

# 6330436421 267 (2021-03-22 19:48)

# Characters that count as alphanumeric: a-z, A-Z and all ten digits 0-9.
CHAR_LIST = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

def clear_word(word):
    """Split a raw token into its lower-case alphanumeric parts.

    Parameter : word - any string, possibly containing separator characters
    Return : list of the non-empty runs of characters that are in CHAR_LIST
             (a list, because one token can contain several parts)

    Doctest :
        >>> clear_word("Abc:a18")
        ['abc', 'a18']
    """
    # Every character outside CHAR_LIST becomes a single space...
    spaced = "".join(ch if ch in CHAR_LIST else " " for ch in word.lower())
    # ...so splitting on spaces and dropping the empty pieces leaves the parts.
    return [piece for piece in spaced.split(" ") if piece != ""]

def clear_stop_word(file_name, sentence_list):
    """Remove every stop word from a list of words, in place.

    Parameter : file_name - path of a file of whitespace-separated stop words
                sentence_list - list of words; it is modified in place
    Return : the same list object, without the stop words
    """
    stopwords = []
    with open(file_name, "r") as stopwords_file:
        for line in stopwords_file:
            stopwords.extend(line.strip().split())
    # Slice assignment keeps the caller's list object while dropping matches.
    sentence_list[:] = [member for member in sentence_list if member not in stopwords]
    return sentence_list

def sentence_to_list(sentence):
    """Convert a sentence into a sorted list of words ready for bow.

    Parameter : sentence - string
    Return : sorted list of the lower-case words of the sentence, with the
             stop words listed in 'stopwords.txt' removed

    Each whitespace-separated token is cleaned with clear_word, and every
    part it returns becomes a separate word: a token containing a separator
    contributes all of its parts individually, and a token that contains no
    CHAR_LIST character at all contributes nothing.
    """
    sentence_complete = []
    for word in sentence.split():
        # extend (not append + join) keeps the parts of one token apart.
        sentence_complete.extend(clear_word(word))
    sentence_complete = clear_stop_word('stopwords.txt', sentence_complete)
    # Sorted so the bow result lists words (or hashes) in ascending order.
    sentence_complete.sort()
    return sentence_complete

def bow(sentence):
    """Build the bag-of-words of a sentence.

    Parameter : sentence - string
    Return : list of [word, occurrences] pairs, in the order in which
             sentence_to_list yields each word for the first time
    """
    result = []
    slot_of = {}  # word -> index of its pair inside result
    for word in sentence_to_list(sentence):
        if word in slot_of:
            result[slot_of[word]][1] += 1
        else:
            slot_of[word] = len(result)
            result.append([word, 1])
    return result

def fhash(word, M):
    """Polynomial hash of a word.

    Parameter : word, M
    Return : sum of ord(word[i]) * 37**i over all positions i, modulo M

    Doctest :
        >>> fhash('big', 4)
        2
    """
    G = 37
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(word)) % M

def bow_fhash(sentence, M):
    """Build the bag-of-words of a sentence using feature hashing.

    Parameter : sentence, M
    Return : list of [hash, occurrences] pairs in ascending hash order,
             where each word from sentence_to_list is replaced by
             fhash(word, M)
    """
    fhash_list = sorted(fhash(word, M) for word in sentence_to_list(sentence))
    result = []
    # The hashes are sorted, so equal values are adjacent: either extend the
    # last pair or start a new one.
    for fhash_number in fhash_list:
        if result and result[-1][0] == fhash_number:
            result[-1][1] += 1
        else:
            result.append([fhash_number, 1])
    return result

def count(file_name, fhash=False, M=0):
    """Count and print the statistics and the bag-of-words of a file.

    Parameter : file_name - path of the text file
                fhash - True to print the hashed bag-of-words (default False)
                M - number of hash buckets, used only when fhash is True
    Return : Nothing (everything is printed)
    """
    # Read every line, stripped of surrounding whitespace; the file is
    # closed as soon as the lines have been collected.
    with open(file_name, "r") as file:
        line_list = [line.strip() for line in file]
    line_count = len(line_list)
    char_count = sum(len(line) for line in line_list)
    alphanumberic_count = sum(
        1 for line in line_list for character in line if character in CHAR_LIST
    )
    # Words are counted on the line text itself (not on the repr of a list
    # of tokens, whose escape sequences could be mistaken for words).
    word_count = sum(len(clear_word(line)) for line in line_list)
    # Print a result
    print("-------------------")
    print(f"char count = {char_count}")
    print(f"alphanumeric count = {alphanumberic_count}")
    print(f"line count = {line_count}")
    print(f"word count = {word_count}")
    # The bag-of-words functions take the whole text as a single string,
    # with a space after every line.
    sentence = "".join(line + " " for line in line_list)
    if not fhash:
        print(f"BoW = {bow(sentence)}")
    else:
        print(f"BoW = {bow_fhash(sentence, M)}")


# Run Program
file_name = input("File name = ")
# Prompt until the answer is one of Y/y/N/n.
hashing_or_not = input("Use feature hashing ? (y,Y,n,N) ")
while hashing_or_not not in ("Y", "y", "N", "n"):
    print("Try again.")
    hashing_or_not = input("Use feature hashing ? (y,Y,n,N) ")
if hashing_or_not in ("Y", "y"):
    # Hashing mode: M is read and passed on together with fhash=True.
    M = int(input("M = "))
    count(file_name, fhash=True, M=M)
else:
    # Plain mode: count() is called with its defaults, so M is not needed.
    count(file_name)
# 6330437021 (22.99) 268 (2021-03-22 21:54)
def feature_hashing(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``.

    The character at index ``i`` contributes ``ord(ch) * 37**i``.
    """
    total = 0
    weight = 1
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % M
# Interactive report: character / alphanumeric / line / word counts and a
# bag-of-words for the chosen file, optionally with hashed features.
file_name = input("File name = ")
A = input("Use feature hashing ? (y,Y,n,N) ")
while A not in ["y", "Y", "n", "N"]:
    print("Try again.")
    A = input("Use feature hashing ? (y,Y,n,N) ")

use_hashing = A in ["y", "Y"]
if use_hashing:
    M = int(input("M = "))
print("-------------------")

# Read the file once; every statistic below is derived from these lines.
with open(file_name, "r") as B:
    lines = B.readlines()

C = 0   # characters, excluding one newline per line that has one
D = 0   # ASCII letters and digits
E = 0   # lines that contain a newline
F = []  # words (maximal runs of ASCII letters/digits), original case
G = ""  # word currently being assembled
for i in lines:
    C += len(i)
    if "\n" in i:
        C -= 1
        E += 1
    for c in i:
        if ("a" <= c <= "z") or ("A" <= c <= "Z") or ("0" <= c <= "9"):
            D += 1
            G += c
        else:
            if len(G) != 0:
                F.append(G)
            G = ""
# Flush the final word: a file that ends on a letter or digit (no trailing
# newline) used to lose its last word.
if len(G) != 0:
    F.append(G)

print("char count =", C)
print("alphanumeric count =", D)
print("line count =", E + 1)
print("word count =", len(F))

# Stop words, lower-cased and de-duplicated in file order.
Y = []
with open("stopwords.txt", "r") as Z:
    for i in Z:
        for w in i.strip().split():
            w = w.lower()
            if w not in Y:
                Y.append(w)

# Features are the lower-cased non-stop words, hashed when requested.
J = []
for i in F:
    i = i.lower()
    if i not in Y:
        J.append(feature_hashing(i, M) if use_hashing else i)

# dict keeps insertion order, so pairs come out in first-appearance order.
counts = {}
for key in J:
    counts[key] = counts.get(key, 0) + 1
N = [[key, n] for key, n in counts.items()]
print("BoW =", N)
   

    
# 6330438721 (23.15) 269 (2021-03-22 22:04)
file = input("File name = ")
ans = input("Use feature hashing ? (y,Y,n,N) ")

# Compare against the four accepted answers explicitly.  The previous
# substring test (`ans not in "yYnN"`) also accepted "" and inputs such as
# "yY", after which neither processing branch below would run.
while ans not in ("y", "Y", "n", "N"):
    print("Try again.")
    ans = input("Use feature hashing ? (y,Y,n,N) ")

# Both handles stay open here; they are closed at the end of the script.
fn = open(file,"r")
fs = open("stopwords.txt", "r")
# Stop words exactly as written in the file (whitespace separated).
stop = []
for e in fs:
    stop += e.split()
# Counters filled in by the processing section below.
backn = 0
cha_c = 0
alp_c = 0
line_c = 0
w_c = 0
words = ""
def fhash(word, M):
    """Return the base-37 polynomial hash of ``word`` modulo ``M``."""
    return sum(ord(ch) * 37 ** pos for pos, ch in enumerate(word)) % M
# Counting and bag-of-words, shared by the hashed and plain modes.
if ans in ("Y", "y", "N", "n"):
    use_hashing = ans in ("Y", "y")
    if use_hashing:
        M = int(input("M = "))
    print("-------------------")
    for line in fn:
        cha_c += len(line)
        for e in line:
            if e == "\n":
                backn += 1
            if e in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789":
                alp_c += 1
                words += e
            else:
                # Every non-alphanumeric character becomes a word separator.
                words += " "
        line_c += 1
    # Split once after the loop.  Doing it inside the loop was quadratic and
    # left list_words undefined (NameError) for an empty file.
    list_words = words.split()
    w_c = len(list_words)
    print("char count =", cha_c - backn)
    print("alphanumeric count =", alp_c)
    print("line count =", line_c)
    print("word count =", w_c)
    # NOTE(review): the stop-word test lower-cases the word, but the word is
    # counted/hashed in its original case -- confirm that is intended.
    not_stop_words = [w for w in list_words if w.lower() not in stop]
    if use_hashing:
        keys = sorted(fhash(m, M) for m in not_stop_words)
    else:
        keys = not_stop_words
    # dict keeps insertion order, so pairs follow the order of `keys`.
    counts = {}
    for k in keys:
        counts[k] = counts.get(k, 0) + 1
    pairs = [[k, n] for k, n in counts.items()]
    print("BoW =",pairs)

fn.close()
fs.close()
# 6330439321 (12.00) 270 (2021-03-21 22:31)

# Flag for the prompt loop below; set to False once a report has been printed.
running = True

def fhash(word: str, M: int) -> int:
    """Return the base-37 polynomial hash of ``word`` modulo ``M``."""
    total = 0
    for position, ch in enumerate(word):
        total += ord(ch) * 37 ** position
    return total % M

def bow(words: list, command: str, M: int) -> list:
    """Return ``[key, count]`` pairs for ``words``.

    When ``command == 'y'`` the keys are the sorted ``fhash(word, M)`` values;
    otherwise the keys are the words themselves in first-appearance order.
    """
    if command == 'y':
        keys = sorted(fhash(w, M) for w in words)
    else:
        keys = words

    pairs = []
    for key in keys:
        for pair in pairs:
            if key == pair[0]:
                pair[1] += 1
                break
        else:
            # Not seen yet: start a new pair.
            pairs.append([key, 1])
    return pairs


# Prompt until a valid hashing answer is given, then print the report once.
while running:

    file_name = input('File name = ')
    cmd = input('Use feature hashing ? (y,Y,n,N) ').lower()

    if cmd not in ('y', 'n'):
        print('Try again.')
        continue

    # NOTE(review): stop words are hard-coded here rather than read from a
    # file -- confirm this list is what the assignment expects.
    stopwords = ['it', 'they', 'the', 'a', 'an', 'of', 'on', 'in', 'at', 'is', 'am', 'are', 'was', 'were']

    # Read the whole file; `with` closes the handle (it was left open before).
    with open(file_name, 'r') as f:
        lines = f.readlines()
    n_line = len(lines)
    file_line = ''.join(lines)

    if cmd == 'y':
        M = int(input('M = '))
    else:
        M = 0

    # Characters excluding newlines.  Counting the newlines directly is also
    # correct for an empty file and for a file that ends with a newline,
    # where `len - (n_line - 1)` was off by one.
    print('char count =', len(file_line) - file_line.count('\n'))

    n_alphanumeric = 0
    words = list()
    tempword = ''

    for c in file_line:
        if c.isalpha() or c.isdigit():
            tempword += c
            n_alphanumeric += 1
        elif tempword:
            words.append(tempword.lower())
            tempword = ''
    # Flush the last word when the file ends on a letter or digit.
    if tempword:
        words.append(tempword.lower())

    print('alphanumeric count =', n_alphanumeric)
    print('line count =', n_line)
    print('word count =', len(words))

    not_stop_words = [w for w in words if w not in stopwords]

    print(bow(not_stop_words, cmd, M))

    running = False

# 6330440921 (26.00) 271 (2021-03-22 12:33)

file_name = input('File name = ').strip() # sample.txt
check = ''
wordcount = line = alphacount = charcount = 0
# Collect the whitespace-separated stop words from stopwords.txt.
stwords = []
with open('stopwords.txt', "r") as stopwords:
    for stline in stopwords:
        stwords.extend(stline.split())
def punc(sentence):
    """Split ``sentence`` into alphanumeric words.

    Returns the list of words with the number of alphanumeric characters
    appended as the last element.
    """
    cleaned = []
    alnum_total = 0
    for ch in sentence:
        if ch.isalnum():
            cleaned.append(ch)
            alnum_total += 1
        else:
            # Anything else acts as a separator.
            cleaned.append(' ')
    return ''.join(cleaned).split() + [alnum_total]
def backn(sentence):
    """Return True when ``sentence`` ends with a newline, otherwise False.

    Previously this returned None for the negative case and raised
    IndexError on an empty string.
    """
    return sentence.endswith('\n')
def fhash(word, M):
    """Return the base-37 polynomial hash of ``word`` modulo ``M``."""
    total = 0
    weight = 1
    for ch in word:
        total += ord(ch) * weight
        weight *= 37
    return total % M
def find_all(word, sentence):
    """Return how many elements of ``sentence`` are equal to ``word``."""
    return sum(1 for item in sentence if word == item)
def unique(list_all, list_stop): #all_word, stwords
    """Return the elements of ``list_all`` not in ``list_stop``.

    Order and repeated elements are kept; despite the name, nothing is
    de-duplicated.
    """
    return [e for e in list_all if e not in list_stop]

all_word, bow, BoW = [], [], []
with open(file_name, "r") as fname:
    for line_f in fname:
        all_word += punc(line_f.lower())[:-1]
        # punc() returns the words followed by the alphanumeric count;
        # unpack once instead of calling it again for each statistic.
        *new_line, alpha = punc(line_f)
        wordcount += len(new_line)
        line += 1
        alphacount += alpha
        # The trailing newline, when present, does not count as a character.
        if backn(line_f):
            charcount += len(line_f) - 1
        else:
            charcount += len(line_f)
unique_list = unique(all_word, stwords)
# Ask until the answer is y or n.  The old condition
# (check != 'n' or check != 'y') was always true and relied on `break`.
while check not in ('n', 'y'):
    check = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if check == 'y':
        M = int(input('M = '))
    elif check != 'n':
        print('Try again.')
print('-'*20)
print('char count =', charcount)
print('alphanumeric count =', alphacount)
print('line count =', line)
print('word count =', wordcount)
if check == 'y':
    keys = [fhash(e, M) for e in unique_list]
else:
    keys = unique_list
# Count each key once, then sort the [key, count] pairs.  This also covers
# the no-words case, where indexing bow[-1] used to raise IndexError.
counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
bow = sorted([key, n] for key, n in counts.items())
BoW = list(bow)
print('BoW =', BoW)
# 6330441521 (30.00) 272 (2021-03-21 23:02)
fn = input("File name = ")
x = 0
file_name = open(fn,'r')
# x becomes 1 for feature hashing and stays 0 otherwise.
fh = input("Use feature hashing ? (y,Y,n,N,) ")
while fh not in ['y','Y','n','N']:
    print("Try again.")
    fh = input("Use feature hashing ? (y,Y,n,N,) ")
x = 1 if fh in ['y','Y'] else 0
if x == 1:
    M = int(input("M = "))

# Stop words: the tokens of every non-blank line, lower-cased.
stop = open('stopwords.txt','r')
sw = [tokens for tokens in (line.split() for line in stop) if tokens != []]
stopwords = [c.lower() for d in sw for c in d]

line_count = char_count = an_count = 0
w = ''
lw = []
list_word = []
word = ''
for line in file_name:
    line_count += 1
    char_count += len(line.strip('\n'))
    # The extra trailing space flushes a word that ends the line.
    for d in line + ' ':
        is_alnum = 'A' <= d <= 'Z' or 'a' <= d <= 'z' or '0' <= d <= '9'
        if is_alnum:
            word += d
            an_count += 1
        elif word != '':
            list_word.append(word.lower())
            word = ''

# Distinct non-stop words in first-appearance order.
list_bow = []
for d in list_word:
    if d in stopwords or d in list_bow:
        continue
    list_bow.append(d)
        
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * 37 ** idx for idx, ch in enumerate(w)) % M

# Build the bag-of-words: plain words when x == 0, hash buckets when x == 1.
BoW = []
if x == 0:
    BoW = [[d, list_word.count(d)] for d in list_bow]
elif x == 1:
    list_fh = []   # distinct hash values, first-appearance order
    cfh = []       # one hash value per counted word occurrence
    for d in list_bow:
        fh = fhash(d,M)
        cfh.extend([fh] * list_word.count(d))
        if fh not in list_fh:
            list_fh.append(fh)
    BoW = [[d, cfh.count(d)] for d in list_fh]


print("-------------------")
print("char count =",char_count)
print("alphanumeric count =",an_count)
print("line count =",line_count)
print('word count =',len(list_word))
print('BoW =',BoW)
file_name.close()
stop.close()


        
        
        

    
                
# 6330443821 (20.65) 273 (2021-03-21 18:53)
def bow_for_yes(lst, M):
    """Return sorted ``[hash, count]`` pairs for the words in ``lst``.

    Each word is mapped to ``fhash(word, M)`` and words sharing a hash are
    counted together.  An empty ``lst`` now yields ``[]``; it used to raise
    UnboundLocalError because the working lists were only created inside
    ``if lst:``.
    """
    counts = {}
    for word in lst:
        bucket = fhash(word, M)
        counts[bucket] = counts.get(bucket, 0) + 1
    return sorted([bucket, n] for bucket, n in counts.items())
def bow(lst):
    """Return sorted ``[word, count]`` pairs for the words in ``lst``.

    An empty ``lst`` now yields ``[]``; it used to raise UnboundLocalError
    because ``uniq`` was only created inside ``if lst:``.
    """
    counts = {}
    for word in lst:
        counts[word] = counts.get(word, 0) + 1
    return sorted([word, n] for word, n in counts.items())
def all_lower(x_file):
    """Return the lower-cased, punctuation-free non-stop words of ``x_file``.

    Stop words are read from ``stopwords.txt``.  A whitespace-separated token
    is dropped when it is a stop word either as written or after its
    non-alphanumeric characters are removed, so "the," is filtered like
    "the".  Tokens with no alphanumeric character are skipped instead of
    being returned as empty strings.
    """
    # `with` closes both files; the stop-word handle used to be leaked.
    with open(x_file, 'r') as file, open('stopwords.txt', 'r') as stop_file:
        stop_word = {token.lower() for line in stop_file for token in line.split()}

        lst = []
        for line in file:
            for token in line.split():
                lowered = token.lower()
                cleaned = ''.join(ch for ch in lowered if ch.isalnum())
                if not cleaned:
                    continue
                if lowered in stop_word or cleaned in stop_word:
                    continue
                lst.append(cleaned)
    return lst
def alphanumeric_count(file_name):
    """Return the number of alphanumeric characters in the file ``file_name``."""
    summ = 0
    with open(file_name, 'r') as file:
        for line in file:
            summ += sum(1 for ch in line.strip('\n') if ch.isalnum())
    return summ

def fhash(strr, M):
    """Return the base-37 polynomial hash of ``strr`` modulo ``M``."""
    summ = 0
    weight = 1
    for ch in strr:
        summ += ord(ch) * weight
        weight *= 37
    return summ % M

def countt(file_name):
    """Return ``(characters, words, lines)`` for the file ``file_name``.

    Newlines are stripped from each line before its characters are counted;
    words are whitespace-separated tokens.
    """
    num_of_char = num_of_words = num_of_lines = 0
    with open(file_name, 'r') as file:
        for raw in file:
            text = raw.strip("\n")
            num_of_lines += 1
            num_of_words += len(text.split())
            num_of_char += len(text)
    return num_of_char, num_of_words, num_of_lines




file_name = input("File name = ")
x = input("Use feature hashing ? (y,Y,n,N) ")
while x not in ['y', 'Y', 'n', 'N']:
    print("Try again.")
    x = input("Use feature hashing ? (y,Y,n,N) ")

# M is only requested in hashing mode and is converted to int when used.
if x in ['y', 'Y']:
    M = input("M = ")

print("-------------------")
num_of_char, num_of_words, num_of_lines = countt(file_name)
print(f"char count = {num_of_char}")
print(f"alphanumeric count = {alphanumeric_count(file_name)}")
print(f"line count = {num_of_lines}")
print(f"word count = {num_of_words}")
if x in ['n', 'N']:
    print(f"BoW = {bow(all_lower(file_name))}")
else:
    print(f"BoW = {bow_for_yes(all_lower(file_name), int(M))}")
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        

# 6330444421 (30.00) 274 (2021-03-22 15:48)

def isAl(w):
  """Return True when ``w`` is an ASCII digit or a lower-case ASCII letter."""
  code = ord(w)
  return 48 <= code <= 57 or 97 <= code <= 122

# Count how often each item occurs in `words`; stop words are not handled here.
def bagOfWord(words):
  """Sort ``words`` in place and return ``[item, count]`` pairs in sorted order."""
  words.sort()
  BoW = []
  for w in words:
    if BoW and BoW[-1][0] == w:
      # Same item as the previous one: extend the current run.
      BoW[-1][1] += 1
    else:
      BoW.append([w, 1])
  return BoW
# Map every word in `words` to its hash value.
def fhash(words, M):
  """Return the list of base-37 polynomial hashes (mod ``M``), one per word."""
  return [sum(ord(c) * 37 ** i for i, c in enumerate(w)) % M for w in words]
def removeStopWords(words, stopWords):
  """Return the items of ``words`` not contained in ``stopWords``, in order."""
  return [w for w in words if w not in stopWords]
def detail(fileText):
  """Compute the text statistics for ``fileText``.

  Returns ``[postprocessText, words, lineCount, charCount, wordCount,
  alphaCount]``.  The text is lower-cased and every character rejected by
  isAl() becomes a space.
  """
  lowered = fileText.lower()
  alphaCount = 0
  pieces = []
  for c in lowered:
    if isAl(c):
      alphaCount += 1
      pieces.append(c)
    else:
      pieces.append(" ")
  postprocessText = "".join(pieces)
  words = postprocessText.split()
  newlines = lowered.count("\n")
  lineCount = newlines + 1
  # Characters excluding the newline characters.
  charCount = len(lowered) - newlines
  return [postprocessText, words, lineCount, charCount, len(words), alphaCount]





def main():
  """Prompt for a file and hashing mode, then print the text statistics.

  Prints the character, alphanumeric, line and word counts followed by the
  bag-of-words, which uses hash buckets when hashing is selected.
  """
  filename = input("File name = ")
  # Read both files up front inside `with` so the handles are closed;
  # neither was ever closed before.
  with open(filename, "r") as file:
    fileText = file.read()
  with open("stopwords.txt", "r") as stopFile:
    stopWords = stopFile.read()

  useHash = input("Use feature hashing ? (y,Y,n,N) ")
  while useHash not in ["y","Y","n","N"]:
    print("Try again.")
    useHash = input("Use feature hashing ? (y,Y,n,N) ")

  d = detail(fileText)
  words, lineCount, charCount, wordCount, alphaCount = d[1],d[2],d[3],d[4],d[5]

  rawText = removeStopWords(words, stopWords.replace("\n"," ").split())

  hashed = useHash in ["y","Y"]
  if hashed:
    # M is requested before the report is printed.
    M = int(input("M = "))
  print("-------------------")
  print("char count =", charCount)
  print("alphanumeric count =", alphaCount)
  print("line count =", lineCount)
  print("word count =", wordCount)
  if hashed:
    print("BoW =", bagOfWord(fhash(rawText, M)))
  else:
    print("BoW =", bagOfWord(rawText))


# Run the interactive report.
main()
# 6330445021 (26.67) 275 (2021-03-21 22:50)
#Prog-08: Bag-of-words
# 6330445021 (26.67) Matt Yongpiyakul
def file():
    """Open the file named by the module-level ``file_name`` for reading and return it."""
    return open(file_name,'r')
def words_in(file):
    """Return the lower-cased alphanumeric words found in ``file``.

    ``file`` is iterated line by line; every non-alphanumeric character is
    treated as a separator.
    """
    pieces = []
    for line in file:
        for ch in line:
            pieces.append(ch.lower() if ch.isalnum() else ' ')
    return ''.join(pieces).split()
def stopwords():
    """Return the list of stop words read from the stop-word file."""
    # NOTE(review): the file name is 'stopword.txt' (singular) -- confirm it
    # matches the file actually shipped with the program.
    # `with` closes the handle, which used to be leaked on every call.
    with open('stopword.txt','r') as sw:
        return words_in(sw)
def filtered():
    """Return the words of the input file with the stop words removed."""
    # Load the stop words once; they used to be re-read from disk for every
    # word of the input.
    stop = stopwords()
    with file() as f:
        words = words_in(f)
    return [w for w in words if w not in stop]
def characters():
    """Return the total length of all lines after ``str.strip()``."""
    # NOTE(review): strip() also removes leading/trailing spaces, not only
    # the newline -- confirm that is the intended character count.
    with file() as f:  # close the handle instead of leaking it
        return sum(len(line.strip()) for line in f)
def alnum():
    """Return the number of alphanumeric characters in the input file."""
    with file() as f:  # close the handle instead of leaking it
        # words_in() keeps only alphanumeric characters, so the summed word
        # lengths equal the alphanumeric count.
        return sum(len(word) for word in words_in(f))
def word_count():
    """Return the number of words in the input file."""
    with file() as f:  # close the handle instead of leaking it
        return len(words_in(f))
def line_count():
    """Return the number of lines in the input file."""
    with file() as f:  # close the handle instead of leaking it
        return sum(1 for _ in f)
def fhash(w):
    """Return the base-37 polynomial hash of ``w`` (no modulo applied)."""
    return sum(ord(ch) * 37 ** k for k, ch in enumerate(w))
def bow():
    """Return ``[word, count]`` pairs for the filtered words, in sorted order."""
    l = filtered()
    l.sort()
    distinct = []
    for word in l:
        if word not in distinct:
            distinct.append(word)
    return [[word, l.count(word)] for word in distinct]
def fbow(M):
    """Return ``[bucket, count]`` pairs for the filtered words, by bucket.

    Each word goes to bucket ``fhash(word) % M``; only buckets that received
    at least one word are returned, in ascending bucket order.
    """
    # Count only the buckets actually hit.  Pre-allocating M entries and
    # removing the empty ones cost O(M) memory and quadratic time.
    counts = {}
    for word in filtered():
        bucket = fhash(word) % M
        counts[bucket] = counts.get(bucket, 0) + 1
    return [[bucket, counts[bucket]] for bucket in sorted(counts)]
def hashing():
    """Ask whether to use feature hashing until y/n is given; return a bool."""
    while True:
        cond = input('Use feature hashing ? (y,Y,n,N) ').lower()
        if cond == 'y':
            return True
        if cond == 'n':
            return False
        print('Try again.')

file_name = input('File name = ')
# Keep the result under its own name so the bow() function is not clobbered.
if hashing():
    M = int(input('M = '))
    bow_result = fbow(M)
else:
    bow_result = bow()
print('-'*19)
print('char count =',characters())
print('alphanumeric count =',alnum())
# The label used to end with a space, printing "line count =  N" with two
# spaces, unlike every other line of the report.
print('line count =',line_count())
print('word count =',word_count())
print('BoW =',bow_result)
# 6330446721 (20.20) 276 (2021-03-21 14:50)

file_name = input("File name = ")
cd = input("Use feature hashing ? (y,Y,n,N) ")
# Compare against the four accepted answers explicitly; the substring test
# `cd not in "yYnN"` also accepted "" and inputs such as "yY".
while cd not in ("y", "Y", "n", "N"):
    print("Try again.")
    cd = input("Use feature hashing ? (y,Y,n,N) ")
if cd == "y" or cd == "Y":
    M = int(input("M = "))
def count_words(words,wordslist):
    """Return how many elements of ``wordslist`` equal ``words``."""
    return sum(1 for e in wordslist if e == words)
def BoW(wordslist):
    """Sort ``wordslist`` in place and return ``[item, count]`` pairs in sorted order."""
    wordslist.sort()
    distinct = []
    for item in wordslist:
        if item in distinct:
            continue
        distinct.append(item)
    return [[item, count_words(item, wordslist)] for item in distinct]
def Hash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    total = 0
    weight = 1
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % M
def cut_st_words(words,stopwords):
    """Lower-case ``words`` in place and return those not in ``stopwords``."""
    for idx, word in enumerate(words):
        words[idx] = word.lower()
    return [word for word in words if word not in stopwords]

# Collect the whitespace-separated stop words from stopwords.txt.
stopwords = []
stwfile = open("stopwords.txt","r")
for line in stwfile:
    if line:
        stopwords.extend(line.split())
stwfile.close()

# Scan the input file: character, alphanumeric and line counts plus the
# list of lower-cased words (runs of a-z / 0-9).
words = []
c_c = 0
alp_c = 0
l_c = 0
w_c = 0
with open(file_name,"r") as file:
    for line in file:
        if line[-1] == "\n" :
            line = line[:-1]
        c_c += len(line)
        line = line.lower()
        # NOTE(review): only non-empty lines are counted -- confirm that
        # blank lines are meant to be excluded from the line count.
        if len(line) > 0:
            l_c += 1

        w = ""
        for u in line:
            if 'a' <= u <= "z" or '0' <= u <= '9':
                alp_c += 1
                w += u
            elif w != "":
                words.append(w)
                w = ""
        # Flush the word that ends the line.  Without this every line ending
        # in a letter or digit lost its last word.
        if w != "":
            words.append(w)
    
final_words = cut_st_words(words,stopwords)

# In hashing mode the features are the hash buckets of the remaining words.
if cd == "y" or cd == "Y":
    final_words = [Hash(word, M) for word in final_words]
rb = BoW(final_words)


print("-------------------")
# The label used to end with a space, printing "char count =  N" with two
# spaces, unlike the other lines of the report.
print("char count =",c_c)
print("alphanumeric count =",alp_c)
print("line count =",l_c)
print("word count =",len(words))
print("BoW =",rb)
        

            










        

            










# 6330447321 (22.99) 277 (2021-03-22 02:40)

file_name = str(input('File name = ').strip())
# Ask until one of y/Y/n/N is entered; fh records whether hashing is on.
ans = input('Use feature hashing ? (y,Y,n,N) ')
while ans not in ['y','n','Y','N']:
    print('Try again.')
    ans = input('Use feature hashing ? (y,Y,n,N) ')
fh = ans in ['Y','y']
if fh:
    M = int(input('M = '))
print('-------------------')

# Read the file once and derive the character, alphanumeric, line and word
# counts from it.
with open(file_name,'r') as f:
    L7 = f.readlines()
l = len(L7)
w = ''   # whole text, each newline replaced by a single space
d = 0    # number of newline characters replaced
for e in L7:
    if '\n' in e :
        w += e[:-1]+' '
        d += 1
    else :
        w += e
# z is the index of the first line from the end that is not just "\n"; it is
# subtracted from the line count below.  Starting at 0 keeps it defined for
# an empty file (previously a NameError).
L7 = L7[::-1]
z = 0
for z in range(len(L7)):
    if L7[z] != '\n':break
print('char count =',len(w)-d)

c = sum(1 for ch in w if ch.isalpha() or ch.isnumeric())
print('alphanumeric count =',c)
print('line count =',l-z)
L,L2 = [],[] # L: all lower-cased words; L2 is filled later without stop words
w3 = ''
for ch in w:
    if ch.isalpha() or ch.isnumeric():
        w3 += ch
    else :
        if w3 != '' :
            L.append(w3.lower())
        w3 = ''
# Flush the last word when the file does not end with a newline; it used to
# be dropped.
if w3 != '' :
    L.append(w3.lower())
print('word count =',len(L))

# Stop words as written in stopwords.txt; `with` closes the handle, which
# was never closed before.
L1,ww = [],''
with open('stopwords.txt','r') as g:
    for line in g:
        L1.extend(line.split())

# L2: the words of L that are not stop words.
L2 = [h for h in L if h not in L1]

# Count each distinct word of L2.  dict keeps insertion order, so wL3 and L3
# stay in first-appearance order.
counts = {}
for word in L2:
    counts[word] = counts.get(word, 0) + 1
wL3 = list(counts)
L3 = [[word, n] for word, n in counts.items()]
def fhash(y,M):
    """Return the base-37 polynomial hash of ``y`` modulo ``M``."""
    return sum(ord(ch) * 37 ** idx for idx, ch in enumerate(y)) % M

# Print the bag-of-words: word counts as they are, or merged per hash bucket.
L3.sort()
L5,L6 = [],[]
if not fh:
    print('BoW =',L3)
else :
    L4 = [[fhash(word,M),n] for word, n in L3]
    for pair in L4:
        if pair[0] not in L6:
            # First word landing in this bucket.
            L5.append(pair)
            L6.append(pair[0])
        else:
            # Bucket already present: add this word's count to it.
            for merged in L5:
                if merged[0] == pair[0]:
                    merged[1] += pair[1]
    print('BoW =',sorted(L5))
# 6330448021 (0.00) 278 (2021-03-22 23:58)
# The prompt used to read 'Flie name' (misspelt, no separator before the
# user's input).
file_name = input('File name = ')
use_feature = input('Use feature hashing ? (y,Y,n,N) ')
def blank(t):
    """Return ``t`` with each of the characters ``" ' / \\ , . : ;`` replaced by a space."""
    return "".join(" " if c in "\"\'/\\,.:;" else c for c in t)
def flash(w,M) :
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    total = 0
    weight = 1
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % M
# Count lines, words, characters and alphanumeric characters of the file.
number_of_lines = 0
number_of_words = 0
number_of_characters = 0
number_of_alphanumeric = 0
# Iterate the file directly: a stray readline() before the loop used to
# drop the first line from every count.
with open(file_name,"r") as fin:
    for line in fin:
        line = line.strip("\n")
        words = line.split()
        number_of_lines += 1
        number_of_words += len(words)
        number_of_characters += len(line)
        number_of_alphanumeric += sum(1 for c in line if c.isalnum())

# Re-ask only while the answer is invalid, then always print the report.
# The report used to live inside this loop, so a valid first answer printed
# nothing at all.
while use_feature not in ('y', 'Y', 'n', 'N'):
    use_feature = input('Use feature hashing ? (y,Y,n,N) ')
if use_feature in ('y', 'Y'):
    M = int(input('M = '))
print(19*'-')
print('char count =',number_of_characters)
print('alphanumeric count =',number_of_alphanumeric)
print('line count =',number_of_lines)
print('word count =',number_of_words)
# TODO: the bag-of-words itself was never implemented in this script.
print('BoW = ')
	

# 6330449621 (28.00) 279 (2021-03-22 23:52)

def fhash(word, M):
    """Return the base-37 polynomial hash of ``word`` modulo ``M``.

    An empty ``word`` hashes to 0; it used to raise UnboundLocalError
    because the result was only assigned inside the loop.
    """
    sum_ord_num = 0
    for i, ch in enumerate(word):
        sum_ord_num += ord(ch) * (37 ** i)
    return sum_ord_num % M

#--------------------------------------------------------------------------------------------------

file_name = input("File name = ")
c = input("Use feature hashing ? (y,Y,n,N) ")
check = c.lower()

# Two fixes:
#  * ("yn") is a plain string, so `check not in ("yn")` was a substring
#    test that accepted "" and "yn"; compare against the two answers.
#  * keep `c` in step with `check` on every retry, because later code
#    branches on `c`.
while check not in ("y", "n"):
    print("Try again.")
    c = input("Use feature hashing ? (y,Y,n,N) ")
    check = c.lower()

# One pass over the file: character, alphanumeric and line counts plus the
# text with every non-alphanumeric character replaced by a space.
char = 0
alpha = 0
noline = 0
file_line = ""
# The file used to be opened twice, with the first handle never closed.
# f1 stays bound (closed) after the `with`, so closing it again is harmless.
with open(file_name, "r") as f1:
    for i in f1 :
        noline += 1
        for e in i:
            if e.isalnum():
                file_line += e
                alpha += 1
            else:
                file_line += ' '

            if e != '\n':
                char += 1

if check == 'y':
    M = int(input('M = '))
else:
    M = 1
    
#---------------------------------------------------------------------------------------------------
def duplicates(numbers_list):
    """Return the values occurring more than once in ``numbers_list``.

    Each such value is listed once, in order of first appearance.
    """
    store = []
    checked = []
    for pos, value in enumerate(numbers_list):
        if value in checked:
            continue
        if any(later == value for later in numbers_list[pos + 1:]):
            store.append(value)
            checked.append(value)
    return store

#-----------------------------------------------------------------------------------------------------
word_list = file_line.lower().strip().split()
num_word = len(word_list)

# stp stays bound (closed) after the `with`, so a later close() is harmless.
with open("stopwords.txt", "r") as stp:
    stopword = []
    for line in stp :
        stopword += line.strip().split()

listofdata = [e for e in word_list if e not in stopword]

# Branch on the validated, lower-cased answer `check`.  The raw first answer
# `c` could still hold an invalid entry after a retry, leaving `b`
# undefined when it is printed.
if check == 'y':
    listoffhash = [fhash(w, M) for w in listofdata]
    keys = listoffhash
else:
    keys = listofdata

# dict keeps insertion order, so pairs come out in first-appearance order.
counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
a = list(counts)
b = [[key, n] for key, n in counts.items()]
            
#-------------------------------------------------------------------------------------------------------
# Print the report, then release the file handles.
print('-------------------')
for label, value in (
    ('char count =', char),
    ('alphanumeric count =', alpha),
    ('line count =', noline),
    ('word count =', num_word),
    ("BoW =", b),
):
    print(label, value)

f1.close()
stp.close()

# 6330450121 (14.99) 280 (2021-03-22 22:57)

#-----------------------------------------------------------------------------------
def fhash(w, M) :
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch) * 37 ** idx for idx, ch in enumerate(w)) % M
def get_b( words , stopWords , Bow_Con , M ) :
    """Return the sorted bag-of-words for ``words``.

    Every word is lower-cased character by character and skipped when it is
    in ``stopWords``.  The key is ``fhash(word, M)`` when ``Bow_Con`` is
    truthy, otherwise the word itself.  Returns sorted ``[key, count]``
    pairs.
    """
    r = []
    for raw in words:
        c = ''.join(ch.lower() for ch in raw)
        if c in stopWords:
            continue
        key = fhash(c, M) if Bow_Con else c
        for entry in r:
            if entry[0] == key:
                entry[1] += 1
                break
        else:
            # Key not seen yet.
            r.append([key, 1])
    r.sort()
    return r
#-----------------------------------------------------------------------------------------------
#tired
M = 0

file_name = input('File name = ')
# Ask until one of y/Y/n/N is entered; Bow_Con ends up as a bool.
answer = input('Use feature hashing ? (y,Y,n,N) ')
while answer not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    answer = input('Use feature hashing ? (y,Y,n,N) ')

Bow_Con = answer in ['y', 'Y']
if Bow_Con:
    M = int(input('M = '))
print('-------------------')

#---------------------------------------

# Stop words from stopwords.txt, lower-cased and de-duplicated in file order.
r = []
stopWordsFile = open('stopwords.txt', 'r')
for line in stopWordsFile:
    for w in line.strip().split():
        w = w.lower()
        if w not in r:
            r.append(w)
stopWords = r
stopWordsFile.close()

#-----------------------------------
# Scan the input file.
kyow = 0     # characters, excluding newlines
kyow2 = 0    # ASCII letters and digits
words = list()
lineCount = 0

with open(file_name, 'r') as wordsFile:
    for line in wordsFile:
        lineCount = lineCount + 1
        word = str()
        for c in line:
            if c != '\n':
                kyow = kyow + 1
            if ('0' <= c <= '9') or ('A' <= c <= 'Z') or ('a' <= c <= 'z'):
                # Alphanumeric hits used to be added to kyow, inflating the
                # character count and leaving the alphanumeric count at 0.
                kyow2 = kyow2 + 1
                word = word + c
            else:
                if len(word) != 0:
                    words.append(word)
                word = str()
        # Flush a word that ends the line without a following separator
        # (a last line with no trailing newline).
        if len(word) != 0:
            words.append(word)



print( 'char count =' , kyow )

print( 'alphanumeric count =' , kyow2 )

print( 'line count =' , lineCount )

print( 'word count =' , len(words) )

print( 'BoW =' , get_b(words, stopWords, Bow_Con, M) )

# 6330452421 (25.00) 281 (2021-03-22 15:00)
file_name = input("File name = ",)
choose = input("Use feature hashing ? (y,Y,n,N) ")

# Compare against whole answers. The previous substring test (`in "NY"`)
# also accepted "" and "NY", for which no later branch builds BoW.
while choose.upper() not in ("N", "Y"):
    print("Try again.")
    choose = input("Use feature hashing ? (y,Y,n,N) ")

if choose.upper() == "Y":
    M = input("M = ")  # kept as text; fhash() converts it with int()

# Stop words, whitespace-separated, in file order.
list_stop = []
with open("stopwords.txt", "r") as stopfile:
    for line in stopfile:
        list_stop.extend(line.split())

char_count = 0   # characters other than newline
line_count = 0
alph_count = 0   # letters a-z (case-insensitive) and digits
new_sen = ""     # lower-cased text with every non-alphanumeric made a space
with open(file_name, "r") as fn:
    for line in fn:
        line_count += 1
        for e in line:
            if e != "\n":
                char_count += 1
            if "a" <= (e.lower()) <= "z" or "0" <= e <= "9":
                alph_count += 1
                new_sen += e.lower()
            else:
                new_sen += " "
list_word = new_sen.strip().split()
word_count = len(list_word)

print("-------------------")
print("char count =", str(char_count))
print("alphanumeric count =", str(alph_count))
print("line count =", str(line_count))
print("word count =", str(word_count))

# Words that survive stop-word removal, in reading order.
no_stop = [e for e in list_word if e not in list_stop]
def fhash(word,M):
    """Return the base-37 polynomial hash of `word`, reduced modulo int(M).

    Character i (0-based) contributes ord(char) * 37**i.
    """
    total = 0
    for position, ch in enumerate(word):
        total += ord(ch) * 37 ** position
    return total % int(M)

# Build the bag of words as [key, count] pairs in ascending key order.
# Keys are hash values when hashing was requested, otherwise the words.
BoW = []
keys = None
if choose.upper() == "Y":
    keys = [fhash(e, M) for e in no_stop]
    keys.sort()
elif choose.upper() == "N":
    no_stop.sort()
    keys = no_stop

if keys is not None:
    # One pass over the sorted keys; equal keys are adjacent. An empty key
    # list simply produces an empty BoW (this used to raise IndexError).
    for key in keys:
        if BoW and BoW[-1][0] == key:
            BoW[-1][1] += 1
        else:
            BoW.append([key, 1])

print("BoW =", BoW)

# 6330453021 (22.80) 282 (2021-03-22 20:04)
n = input('File name = ')
o = input('Use feature hashing ? (y,Y,n,N) ')
# Membership in a list compares whole answers; the previous substring test
# against 'yYnN' also accepted "" and multi-letter inputs such as "yY".
while o not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    o = input('Use feature hashing ? (y,Y,n,N) ')
if o in 'yY':
    M = input('M = ')  # kept as text; fhash() applies int() to it

# Whole input file as one string.
with open(n, 'r') as inf:
    list_inf = inf.read()

# Stop words, whitespace-separated.
with open('stopwords.txt', 'r') as stop:
    stop_word = stop.read().split()

list_inf = list_inf.lower()

# Tokenise on whitespace, then strip every character that is not a-z / 0-9
# from inside each token.
# NOTE(review): punctuation is deleted rather than treated as a separator,
# so "a,b" becomes "ab" - confirm this is the intended tokenisation.
word = list_inf.split()
word2 = ''
for token in word:
    for k in token:
        if 'a' <= k <= 'z' or '0' <= k <= '9':
            word2 += k
    word2 += ' '
word2 = word2.split()

# word4: the stop words found in the text; word3: everything else, in order.
word4 = [w for w in word2 if w in stop_word]
word3 = [w for w in word2 if w not in stop_word]

#print(word3)

#-------------------------------------
def char_count(list_inf):
    """Print the number of characters in `list_inf` that are not newlines."""
    total = sum(1 for ch in list_inf if ch not in '\n')
    print('char count = '+str(total))
#-------------------------------------
def alphanumeric_count(list_inf):
    """Print how many characters of `list_inf` are ASCII letters or digits.

    The previous single range 'A'..'z' also matched the six characters
    that sit between 'Z' and 'a' in ASCII: [ \\ ] ^ _ and the backtick.
    """
    c2 = 0
    for ch in list_inf:
        if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9':
            c2 += 1
    print('alphanumeric count = '+str(c2))
#-------------------------------------
def line_count(list_inf):
    """Print the number of lines in the file named by the global `n`.

    The `list_inf` argument is not used; the file is read again from disk.
    """
    with open(n, 'r') as d:
        c3 = sum(1 for _ in d)
    print('line count = '+str(c3))
#-------------------------------------

first_bow = []
back_bow = []
bow = []
def not_fhash(word3):
    """Print the bag of words of `word3` as [word, count] pairs.

    Pairs appear in order of first occurrence. Results accumulate in the
    module-level lists first_bow, back_bow and bow.
    """
    for token in word3:
        if token in first_bow:
            continue
        first_bow.append(token)
        back_bow.append(word3.count(token))
    for token, freq in zip(first_bow, back_bow):
        bow.append([token, freq])
    print('BoW = '+str(bow))
numword = []
ffirst_bow = []
fback_bow = []
fbow = []
G = 37
def fhash(word3, M):
    """Hash every word of `word3` and print the hashed bag of words.

    Each word hashes to sum(ord(ch) * G**i) % int(M). The output is a list
    of [hash, count] pairs in ascending hash order. Results accumulate in
    the module-level lists numword, ffirst_bow, fback_bow and fbow.
    """
    for token in word3:
        s = 0
        for p, k in enumerate(token):
            s += ord(k) * (G ** p)
        numword.append(s % int(M))
    numword.sort()
    # The former `elif word3[i] in ffirst_bow: pass` branch did nothing but
    # index word3 with a numword index, which can go out of range; removed.
    for value in numword:
        if value not in ffirst_bow:
            ffirst_bow.append(value)
            fback_bow.append(numword.count(value))
    for value, freq in zip(ffirst_bow, fback_bow):
        fbow.append([value, freq])
    print('BoW = '+str(fbow))

# Report: separator, the four statistics, then the bag of words.
print('-------------------')
char_count(list_inf)
alphanumeric_count(list_inf)
line_count(list_inf)
print(f'word count = {len(word2)}')
if o in 'nN':
    not_fhash(word3)
elif o in 'yY':
    fhash(word3, M)

# 6330454721 (25.15) 283 (2021-03-22 18:48)
def fhash(w,M) :
    """Return the base-37 polynomial hash of `w`, reduced modulo int(M)."""
    G = 37
    return sum(ord(ch) * (G**i) for i, ch in enumerate(w)) % int(M)

#-------------
def charcount(file_name) :
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name,"r") as fin :
        return sum(1 for line in fin for e in line if e != "\n")
#------------
def alphanum(file_name) :
    """Return how many characters in the file are ASCII letters or digits."""
    total = 0
    with open(file_name,"r") as fin :
        for line in fin :
            total += sum(
                1 for e in line
                if ("0" <= e <= "9") or ("a" <= e <= "z") or ("A" <= e <= "Z")
            )
    return total
#-----------
def linecount(file_name) :
    """Return the number of lines in the file."""
    with open(file_name,"r") as fin :
        return sum(1 for _ in fin)
#-----------
def wordcount(file_name) :
    """Return the number of whitespace-separated words in the file.

    The listed punctuation characters are deleted (not turned into spaces)
    before the text is split.
    """
    dropped = "\"\'/\\,.:;-()><|[]{}_"
    with open(file_name,"r") as fin :
        kept = "".join(e for line in fin for e in line if e not in dropped)
    return len(kept.split())
#-----------
def bow(file_name) :
    """Return the bag of words of the file as sorted [word, count] pairs.

    Text is lower-cased, the listed punctuation characters are deleted, the
    result is split on whitespace, and words found in stopwords.txt (read
    the same way) are left out.
    """
    dropped = "\"\'/\\,.:;-()><|[]{}_"

    def load(path) :
        # Read one file into a list of lower-cased, punctuation-free tokens.
        with open(path,"r") as handle :
            text = "".join(
                e.lower() for line in handle for e in line if e not in dropped
            )
        return text.split()

    stopword = load("stopwords.txt")
    kept = sorted(tok for tok in load(file_name) if tok not in stopword)

    # Equal words are adjacent after sorting, so one pass groups them.
    last = []
    for tok in kept :
        if last and last[-1][0] == tok :
            last[-1][1] += 1
        else :
            last.append([tok,1])
    return last
#-----------
def bowy(file_name,M) :
    """Return the hashed bag of words of the file as sorted [hash, count] pairs.

    Words are prepared as in bow(): lower-cased, the listed punctuation
    deleted, split on whitespace, stop words (from stopwords.txt) removed.
    Each remaining word is replaced by fhash(word, M) before counting.

    The former `out[:M]` truncation is gone: for a positive M there can be
    at most M distinct hash values, so it never applied, and because M
    arrives as text the slice would have raised TypeError.
    """
    dropped = "\"\'/\\,.:;-()><|[]{}_"

    def load(path) :
        # Read one file into a list of lower-cased, punctuation-free tokens.
        with open(path,"r") as handle :
            text = "".join(
                e.lower() for line in handle for e in line if e not in dropped
            )
        return text.split()

    stopword = load("stopwords.txt")
    hashed = sorted(
        fhash(tok,M) for tok in load(file_name) if tok not in stopword
    )

    # Equal hashes are adjacent after sorting, so one pass groups them.
    out = []
    for value in hashed :
        if out and out[-1][0] == value :
            out[-1][1] += 1
        else :
            out.append([value,1])
    return out
#-----------
def show() :
    """Print the separator line and the four statistics for `file_name`."""
    print("-------------------")
    report = (
        ("char count =", charcount),
        ("alphanumeric count =", alphanum),
        ("line count =", linecount),
        ("word count =", wordcount),
    )
    for label, counter in report :
        print(label, counter(file_name))
#----------
    
# Ask for the file and the hashing choice, then print statistics and BoW.
file_name = input("File name = ")
feature = input("Use feature hashing ? (y,Y,n,N) ")
while feature not in ("y", "Y", "n", "N") :
    print("Try again.")
    feature = input("Use feature hashing ? (y,Y,n,N) ")

if feature in ("n", "N") :
    show()
    print("BoW =", bow(file_name))
else :
    M = input("M = ")  # kept as text; fhash() applies int() to it
    show()
    print("BoW =", bowy(file_name, M))
    
#----------
# 6330455321 (26.00) 284 (2021-03-19 00:28)
def fhash(w,M) :
    """Return the base-37 polynomial hash of `w` modulo M."""
    return sum(ord(ch) * (37**i) for i, ch in enumerate(w)) % M
def chcount(w) :
    """Return how many items of `w` are not the newline character."""
    return sum(1 for ch in w if ch != '\n')
def alpcount(w) :
    """Return how many characters of `w` are ASCII letters or digits."""
    def is_alnum(ch) :
        return 'a' <= ch <= 'z' or "A" <= ch <= "Z" or '0' <= ch <= '9'
    return len([ch for ch in w if is_alnum(ch)])
def wor(w) :
    """Split `w` into lower-cased words made only of ASCII letters and digits.

    Every other character acts as a separator.
    """
    pieces = []
    current = ""
    # The trailing space guarantees the final word is flushed.
    for ch in w + " " :
        if 'a'<= ch <= 'z' or "A" <= ch <= "Z" or '0' <= ch <= '9' :
            current += ch
            continue
        if current != "" :
            pieces.append(current.lower())
        current = ""
    return pieces
def bowN(W,stop) :
    """Return the bag of words of `W` as sorted [word, count] pairs.

    Words whose lower-cased form is in `stop` are skipped. An empty result
    is returned as [] (indexing the last element used to raise IndexError
    when nothing survived the filter).
    """
    kept = sorted(e for e in W if e.lower() not in stop)
    fin = []
    # Equal words are adjacent after sorting, so one pass groups them.
    for word in kept :
        if fin and fin[-1][0] == word :
            fin[-1][1] += 1
        else :
            fin.append([word,1])
    return fin
def bowY(W,stop,M) :
    """Return the hashed bag of words of `W` as sorted [hash, count] pairs.

    Words found in `stop` are skipped; every other word is replaced by
    fhash(word, int(M)). An empty result is returned as [] (indexing the
    last element used to raise IndexError when nothing survived the filter).
    """
    hashed = sorted(fhash(e,int(M)) for e in W if e not in stop)
    fin = []
    # Equal hashes are adjacent after sorting, so one pass groups them.
    for value in hashed :
        if fin and fin[-1][0] == value :
            fin[-1][1] += 1
        else :
            fin.append([value,1])
    return fin
file_name = input("File name = ")
# Read both files up front and close them; the handles used to stay open.
with open(file_name, "r") as line :
    lines = line.readlines()
stop = []
with open('stopwords.txt', "r") as sto :
    for i in sto :
        stop += wor(i)

YN = input("Use feature hashing ? (y,Y,n,N) ")
while YN not in ['y','Y','n','N']:
    print('Try again')
    YN = input("Use feature hashing ? (y,Y,n,N) ")

use_hash = YN in ['y','Y']
if use_hash :
    Mm = int(input("M = "))
print('-------------------')

# The statistics are the same in both modes; only the last line differs.
su = 0      # characters, newlines excluded
alsu = 0    # ASCII letters and digits
lincn = 0   # lines
wo = []     # lower-cased words in reading order
for y in lines :
    su += int(chcount(y))
    alsu += int(alpcount(y))
    lincn += 1
    wo += wor(y)
print('char count =',su)
print('alphanumeric count =',alsu)
print('line count =',lincn)
print('word count =',len(wo))
if use_hash :
    print('BoW =',bowY(wo,stop,Mm))
else :
    print('BoW =',bowN(wo,stop))
    
# 6330458221 (30.00) 285 (2021-03-20 15:34)

ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def fhash(w, M):
    """Return the base-37 polynomial hash of `w` modulo M."""
    fh = 0
    for i in range(len(w)):
        fh += ord(w[i])*(37**(i))
    fh %= M
    return fh


file_name = input('File name = ')

use = input('Use feature hashing ? (y,Y,n,N) ')
while use not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    use = input('Use feature hashing ? (y,Y,n,N) ')

if use in ('y', 'Y'):
    M = int(input('M = '))

print('-------------------')

# Stop words: non-alphanumerics act as separators, words are lower-cased.
with open('stopwords.txt', 'r') as stop_word:
    stop_text = ''.join(
        c if c in ALNUM else ' ' for s in stop_word for c in s
    )
stop = [s.lower() for s in stop_text.split()]

with open(file_name, 'r') as f:
    file = list(f)

# Count every character except the newline itself. The old version removed
# one character per line unless the line's text equalled the last line's
# text, which miscounted a trailing newline and repeated lines.
char_count = sum(len(s) - s.count('\n') for s in file)
print('char count = ' + str(char_count))

alphanumeric_count = sum(1 for s in file for c in s if c in ALNUM)
print('alphanumeric count = ' + str(alphanumeric_count))

line_count = len(file)
print('line count = ' + str(line_count))

word_text = ''.join(c if c in ALNUM else ' ' for s in file for c in s)
word = [s.lower() for s in word_text.split()]
print('word count = ' + str(len(word)))

tword = [w for w in word if w not in stop]

# Keys are the words themselves, or their hash values when hashing is on.
if use in ('n', 'N'):
    keys = tword
else:
    keys = [fhash(w, M) for w in tword]

counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
bow = sorted([key, total] for key, total in counts.items())
print('BoW = ' + str(bow))

# 6330459921 (22.99) 286 (2021-03-21 17:25)

# --------------------------------------------------
def fhash(w,M) :
    """Return the base-37 polynomial hash of the alphanumerics in `w`, mod M.

    Characters other than ASCII letters and digits are discarded first, so
    the exponent is the position among the kept characters.
    """
    kept = [
        ch for ch in w
        if ("a" <= ch <= "z") or ("A" <= ch <= "Z") or ("0" <= ch <= "9")
    ]
    total = 0
    for position, ch in enumerate(kept) :
        total += ord(ch) * (37**position)
    return total % M
         
# --------------------------------------------------
file_name = input("File name = ")
useBoW = input("Use feature hashing ? (y,Y,n,N)")

# Gather "char count", "alphanumeric count", "line count" and the words.
charcount = 0           # characters, newlines excluded
alphanumericcount = 0   # ASCII letters and digits
linecount = 0
words = []              # maximal alphanumeric runs, original case
with open(file_name , "r") as file :
    for line in file :
        linecount += 1
        word = ""
        for c in line :
            if c != "\n" :
                charcount += 1
            if "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9" :
                alphanumericcount += 1
                word += c
            else :
                if len(word) != 0 :
                    words.append(word)
                word = ""
        # Fix: a word that runs to the end of an unterminated last line
        # used to be dropped.
        if len(word) != 0 :
            words.append(word)

# Load the stop words: lower-cased and de-duplicated.
stopwords = []
with open( "stopwords.txt" , "r") as stopw :
    for line in stopw :
        for w in line.strip().split() :
            w = w.lower()
            if w not in stopwords :
                stopwords.append(w)

while useBoW not in ["y" , "Y" , "n" , "N"] :
    print("Try again.")
    useBoW = input("Use feature hashing ? (y,Y,n,N)")

hashing = useBoW in ["y" , "Y"]
if hashing :
    M = int(input("M = "))
print("-------------------")

# [key, count] pairs in order of first appearance; the key is the hash
# value when hashing is on, otherwise the lower-cased word.
BoW = []
for c in words :
    c = c.lower()
    if c in stopwords :
        continue
    key = fhash(c,M) if hashing else c
    for entry in BoW :
        if entry[0] == key :
            entry[1] += 1
            break
    else :
        BoW.append([key,1])

print("char count = " , charcount)
print("alphanumeric count = " , alphanumericcount)
print("line count = " , linecount)
print("word count = " , len(words))
print("BoW = " , BoW)










# 6330460421 (21.40) 287 (2021-03-20 23:08)

def remove_punc(word):
    """Return `word` with every character that is not a letter, digit or
    space replaced by a space."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789 '
    return ''.join(ch if ch.lower() in allowed else ' ' for ch in word)


# Read the input file: lower-case every line and drop its line terminator.
# (The last line used to keep a trailing "\n", which was then counted.)
with open(input("File name = "), "r") as f:
    listofline = [line.lower().rstrip("\n") for line in f]

linec = len(listofline)
charc = sum(len(line) for line in listofline)

# Join lines with a space so the last word of one line cannot merge with
# the first word of the next (they used to be concatenated directly).
strofline = remove_punc(" ".join(listofline))
# Only letters, digits and spaces remain, so dropping the spaces leaves
# exactly the alphanumeric characters.
wordstr = strofline.replace(" ", "")
listofwordReal = strofline.split()

# Stop words, lower-cased. split() also discards the newline that used to
# stay attached to the last stop word.
wordstoplist = []
with open("stopwords.txt", "r") as stop:
    for line in stop:
        wordstoplist += line.lower().split()

listofword = [w for w in listofwordReal if w not in wordstoplist]

featureh = input("Use feature hashing ? (y,Y,n,N) ")

while featureh not in ["Y", "y", "N", "n"]:
    print("Try again")
    featureh = input("Use feature hashing ? (y,Y,n,N) ")

listofword.sort()

if not featureh in ["n", "N"]:
    M = input("M = ")
    modulus = int(M)
    keys = []
    for token in listofword:
        hasn = 0
        for j in range(len(token)):
            hasn += (ord(token[j]) * 37 ** j)
        keys.append(hasn % modulus)
    keys.sort()
else:
    keys = listofword

# Equal keys are adjacent after sorting, so one pass groups them.
bowl = []
for key in keys:
    if bowl and bowl[-1][0] == key:
        bowl[-1][1] += 1
    else:
        bowl.append([key, 1])

print("-------------------")
print("char count = " + str(charc))
print("line count = " + str(linec))
print("alphanumeric count = " + str(len(wordstr)))
print("word count = " + str(len(listofwordReal)))
print("BoW = " + str(bowl))
# 6330461021 (30.00) 288 (2021-03-22 04:13)

ALNUM = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

file_read = input("File name = ")
ht = input("Use feature hashing ? (y,Y,n,N) ")
while ht not in ("y", "Y", "n", "N"):
    print("Try again.")
    ht = input("Use feature hashing ? (y,Y,n,N) ")
if ht in ("y", "Y"):
    M = int(input("M = "))

# First pass: the four statistics.
num_lines = 0
num_charc = 0   # characters, newlines excluded
num_al = 0      # ASCII letters and digits
words = ""      # text with every non-alphanumeric replaced by a space
with open(file_read, "r") as fn:
    for line in fn:
        num_lines += 1
        num_charc += len(line) - line.count("\n")
        for e in line:
            if e in ALNUM:
                num_al += 1
                words += e
            else:
                words += " "
a = words.strip().split()
num_words = len(a)
print("-------------------")
print("char count = " + str(num_charc))
print("alphanumeric count = " + str(num_al))
print("line count = " + str(num_lines))
print("word count = " + str(num_words))

# Second pass: lower-cased words of the input file.
with open(file_read, "r") as fn1:
    sample = fn1.read().replace("\n", " ")
sample_c = "".join(
    e.lower() if e.lower() in ALNUM else " " for e in sample
)
sample_b = sample_c.strip().split()

# Stop words, normalised the same way.
with open("stopwords.txt", "r") as fn2:
    stops = fn2.read().replace("\n", " ")
stop_c = "".join(
    e.lower() if e.lower() in ALNUM else " " for e in stops
)
stop_sp = stop_c.strip().split()

# sample_b1: sorted words without stop words; sample_b2: the same words
# with duplicates removed, still in sorted order.
sample_b1 = sorted(e for e in sample_b if e not in stop_sp)
sample_b2 = []
for c in sample_b1:
    if c not in sample_b2:
        sample_b2.append(c)
def count( data, element ):
    """Return how many items of `data` are equal to `element`."""
    return sum(1 for e in data if e == element)

# Frequency of every distinct word, then the [word, count] pairs.
listb = [count(sample_b1, item) for item in sample_b2]
bow = [[item, freq] for item, freq in zip(sample_b2, listb)]
def fhash(w,m):
    """Return the base-37 polynomial hash of `w` modulo `m`.

    Character i (0-based) contributes ord(char) * 37**i. An empty string
    hashes to 0 (it used to raise IndexError on w[0]).
    """
    total = 0  # renamed from `sum` so the builtin is not shadowed
    for i, ch in enumerate(w):
        total += ord(ch)*37**i
    return total % m
if ht in ("n", "N"):
    print("BoW = " + str(bow))
elif ht in ("y", "Y"):
    # Replace each word by its hash and add up the counts of words that
    # collide on the same hash value.
    merged = {}
    for entry in bow:
        key = fhash(entry[0], M)
        merged[key] = merged.get(key, 0) + entry[1]
    hbow_f = sorted([key, total] for key, total in merged.items())
    print("BoW = " + str(hbow_f))
# 6330462721 (30.00) 289 (2021-03-21 19:34)

def fhash( w , M ) :
    """Return the base-37 polynomial hash of `w`, reduced modulo int(M)."""
    total = 0
    for position, ch in enumerate(w) :
        total += ord(ch)*(37**position)
    return total % int(M)
def read(file) :
    """Return the lower-cased words of `file` in reading order.

    A word is a maximal run of a-z / 0-9 after lower-casing; every other
    character acts as a separator.
    """
    words = []
    with open(file,"r") as f :
        for line in f :
            cleaned = "".join(
                ch if ("a" <= ch <= "z" or "0" <= ch <= "9") else " "
                for ch in line.lower()
            )
            words.extend(cleaned.split())
    return words
def clear(words,stopwords):
    """Return a new list with the items of `words` that are not in
    `stopwords`, in their original order. `words` is left unchanged."""
    return [item for item in words if item not in stopwords]
def BoW_y( L,M ):
    """Return the hashed bag of words of `L` as [hash, count] pairs.

    Every word is replaced by fhash(word, M). Pairs are returned in
    ascending hash order: the original wrote `f.sort` without calling it,
    so the list was never sorted here.
    """
    hashed = sorted(fhash(word, M) for word in L)
    BoW = []
    # Equal hashes are adjacent after sorting, so one pass groups them.
    for value in hashed :
        if BoW and BoW[-1][0] == value :
            BoW[-1][1] += 1
        else :
            BoW.append([value,1])
    return BoW
def BoW_n( L ):
    """Return the bag of words of `L` as [word, count] pairs, ordered by
    first appearance."""
    unique = []
    for item in L :
        if item not in unique :
            unique.append(item)
    return [[item, L.count(item)] for item in unique]
    
    
def count(file_name):
    """Return [char count, alphanumeric count, line count, word count].

    Characters are counted without newlines. Alphanumerics are a-z / 0-9
    after lower-casing, and a word is a maximal run of them.

    The char count used to be computed as "total - lines + 1", which
    assumes the last line has no newline: it was one too high for a file
    ending in a newline and returned 1 for an empty file.
    """
    ch_count = 0 ; a_count = 0 ; l_count = 0 ; w_count = 0
    with open(file_name,"r") as f :
        for line in f :
            l_count += 1
            ch_count += len(line) - line.count("\n")
            pieces = []
            for ch in line.lower() :
                if "a" <= ch <= "z" or "0" <= ch <= "9" :
                    a_count += 1
                    pieces.append(ch)
                else :
                    pieces.append(" ")
            w_count += len("".join(pieces).split())
    return [ch_count,a_count,l_count,w_count]


file_name = input("File name = ")

words = read(file_name)
stopwords = read("stopwords.txt")

ask = input("Use feature hashing ? (y,Y,n,N) ")

while ask not in ["y","Y","n","N"] :
    print("Try again.")
    ask = input("Use feature hashing ? (y,Y,n,N) ")

if ask == "Y" or ask == "y" :
    M = input("M = ")  # kept as text; fhash() applies int() to it
    BoW = BoW_y(clear(words,stopwords),M )
else :
    BoW = BoW_n(clear(words,stopwords))
BoW.sort()

# Compute the statistics once; count() used to be called four times, which
# re-read the whole file for every printed line.
stats = count(file_name)

print("-------------------")
print("char count = "+str(stats[0]))
print("alphanumeric count = "+str(stats[1]))
print("line count = "+str(stats[2]))
print("word count = "+str(stats[3]))
print("BoW =",BoW)
print(" ")




    

    
# 6330463321 (30.00) 290 (2021-03-22 22:38)

def fhash(c,M):
    """Return the base-37 polynomial hash of `c` modulo M."""
    return sum(ord(ch) * (37**i) for i, ch in enumerate(c)) % M

T = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
t = T.lower()
n = '0123456789'

file_name = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
while b not in ['y','Y','n','N']:
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')
if b in ['y','Y']:
    M = int(input('M = '))

# Stop words, lower-cased, whitespace-separated.
stopwords = []
with open('stopwords.txt','r') as fin:
    for line in fin:
        stopwords.extend(line.lower().split())

print('-------------------')
with open(file_name,'r') as fin:
    lines = fin.readlines()

# Words: every non-alphanumeric (newlines included) acts as a separator.
a2 = ''.join(
    e if (e in T or e in t or e in n) else ' '
    for line in lines for e in line
)
new_a = a2.strip().lower().split()

# Every count is computed unconditionally. They used to be assigned only
# inside the read loops, so an empty file raised NameError.
l_count = len(lines)
aaa = ''.join(e for line in lines for e in line if e != '\n')
c_count = len(aaa)
print('char count = '+ str(c_count))

alpha = ''.join(e for e in aaa if e in T or e in t or e in n)
a_count = len(alpha)
print('alphanumeric count = '+ str(a_count))
print('line count = '+str(l_count))

w_count = len(new_a)
print('word count = '+str(w_count))

a_withoutstopwords = [e for e in new_a if e not in stopwords]

# With hashing on, count hash values instead of the words themselves.
if b in ['y','Y']:
    a_withoutstopwords = [fhash(e,M) for e in a_withoutstopwords]

# [key, count] pairs in order of first appearance.
c = []
for e in a_withoutstopwords:
    if e not in c:
        c += [e]
BoW = [[e, a_withoutstopwords.count(e)] for e in c]

print('BoW = '+str(BoW))

# 6330464021 (14.15) 291 (2021-03-21 22:15)
def fhash(w,M) :
    """Return the base-37 polynomial hash of `w` modulo M."""
    total = 0
    for position, ch in enumerate(w):
        total += ord(ch)*(37**position)
    return total % M

# Stop words, whitespace-separated.
sw = []
with open("stopwords.txt", "r") as stopwords:
    for e in stopwords:
        sw += e.split()

fn = input("File name = ")
# Read the file once; it used to be reopened for every statistic and once
# more at the end without being used.
with open(fn, "r") as file_name:
    lines = file_name.readlines()

ans = input("Use feature hashing ? (y,Y,n,N) ")
# Compare whole answers; the substring test against "YyNn" also accepted
# "" and multi-letter inputs.
while ans not in ["Y", "y", "N", "n"]:
    print("Try again.")
    ans = input("Use feature hashing ? (y,Y,n,N) ")
if ans in "Yy":
    M = int(input("M = "))
print("-------------------")

b = 0    # characters, newlines excluded
c = 0    # ASCII letters and digits
d = 0    # lines
h2 = []  # words: maximal alphanumeric runs
for e in lines:
    d += 1
    g = ''
    for i in e:
        if i != '\n':
            b += 1
        # Three explicit ranges: the old 'A'..'z' range also matched
        # [ \ ] ^ _ and the backtick.
        if 'A' <= i <= 'Z' or 'a' <= i <= 'z' or '0' <= i <= '9':
            c += 1
            g += i
        else:
            if g != '':
                h2.append(g)
            g = ''
    # Flush a word that ends an unterminated last line.
    if g != '':
        h2.append(g)

print('char count = '+str(b))
print('alphanumeric count = '+str(c))
print('line count = '+str(d))
print('word count = '+str(len(h2)))

h3 = [e.lower() for e in h2]
h4 = [e for e in h3 if e not in sw]
if ans in 'Yy':
    h4 = [fhash(e, int(M)) for e in h4]
h4.sort()

# Equal keys are adjacent after sorting. The no-hash branch used to compare
# each item with the previous one (h4[e-1]) and so produced wrong counts;
# an empty list also raised IndexError on h4[-1].
j = []
for key in h4:
    if j and j[-1][0] == key:
        j[-1][1] += 1
    else:
        j.append([key, 1])
print('BoW =', j)



# 6330465621 (30.00) 292 (2021-03-21 22:58)
file_name = input("File name = ")
# Keep asking until one of the four accepted answers is given.
while True :
    hash_use = input("Use feature hashing ? (y,Y,n,N) ")
    if hash_use in ('y', 'Y', 'n', 'N') :
        break
    print("Try again.")
if hash_use in ('y', 'Y') :
    M = int(input("M = "))
def char_count(file_name) :
    """Return the number of characters in the file, newlines excluded."""
    with open(file_name, "r") as text :
        return sum(1 for e in text for a in e if a != "\n")
# Separator line (19 dashes), then the character count of the chosen file.
print('-'*19)
print("char count =", char_count(file_name))

def alpha_count(file_name) :
    """Return how many characters in the file are ASCII letters or digits."""
    total = 0
    with open(file_name, "r") as text :
        for e in text :
            total += sum(
                1 for a in e
                if "A" <= a <= "Z" or "a" <= a <= "z" or "0" <= a <= "9"
            )
    return total
# Report the number of ASCII letters and digits in the chosen file.
print("alphanumeric count =", alpha_count(file_name))
def line_count(file_name) :
    """Return the number of lines in the file."""
    with open(file_name, "r") as text :
        return sum(1 for _ in text)
# Report the number of lines; print() adds its own separator after the
# label's trailing space.
print("line count = ", line_count(file_name))
def word(file_name) :
    """Return the words of the file in reading order, case preserved.

    A word is a maximal run of ASCII letters and digits; every other
    character acts as a separator.
    """
    with open(file_name, "r") as text :
        spaced = ''.join(
            a if ("A" <= a <= "Z" or "a" <= a <= "z" or "0" <= a <= "9") else ' '
            for e in text for a in e
        )
    return spaced.split()
# Report how many words word() extracts from the chosen file.
print("word count = ", len(word(file_name)))
def n(word,words) :
    """Return how many items of `words` are equal to `word`."""
    return sum(1 for c in words if word == c)
def Bow(file_name) :
    """Return the bag of words of the file as [word, count] pairs.

    Words come from word(), lower-cased. Words listed in stopwords.txt are
    left out, and pairs appear in order of first occurrence.
    """
    words = [item.lower() for item in word(file_name)]
    with open("stopwords.txt", "r") as stop_text :
        stop_words = [i for e in stop_text for i in e.split()]

    words_finish = []
    for candidate in words :
        if candidate not in stop_words and candidate not in words_finish :
            words_finish.append(candidate)
    return [[c, n(c, words)] for c in words_finish]
def fhash(file_name,M) :
    """Return the hashed bag-of-words of the file, sorted by hash value.

    Each word from Bow(file_name) is hashed as
    sum(ord(c) * 37**i for position i) % M, and the counts of words that
    land on the same hash value are added together.

    file_name: path of the text file to analyse.
    M: modulus, i.e. the number of hash buckets.
    Returns a list of [hash, count] pairs in increasing hash order.
    """
    totals = {}
    for text, count in Bow(file_name) :
        sum_ord = 0
        for i, ch in enumerate(text) :
            sum_ord += ord(ch)*(37**i)
        bucket = sum_ord % M
        # Accumulating per bucket also covers a single entry, which the
        # pairwise merge used before silently dropped.
        totals[bucket] = totals.get(bucket, 0) + count
    return [[bucket, totals[bucket]] for bucket in sorted(totals)]

# Print the bag-of-words, hashed when feature hashing was requested.
# print() supplies the single space after the label.
if hash_use in ['y','Y'] :
    print("BoW =", fhash(file_name,M))
elif hash_use in ['n','N'] :
    print("BoW =", Bow(file_name))



# 6330466221 (30.00) 293 (2021-03-22 21:36)

# Characters that may appear inside a word: ASCII letters and digits
# (nums also holds the integers 0-9, which never match a character).
alphabets = list('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
nums = list('0123456789') + list(range(10))
file_name = input('File name = ')
# Ask until a yes/no answer is given; the answer is stored upper-cased.
check = input('Use feature hashing ? (y,Y,n,N) ').upper()
while check not in ('Y', 'N'):
    print('Try again.')
    check = input('Use feature hashing ? (y,Y,n,N) ').upper()
# The number of buckets is only asked for when hashing is used.
if check == 'Y':
    M = input('M = ')
print('-------------------')
# Base of the polynomial hash used by fhash().
G = 37
def fhash(w,M):
    """Return the feature hash of w: sum(ord(c) * G**i) modulo int(M).

    w: the word to hash; i is the zero-based position of each character.
    M: number of buckets, as an int or a numeric string.
    Uses the module-level base G.
    """
    weighted = sum(ord(str(ch)) * (G ** pos) for pos, ch in enumerate(w))
    return weighted % int(M)
# Read the stop words and the input file. Every character that is not a
# letter or digit becomes a space so that split() yields the words.
charcount = 0
linecount = 0
wordss = ''
words = []
stopwords = []
# The with-blocks close both files even when reading raises.
with open('stopwords.txt','r') as read:
    for line in read:
        stopwords += [token.lower() for token in line.split()]
cleaned = []
with open(file_name,'r') as file:
    for line in file:
        wordss += line
        linecount += 1
        for i in line:
            if i not in alphabets and i not in nums:
                cleaned.append(' ')
            else:
                cleaned.append(i)
word = ''.join(cleaned)
words = [token.lower() for token in word.split()]
# Build BOW, the list of [key, count] pairs that gets printed.
Bow = []
BOW = []
if check == 'Y':
    # One counter per bucket; each word is hashed exactly once instead of
    # once per bucket. Empty buckets are left out, and the result is in
    # increasing bucket order.
    bucket_total = int(M)
    if bucket_total > 0:
        bucket_counts = [0] * bucket_total
        for token in words:
            if token not in stopwords:
                bucket_counts[fhash(token, M)] += 1
        BOW = [[bucket, count]
               for bucket, count in enumerate(bucket_counts) if count > 0]
elif check == 'N':
    # Words keep the order in which they first appear; positions maps each
    # word to its index in BOW.
    positions = {}
    for token in words:
        if token not in stopwords:
            if token not in positions:
                positions[token] = len(BOW)
                Bow.append(token)
                BOW.append([token, 1])
            else:
                BOW[positions[token]][1] += 1
# Count every character except the newlines themselves. Subtracting
# linecount - 1 was only right for files without a final newline.
charcount = len(wordss) - wordss.count('\n')
wordscount = len(words)
print('char count =', charcount)
print('alphanumeric count =', len(''.join(words)))
print('line count =', linecount)
print('word count =', wordscount)
print('BoW =',BOW)



    

# 6330467921 (29.00) 294 (2021-03-21 23:06)

def fhash(w,M):
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo M."""
    total = 0
    weight = 1
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % M
def count(word, wordslist):
    """Return how many items of wordslist are equal to word."""
    return sum(1 for item in wordslist if item == word)

file_name = input('File name = ')
yn = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against the four allowed answers. A substring test on 'yYnN'
# would also accept '' and multi-letter answers such as 'yY'.
while yn not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    yn = input('Use feature hashing ? (y,Y,n,N) ')
# The number of buckets is only needed when hashing is used.
if yn in ('y', 'Y'):
    M = int(input('M = '))

print('-------------------')
    
# Load the stop words, lower-cased. The with-block closes the file even
# when reading raises.
stopwords = []
with open("stopwords.txt","r") as stopfile:
    for line in stopfile:
        stopwords += line.lower().split()

# Scan the input file once, gathering the four statistics and the words.
abnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
cc = 0        # characters, newlines excluded
abc123 = 0    # letters and digits
lc = 0        # lines
words = []
# The with-block closes the file even when reading raises.
with open(file_name,"r") as file:
    for line in file:
        lc += 1
        cleaned = []
        for a in line:
            if a != '\n':
                cc += 1
            if a in abnum:
                cleaned.append(a)
                abc123 += 1
            else:
                # Anything else separates words.
                cleaned.append(' ')
        words += ''.join(cleaned).split()
print('char count =',cc)
print('alphanumeric count =',abc123)
print('line count =',lc)
print('word count =',len(words))

# Compare words and stop words case-insensitively.
words = [token.lower() for token in words]
stopwords = [token.lower() for token in stopwords]

# Drop the stop words, keeping the remaining words in order.
cut_words = [token for token in words if token not in stopwords]

# With feature hashing each word is replaced by its hash value.
if yn in ('y', 'Y'):
    cut_words = [fhash(token, M) for token in cut_words]

# Unique keys in order of first appearance, then [key, count] pairs sorted.
bow = []
for e in cut_words:
    if e not in bow:
        bow.append(e)
bow = sorted([key, count(key, cut_words)] for key in bow)

print('BoW =',bow)
# 6330468521 (27.60) 295 (2021-03-22 16:42)
def char_count(file_name) :
    """Return the number of characters in the file, newlines excluded.

    file_name: path of the text file to read.
    """
    total = 0
    # The with-block closes the file even when reading raises.
    with open(file_name, "r") as fin :
        for line in fin :
            # Only the line terminator is excluded; strip() would also
            # drop leading and trailing blanks, which are characters.
            total += len(line.rstrip("\n"))
    return total
def alphanumeric_count(file_name) :
    """Return how many characters of the file are ASCII letters or digits.

    file_name: path of the text file to read.
    """
    total = 0
    # The with-block closes the file even when reading raises.
    with open(file_name, "r") as fin :
        for line in fin :
            for e in line :
                if "a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9" :
                    total += 1
    return total
def line_count(file_name) :
    """Return the number of lines in the file (0 for an empty file).

    file_name: path of the text file to read.
    """
    # Every line yielded by the file iterator is non-empty, so each counts.
    with open(file_name, "r") as fin :
        return sum(1 for _ in fin)
def word_count(file_name) :
    """Return the number of words in the file.

    A word is a maximal run of ASCII letters and digits; any other
    character, including the end of a line, separates words.

    file_name: path of the text file to read.
    """
    total = 0
    # The with-block closes the file even when reading raises.
    with open(file_name, "r") as fin :
        for line in fin :
            # Words are counted per line so the last word of one line is
            # never glued to the first word of the next.
            cleaned = "".join(
                e if ("a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9")
                else " "
                for e in line
            )
            total += len(cleaned.split())
    return total
def BOW_list(file_name) :
    """Return the lower-cased words of the file without the stop words.

    Words are maximal runs of ASCII letters and digits. Stop words are the
    whitespace-separated entries of stopwords.txt, compared as written.
    The remaining words keep their order, duplicates included.

    file_name: path of the text file to read.
    """
    words_of_BOW = []
    # The with-blocks close both files even when reading raises.
    with open(file_name, "r") as fin :
        for line in fin :
            cleaned = "".join(
                e if ("a" <= e <= "z" or "A" <= e <= "Z" or "0" <= e <= "9")
                else " "
                for e in line
            )
            words_of_BOW += cleaned.lower().split()
    with open("stopwords.txt", "r") as fin1 :
        stopwords = set(fin1.read().split())
    # One filtering pass replaces the repeated pop() scans.
    return [w for w in words_of_BOW if w not in stopwords]
def fhash(w, M) :
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo int(M)."""
    base = 37
    weighted = sum(ord(ch) * (base ** pos) for pos, ch in enumerate(w))
    return weighted % int(M)
def Bow_yY(p) :
    """Return the hashed bag-of-words of the word list p.

    Every word is hashed with fhash() using the module-level M, and the
    result is a list of [hash, count] pairs sorted by hash value.
    """
    counts = {}
    for e in p :
        y = fhash(e, int(M))
        # One pass with a dict replaces the nested counting loops.
        counts[y] = counts.get(y, 0) + 1
    return [[y, counts[y]] for y in sorted(counts)]
def BOW_nN(v) :
    """Return the bag-of-words of the word list v.

    The result is a list of [word, count] pairs in the order in which each
    word first appears in v. Words are expected to be strings.
    """
    # A dict keeps insertion order, so a single pass yields both the
    # first-seen order and the counts.
    counts = {}
    for e in v :
        counts[e] = counts.get(e, 0) + 1
    return [[e, total] for e, total in counts.items()]
# Read the options, then print the report; only the final BoW line
# depends on whether feature hashing was requested.
file_name = input("File name = ")
u = input("Use feature hashing ? (y,Y,n,N) ")
while u not in ("y", "Y", "n", "N") :
    print("Try again.")
    u = input("Use feature hashing ? (y,Y,n,N) ")
if u in ("y", "Y") :
    M = input("M = ")
print("-------------------")
print("char count = "+str(char_count(file_name)))
print("alphanumeric count = "+str(alphanumeric_count(file_name)))
print("line count = "+str(line_count(file_name)))
print("word count = "+str(word_count(file_name)))
if u in ("y", "Y") :
    p = BOW_list(file_name)
    print("BoW = "+str(Bow_yY(p)))
else :
    v = BOW_list(file_name)
    print("BoW = "+str(BOW_nN(v)))
# 6330469121 (16.00) 296 (2021-03-22 23:58)
def fash(w,M):
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo int(M)."""
    modulus = int(M)
    weighted = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return weighted % modulus
def comprog(q):
    """Print the character, alphanumeric, line and word counts of file q.

    Characters exclude newlines. A word is a maximal run of ASCII letters
    and digits; any other character, including the end of a line,
    separates words.

    q: path of the text file to read.
    """
    alnum = 'qwertyuiopasdfghjklzxcvbnm'
    alnum += alnum.upper() + '1234567890'
    char_total = 0
    alnum_total = 0
    line_total = 0
    word_total = 0
    # The with-block closes the file even when reading raises.
    with open(q, "r") as source:
        for line in source:
            # Counting lines as they are read stays right whether or not
            # the file ends with a newline.
            line_total += 1
            char_total += len(line) - line.count('\n')
            cleaned = []
            for i in line:
                if i in alnum:
                    alnum_total += 1
                    cleaned.append(i)
                else:
                    # Replace rather than drop, so "a,b" and words on
                    # consecutive lines stay separate.
                    cleaned.append(' ')
            word_total += len(''.join(cleaned).split())

    print('char count =', char_total)
    print('alphanumeric count =', alnum_total)
    print('line count =', line_total)
    print('word count =', word_total)
def count( data, element ):
    """Return how many items of data are equal to element."""
    return sum(1 for item in data if item == element)
def bow(file,m):
    """Return the bag-of-words of a file as sorted [key, count] pairs.

    Words are lower-cased maximal runs of letters and digits; words listed
    in stopwords.txt are skipped.

    file: path of the text file to read.
    m: 'n'/'N' to key the pairs by word, 'y'/'Y' to key them by feature
       hash. In hashing mode the modulus is read with input('M = ').
    Returns the sorted pairs, [] when no word remains, or None when m is
    neither yes nor no.
    """
    # The with-blocks close both files even when reading raises.
    with open('stopwords.txt',"r") as stopword:
        stop = set(stopword.read().split())
    alnum = 'qwertyuiopasdfghjklzxcvbnm0987654321'
    kept = []
    with open(file,"r") as source:
        for line in source:
            # Non-alphanumeric characters become separators, so "a,b" is
            # two words and a token such as "--" yields no empty word.
            cleaned = ''.join(ch if ch in alnum else ' ' for ch in line.lower())
            kept += [w for w in cleaned.split() if w not in stop]

    mode = m.lower()
    if mode == 'n':
        keys = kept
    elif mode == 'y':
        M=input('M = ')
        keys = [fash(w, M) for w in kept]
    else:
        return None

    # Counting with a dict also handles an empty word list, which used to
    # raise IndexError.
    tally = {}
    for key in keys:
        tally[key] = tally.get(key, 0) + 1
    return [[key, tally[key]] for key in sorted(tally)]

# Print the four statistics, ask for the hashing mode, then print the
# bag-of-words with its label like the statistics lines above it.
file_name=input('File name = ')
comprog(file_name)
m=input('Use feature hashing ? (y,Y,n,N) ')
while m.lower() not in ('n', 'y'):
    print('Try again.')
    m=input('Use feature hashing ? (y,Y,n,N) ')
print('BoW =', bow(file_name,m))


# 6330470721 (30.00) 297 (2021-03-21 18:47)
#Prog-08: Bag-of-words
#6330470721 (30.00) Name Wongsatorn Suwisuthikasame
# Path of the text file to analyse and the raw answer to the hashing
# question; the answer is validated by the loop further down the script.
file_name = input("File name = ")
a = input("Use feature hashing ? (y,Y,n,N) ")
def fhash(w,M):
    """Print the hashed bag-of-words of the word list w.

    Each word is hashed as sum(ord(c) * 37**i) % int(M); the printed list
    holds [hash, count] pairs in increasing hash order.

    w: list of words.
    M: number of buckets, as an int or a numeric string.
    Returns None; the result is written to standard output.
    """
    G = 37
    counts = {}
    for text in w:
        ans = 0
        for i, ch in enumerate(text):
            ans += ord(ch)*G**i
        x = ans%(int(M))
        # Tally per bucket instead of re-sorting and re-counting the list.
        counts[x] = counts.get(x, 0) + 1
    res_count = [[x, counts[x]] for x in sorted(counts)]

    print("BoW =",res_count)
def count_char(file_name):
    """Print the number of characters in the file, newlines excluded."""
    char_num = 0
    # The with-block closes the file even when reading raises.
    with open(file_name,"r") as v:
        for line in v:
            char_num += len(line) - line.count("\n")
    print("char count =",char_num)
def count_alpha(file_name):
    """Print how many characters of the file are letters or digits.

    Each line is lower-cased first, so a character counts when its
    lower-case form lies in a-z or 0-9.
    """
    total = 0
    # The with-block closes the file even when reading raises.
    with open(file_name,"r") as v:
        for line in v:
            for ch in line.lower():
                if "a"<=ch<="z" or "0"<=ch<="9":
                    total += 1
    print('alphanumeric count =',total)
def count_line(file_name):
    """Print the number of lines in the file, blank lines included."""
    # The with-block closes the file even when reading raises.
    with open(file_name,'r') as f:
        c = sum(1 for _ in f)
    print('line count =',c)
def count_word(file_name):
    """Print the number of words in the file.

    A word is a maximal run of characters that are a-z or 0-9 after
    lower-casing; everything else separates words.
    """
    total = 0
    # The with-block closes the file even when reading raises.
    with open(file_name,"r") as f:
        for line in f:
            cleaned = ''.join(
                ch if ("a"<=ch<="z" or "0"<=ch<="9") else ' '
                for ch in line.lower()
            )
            total += len(cleaned.split())
    print('word count =',total)
    

# Repeat the question until the answer is one of y, Y, n, N.
while a not in ['Y','y','N','n']:
    print("Try again.")
    a = input("Use feature hashing ? (y,Y,n,N) ")
if a in ['Y','y']:
    m = input("M = ")

# The statistics are the same for both modes.
print('-------------------')
count_char(file_name)
count_alpha(file_name)
count_line(file_name)
count_word(file_name)

# Stop words, lower-cased.
with open("stopwords.txt","r") as stop:
    stop_all = stop.read().lower().split()

# Words of the input file: lower-cased runs of a-z and 0-9.
words = []
with open(file_name,"r") as f:
    for line in f:
        cleaned = ''.join(
            ch if ("a"<=ch<="z" or "0"<=ch<="9") else ' '
            for ch in line.lower()
        )
        words += cleaned.split()

# Filter once after reading. This also keeps words_cut defined when the
# file has no lines at all.
words_cut = [token for token in words if token not in stop_all]

if a in ['Y','y']:
    # fhash() prints the hashed bag-of-words itself.
    fhash(words_cut,m)
else:
    tally = {}
    for token in words_cut:
        tally[token] = tally.get(token, 0) + 1
    res_count = sorted([token, total] for token, total in tally.items())
    print('BoW =',res_count)

   


# 6330471321 (14.95) 298 (2021-03-22 10:19)
def fhash(w, M):
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo int(M)."""
    total = 0
    weight = 1
    for ch in w:
        total += ord(ch) * weight
        weight *= 37
    return total % int(M)

def acount(x):
    """Return how many characters of x are alphabetic or digits."""
    return sum(1 for ch in x if ch.isalpha() or ch.isdigit())

def wcount(x):
    """Return the words of file x in order of appearance.

    A word is a maximal run of alphabetic or digit characters; every other
    character, newlines included, separates words. Case is kept.

    x: path of the text file to read.
    """
    # The with-block closes the file even when reading raises.
    with open(x, 'r') as source:
        text = source.read()
    # Newlines are turned into separators like any other non-word
    # character; deleting them glued the words around each line break.
    cleaned = ''.join(
        ch if (ch.isalpha() or ch.isdigit()) else ' ' for ch in text
    )
    return cleaned.split()

def BoW(x, h, M = 0):
    """Return the bag-of-words of the word list x as sorted [key, count] pairs.

    Words are lower-cased and those listed in stopwords.txt (also compared
    lower-cased) are dropped. The list x itself is not modified.

    x: list of words.
    h: true to key the pairs by fhash(word, M), false to key them by word.
    M: number of hash buckets; only used when h is true.
    """
    # The with-block closes the file even when reading raises.
    with open("stopwords.txt", "r") as source:
        stopwords = set(source.read().lower().split())

    # Filtering into a new list checks every word, the last one included,
    # and copes with an empty list.
    kept = [w.lower() for w in x]
    kept = [w for w in kept if w not in stopwords]

    if h:
        kept = [fhash(w, M) for w in kept]

    tally = {}
    for key in kept:
        tally[key] = tally.get(key, 0) + 1

    return sorted([key, total] for key, total in tally.items())
                


file_name = input('File name = ')
# Accept exactly y, Y, n or N. A substring test such as y in 'yY' is also
# true for the empty string.
y = input('Use feature hashing ? (y,Y,n,N) ')
while y not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    y = input('Use feature hashing ? (y,Y,n,N) ')
if y in ('y', 'Y'):
    M = input('M = ')

# Read the file once; the with-block closes it.
with open(file_name, "r") as ofn:
    content = ofn.read()

print('-------------------')
# Newlines are line terminators, not characters of the text.
print('char count =', len(content) - content.count('\n'))
a_count = acount(content)
print('alphanumeric count =', a_count)
line = len(content.splitlines())
print('line count =', line)
print('word count =', len(wcount(file_name)))
if y in ('y', 'Y'):
    print('BoW =', BoW(wcount(file_name), True, M))
else:
    print('BoW =', BoW(wcount(file_name), False))

# 6330472021 (15.00) 299 (2021-03-22 23:25)
# Read the file name and a validated yes/no answer; the number of buckets
# m is only asked for when hashing is used.
file_name = input('File name = ')
use_fhash_list = ['y', 'Y', 'n', 'N']
while True:
    use_fhash = input('Use feature hashing ? (y,Y,n,N) ')
    if use_fhash in use_fhash_list:
        break
    print('Try again.')
if use_fhash in ('y', 'Y'):
    m = int(input('M = '))
def char_count():
    """Return the number of characters in file_name, newlines excluded.

    Reads the module-level file_name.
    """
    c = 0
    # The with-block closes the file even when reading raises.
    with open(file_name,'r') as fn:
        for line in fn:
            # Subtract only a newline that is really there; the last line
            # may not end with one.
            c += len(line) - line.count('\n')
    return c
# Separator line (19 dashes), then the first statistic of the report.
print('-'*19)
print('char count =',char_count())
def alphanumeric_count():
    """Return how many characters of file_name are ASCII letters or digits.

    Reads the module-level file_name.
    """
    c = 0
    # The with-block closes the file even when reading raises.
    with open(file_name,'r') as fn:
        for line in fn:
            for ch in line:
                if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9':
                    c += 1
    return c
# Second statistic of the report: letters and digits only.
print('alphanumeric count =',alphanumeric_count())
def line_count():
    """Return the number of lines in file_name (0 for an empty file).

    Reads the module-level file_name.
    """
    # The with-block closes the file even when reading raises.
    with open(file_name,'r') as fn:
        return sum(1 for _ in fn)
# Third statistic of the report.
print('line count =',line_count())
def word_count():
    """Return the number of words in file_name.

    A word is a maximal run of ASCII letters and digits; every other
    character, newlines included, separates words. Reads the module-level
    file_name.
    """
    c = 0
    # The with-block closes the file even when reading raises.
    with open(file_name,'r') as fn:
        for line in fn:
            cleaned = ''.join(
                ch if ('a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9')
                else ' '
                for ch in line
            )
            c += len(cleaned.split())
    return c
# Fourth statistic of the report.
print('word count =',word_count())

# Collect the lower-cased words of the input file in w. Characters other
# than ASCII letters and digits become separators.
pieces = []
# The with-blocks close both files even when reading raises.
with open(file_name,'r') as fn:
    for line in fn:
        for ch in line:
            if 'a'<= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9':
                pieces.append(ch)
            else:
                pieces.append(' ')
word = ''.join(pieces).lower()
w=word.split()

# Collect the stop words in stw the same way (case is kept as written).
pieces = []
with open('stopwords.txt','r') as fx:
    for linee in fx:
        for ch in linee:
            if 'a'<= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9':
                pieces.append(ch)
            else:
                pieces.append(' ')
stop_word = ''.join(pieces)
stw = stop_word.split()

# s holds the words that are not stop words, in order.
s = [e for e in w if e not in stw]

    
def fhash(word,m):
    """Return the feature hash of word: sum(ord(word[i]) * 37**i) modulo m."""
    total = 0
    weight = 1
    for ch in word:
        total += ord(ch) * weight
        weight *= 37
    return total % m

        

        
    


    
    
    
    
    
    
    
# 6330474221 (24.90) 300 (2021-03-22 20:52)
# Characters allowed inside a word (the text is lower-cased before use).
alpnum = 'abcdefghijklmnopqrstuvwxyz0123456789'

# Path of the input file and the lower-cased answer to the hashing
# question; M stays '' unless hashing is requested further down.
file = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ').lower()
M = ''
def Bag_of_words(words):
    """Return the bag-of-words of words as a sorted list of [word, count].

    words: iterable of hashable items, typically strings.
    """
    # One pass with a dict replaces the scan of the word list per word.
    tally = {}
    for word in words:
        tally[word] = tally.get(word, 0) + 1

    return sorted([word, total] for word, total in tally.items())
def fhash_BOW(BoW,M):
    """Return the hashed version of a bag-of-words.

    BoW: iterable of [word, count] pairs.
    M: number of buckets, passed on to fhash().
    Counts of words with the same hash are added; the result is a list of
    [hash, count] pairs sorted by hash.
    """
    totals = {}
    for word, count in BoW:
        bucket = fhash(word, M)
        if bucket in totals:
            totals[bucket] += count
        else:
            totals[bucket] = count

    return sorted([bucket, total] for bucket, total in totals.items())
def fhash(word, M):
    """Return the feature hash of word: sum(ord(c) * 37**i) modulo int(M)."""
    total = 0
    weight = 1
    for char in word:
        total += ord(char) * weight
        weight *= 37
    return total % int(M)


# Repeat the question until the (lower-cased) answer is y or n. Prompts
# are written like the other prompts: 'Try again.' and 'M = '.
while x not in ['n','y']:
    print('Try again.')
    x = input('Use feature hashing ? (y,Y,n,N) ').lower()

if x == 'y':
    M = input('M = ')
    
# Load the stop words, lower-cased once. The with-block closes the file
# even when reading raises.
with open('stopwords.txt','r') as sFile:
    stop_words = [token.lower() for token in sFile.read().split()]

# Scan the input file once, gathering the statistics and the clean words.
charCount = 0
alpCount = 0
lineCount = 0
wordCount = 0

words = []
clean_words = []

# The with-block closes the file even when reading raises.
with open(file,'r') as wFile:
    for line in wFile:
        lineCount+=1
        # Only the newline is excluded; blanks at the line ends count.
        charCount+= len(line.rstrip('\n'))
        lowered = line.lower()
        words += lowered.split()
        # Characters outside alpnum separate words, so "a,b" is two words
        # and a token made only of punctuation yields no empty word.
        cleaned = ''.join(char if char in alpnum else ' ' for char in lowered)
        clean_words += cleaned.split()

alpCount += sum(len(word) for word in clean_words)
wordCount += len(clean_words)

clean_word_stopword = [word for word in clean_words if word not in stop_words]

BoW = Bag_of_words(clean_word_stopword)

# Hash the bag-of-words first so an error surfaces before the report.
if x == 'y':
    BoW_hash = fhash_BOW(BoW,M)

print('-------------------')
print('char count =',charCount)
print('alphanumeric count =',alpCount)
print('line count =',lineCount)
print('word count =',wordCount)

# print() supplies the single space after each label, so the BoW label is
# written like the four labels above it.
if x =='y':
    print('BoW =', BoW_hash)
else:
    print('BoW =', BoW)
    

             
    
# 6330475921 (26.00) 301 (2021-03-22 15:57)
# Read the file name and a validated yes/no answer; the number of buckets
# M is only asked for when hashing is used.
file_name= input("File name = ")
fh= input("Use feature hashing ? (y,Y,n,N) ")
while fh not in ("y", "Y", "n", "N"):
    print("Try again.")
    fh = input("Use feature hashing ? (y,Y,n,N) ")
if fh in ("y", "Y"):
    M = int(input("M = " ))
print('-------------------')
# Read both files completely; the with-blocks close them right away.
with open(file_name,"r") as sample:
    raw_text = sample.read()
with open("stopwords.txt","r") as stop_f:
    stop = stop_f.read()
text = raw_text.lower()

alnum_chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
newline_total = raw_text.count('\n')
# Characters are everything except the newlines.
charc = len(raw_text) - newline_total
alphac = sum(1 for ch in text if ch in alnum_chars)
# A final newline ends the last line; it does not start another one.
linec = newline_total
if raw_text and not raw_text.endswith('\n'):
    linec += 1
wordc=0

# Words: runs of lower-case letters and digits.
text2 = ''.join(ch if ch in alnum_chars else ' ' for ch in text)
txtlist=text2.split()

# Stop words: whitespace-separated entries of stopwords.txt.
stop2 = stop.replace("\n", " ")
stplist= stop2.split()

# bow holds [key, count] pairs and motf the keys in the same order, so the
# index of a key in motf is the index of its pair in bow.
bow = []
motf = []
for token in txtlist:
    if token in stplist:
        continue
    if fh == 'n' or fh == 'N':
        # Without hashing the word itself is the key.
        key = token
    else:
        # With hashing the key is sum(ord(c) * 37**j) modulo M.
        key = sum(ord(ch) * (37 ** j) for j, ch in enumerate(token)) % M
    if key not in motf:
        motf.append(key)
        bow.append([key, 1])
    else:
        ind = motf.index(key)
        bow[ind] = [key, bow[ind][1] + 1]
bow.sort()
wordc = len(txtlist)
sample.close()
stop_f.close()
# Labels spelled and spaced uniformly; print() supplies the single space.
print('char count =', charc)
print('alphanumeric count =', alphac)
print('line count =', linec)
print('word count =', wordc)
print('BoW =', bow)

            
            
        
        
    
    
    

    


# 6330476521 (20.06) 302 (2021-03-22 19:54)
def fhash(w,M):
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo M."""
    weighted = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return weighted % M
def get_unique(words):
    """Return the distinct items of words in order of first appearance."""
    seen_in_order = []
    for item in words:
        if item in seen_in_order:
            continue
        seen_in_order.append(item)
    return seen_in_order
def count_words(word,g_word):
    """Return how many items of g_word are equal to word."""
    return sum(1 for item in g_word if word == item)
#------------------------------------------------
# Load the whitespace-separated stop words. The with-block closes the
# file even when reading raises.
with open('stopwords.txt',"r") as fn:
    stopwords = fn.read().split()
#------------------------------------------------
file_name = input('File name = ')
words = ''   # whole text, lower-cased
lines = 0    # number of lines
cw = 0       # characters, newlines excluded
# The with-block closes the file even when reading raises.
with open(file_name,"r") as fn1:
    for line in fn1:
        words += line.lower()
        lines += 1
        cw += len(line) - line.count('\n')
# Every character other than a-z and 0-9 separates words; dropping
# punctuation instead would glue "a,b" into one word.
s_words = ''.join(
    ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' ' for ch in words
)
s_words = s_words.split()
u_words = get_unique(s_words)
# Repeat the question until the answer is y, Y, n or N; the number of
# buckets M is only asked for when hashing is used.
f_hash = input('Use feature hashing ? (y,Y,n,N) ')
while f_hash not in ['y','Y','n','N']:
    print('Try again.')
    f_hash = input('Use feature hashing ? (y,Y,n,N) ')
if f_hash in ['y','Y']:
    M = int(input('M = '))
print('-------------------')
#------------------------------------------------
# Letters and digits are exactly the characters that make up the words.
alphanumaric = sum(len(e) for e in s_words)
BoW = []
if f_hash in ['n','N']:
    # One [word, count] pair per distinct word that is not a stop word.
    for w in u_words:
        if w not in stopwords:
            BoW.append([w,count_words(w,s_words)])
elif f_hash in ['y','Y']:
    # Hash the remaining words, then count per distinct hash value.
    f_hashed = [fhash(w,M) for w in s_words if w not in stopwords]
    u_hashed = get_unique(f_hashed)
    for n in u_hashed:
        BoW.append([n,count_words(n,f_hashed)])
BoW.sort()
print('char count =',cw)
print('alphanumeric count =',alphanumaric)
print('line count =',lines)
print('word count =',len(s_words))
print('BoW =',BoW)
# 6330477121 (17.15) 303 (2021-03-22 23:55)

def Input_data():
    """Read the options from the user and print the separator line.

    Returns (file name, use_hashing, M): use_hashing is True for y/Y and
    False for n/N; M is the number of buckets, or -1 without hashing.
    """
    File_name_input = input('File name = ')

    answer = input('Use feature hashing ? (y,Y,n,N) ')
    while answer not in ['Y', 'y', 'N', 'n']:
        print('Try again.')
        answer = input('Use feature hashing ? (y,Y,n,N) ')

    use_hashing = answer in ['y', 'Y']
    M = int(input('M = ')) if use_hashing else -1

    print('-------------------')

    return File_name_input, use_hashing, M
def TikTok(w, M):
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo M."""
    total = 0
    weight = 1
    for ch in w:
        total += weight * ord(ch)
        weight *= 37
    return total % M
def Words_Func(File_name_input):
    """Scan a text file and return its statistics and words.

    File_name_input: path of the text file to read.
    Returns (char_count, alnum_count, line_count, words): characters
    exclude newlines, alnum_count counts ASCII letters and digits, and
    words are the maximal runs of such characters with their case kept.
    """
    Lenght1 = 0    # characters, newlines excluded
    Lenght2 = 0    # letters and digits
    Num_Line = 0
    words = []
    # The with-block closes the file even when reading raises.
    with open(File_name_input, 'r') as wordsFile:
        for line in wordsFile:
            Num_Line += 1
            Lenght1 += len(line) - line.count('\n')
            word = ''
            for c in line:
                if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or ('0' <= c <= '9'):
                    Lenght2 += 1
                    word += c
                else:
                    if word:
                        words.append(word)
                    word = ''
            # Flush a word that runs to the end of the line; without this
            # the final word of a file lacking a last newline was lost.
            if word:
                words.append(word)

    return Lenght1 , Lenght2 , Num_Line, words

def StopWords_Func():
    """Return the distinct lower-cased stop words listed in stopwords.txt.

    Words are whitespace-separated and returned in order of first
    appearance.
    """
    r = []
    # Iterate over the file itself. str(file) is only the description of
    # the file object, so looping over it yielded its characters.
    with open('stopwords.txt', 'r') as File_Of_stopWords:
        for line in File_Of_stopWords:
            for i in line.split():
                i = i.lower()
                if i not in r:
                    r.append(i)

    return r


def BoW_Ans(words, stopWords, BoW_num, M):
    """Return the bag-of-words of words as a list of [key, count] pairs.

    words: list of words; each is lower-cased before use.
    stopWords: collection of words to skip.
    BoW_num: true to key the pairs by TikTok(word, M), false to key them
             by the word itself.
    M: number of hash buckets, used only when BoW_num is true.
    Pairs appear in the order in which each key is first met.
    """
    Ans = []
    for raw in words:
        token = raw.lower()
        if token in stopWords:
            continue

        key = TikTok(token, M) if BoW_num else token

        # Bump the existing pair, or add a new one when none matches.
        for entry in Ans:
            if entry[0] == key:
                entry[1] += 1
                break
        else:
            Ans.append([key, 1])

    return Ans


# Read the options, gather the data, then print the report.
File_name_input, BoW_num, M = Input_data()

stopWords = StopWords_Func()

Lenght1, Lenght2, Num_Line, words = Words_Func(File_name_input)

# print() already inserts one space between its arguments, so the labels
# end right after the equals sign.
print('char count =', Lenght1)
print('alphanumeric count =', Lenght2)
print('line count =', Num_Line)
print('word count =', len(words))
print('BoW =', BoW_Ans(words, stopWords, BoW_num, M))
# 6330478821 (14.70) 304 (2021-03-21 23:16)
def start():
    """Ask for the hashing mode and print the report.

    Reads the module-level results char_count, alpha_count, line_count,
    word_count and new_ans. In hashing mode it asks for M, fills the
    module-level hash_ans and hash_ans_idx, and prints hash_ans;
    otherwise it prints new_ans.
    """
    hashes = input('Use feature hashing ? (y,Y,n,N) ')
    # Loop instead of recursing, so repeated wrong answers cannot deepen
    # the call stack.
    while hashes not in ('y', 'Y', 'n', 'N'):
        print("Try again.")
        hashes = input('Use feature hashing ? (y,Y,n,N) ')

    bag = new_ans
    if hashes in ('y', 'Y'):
        M=input('M = ')
        for text, total in new_ans:
            # Polynomial hash: sum(ord(c) * 37**position) modulo M.
            result=0
            c=1
            for j in text:
                result=result+(ord(j)*c)
                c=c*37
            result=result%int(M)
            if result in hash_ans_idx:
                hash_ans[hash_ans_idx.index(result)][1]+=total
            else:
                hash_ans.append([result,total])
                hash_ans_idx.append(result)
        bag = hash_ans

    print('-------------------')
    print('char count = '+str(char_count))
    print('alphanumeric count = '+str(alpha_count))
    print('line count = '+str(line_count))
    print('word count = '+str(word_count))
    print('BoW = '+str(bag))
########################################################
filename = input("File name = ")
# Read the whole file lower-cased; the with-block closes it.
with open(filename, "r") as sample:
    read = sample.read().lower()
ans=[] #<word,number of word>
new_ans=[]
hash_ans=[]
hash_ans_idx=[]
word=[]
stopwords=[]
s=''
p=''
newline_total = read.count('\n')
# A final newline ends the last line; it does not start another one.
line_count = newline_total
if read and not read.endswith('\n'):
    line_count += 1
# Characters are everything except the newlines.
char_count = len(read) - newline_total
for i in read:
    if i in '\"\'.,:;@#!%&*()|<>%\n ':
        if s in word:
            ans[word.index(s)][1]+=1
        else:
            ans.append([s,1])
            word.append(s)
        s=''
    else:
        s=s+i
#print(ans)
word_count=0
alpha_count=0
for i in ans:
    if(i[0]!=''):
        word_count+=i[1];
    alpha_count=alpha_count+(len(i[0])*i[1])
stop = open("stopwords.txt", "r")
#print(stopword.read())
stopread=stop.read()
for i in stopread:
    if i in '\"\'.,:;@#!%&*()|<>%\n ':
        if p not in stopwords:
            stopwords.append(p)
        p=''
    else:
        p=p+i
#print(stopwords)
for i in ans:
    if i[0] not in stopwords and i[0]!='':
        new_ans.append(i)
        new_ans.sort()
#print(new_ans.sort())

start()
# 6330481621 (30.00) 305 (2021-03-22 19:51)
# Collect the run options; M is only requested when hashing is wanted and is
# kept as the raw text typed by the user.
file_name = input("File_name= ")
use = input("Use feature hashing ? (y,Y,n,N) ")
while not (use in ['y', 'Y', 'n', 'N']):
    print('Try again.')
    use = input("Use feature hashing ? (y,Y,n,N) ")
if use in ['y', 'Y']:
    M = input("M = ")
print('-------------------')
stop = open('stopwords.txt', 'r')
file = open(file_name, 'r')
cha, alpha, linecount, wordcount = 0, 0, 0, 0
stw = []
sen = ''  # text of the file with every non-alphanumeric character blanked
for line in stop:
    stw.extend(line.split())
for line in file:
    linecount += 1
    cha += len(line)
    sen += ''.join(ch if ch.isalnum() else ' ' for ch in line)
word = sen.split()  # cleaned words, original case
wordcount = len(word)
# Removes one character per line except for the last one.
cha = cha - linecount + 1
wordlow = [token.lower() for token in word]  # cleaned words, lower case
alpha = sum(
    1
    for token in word
    for ch in token
    if '0' <= ch <= '9' or 'a' <= ch.lower() <= 'z'
)
perfsen = ' '.join(wordlow)  # cleaned lower-case sentence
print('char count =', cha)
print('alphanumeric count =', alpha)
print('line count =', linecount)
print('word count =', wordcount)
#-----------------------------------
def fhash(w,M):
    """Return the polynomial hash of *w* (base 37) reduced modulo int(M)."""
    total = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w))
    return total % int(M)
#-----------------------------------
def clean(s):
    """Return a list of the items of *s* that are not punctuation marks."""
    unwanted = ['(', ')', '-', '_', '[', ']', '"', "'", ';', ':', '>', '<', '.']
    return [item for item in s if item not in unwanted]
#-----------------------------------
# Drop the stop words, then any token that is a bare punctuation mark.
nsen = [token for token in wordlow if token not in stw]
newsen = clean(nsen)
#-----------------------------------
# Count every distinct word, keeping first-seen order until the final sort.
x = []
y = []
for token in newsen:
    if token in x:
        y[x.index(token)] += 1
    else:
        x.append(token)
        y.append(1)
block = sorted([name, total] for name, total in zip(x, y))
#-----------------------------------
if use in ['y', 'Y']:
    # Same tally, keyed by the hash bucket of each word.
    o = []
    p = []
    for token in newsen:
        bucket = fhash(token, M)
        if bucket in o:
            p[o.index(bucket)] += 1
        else:
            o.append(bucket)
            p.append(1)
    q = sorted([bucket, total] for bucket, total in zip(o, p))
    print('BoW =', q)
else:
    print('BoW =', block)
stop.close()
file.close()
# 6330482221 (30.00) 306 (2021-03-21 02:40)
def hashing():
    """Prompt until the user answers y/Y or n/N; return True for yes."""
    question = "Use feature hashing ? (y,Y,n,N) "
    answer = input(question).lower()
    while answer not in ("y", "n"):
        print("Try again.")
        answer = input(question).lower()
    return answer == "y"
def do_hashing(w):
    """Prompt for M and return the sorted ``[bucket, count]`` pairs of *w*.

    Each word is hashed as sum(ord(ch) * 37**position) modulo M.
    """
    g = 37
    m = int(input("M = "))
    buckets = [
        sum(ord(ch) * (g ** pos) for pos, ch in enumerate(word)) % m
        for word in w
    ]
    distinct = []
    for bucket in buckets:
        if bucket not in distinct:
            distinct.append(bucket)
    pairs = [[bucket, buckets.count(bucket)] for bucket in distinct]
    pairs.sort()
    return pairs
 #----------------------------------------------------------------------------------#           
file_name = input("File name = ")
fin = open(file_name, "r")
check_hashing = hashing()
stopwords = open("stopwords.txt", "r")
l1 = []
l2 = []
abc = ""
# l holds every stop word, l1 the words of the sample text.
l = [token for row in stopwords for token in row.split()]

countall = 0
lines = 0
for m in fin:
    lines += 1
    countall += len(m)
    # Keep alphanumeric characters, blank out everything else.
    abc += "".join(e if e.isalnum() else " " for e in m)

w = abc.split()
l1.extend(w)
# Removes one character per line except for the last one.
countall -= lines - 1
countcn = sum(1 for j in l1 for e in j if e.isdigit() or e.isalpha())

l1 = [token.lower() for token in l1]

# Second cleaning pass: only a-z and 0-9 survive, anything else splits.
ans = ""
for token in l1:
    ans += " " + "".join(
        ch if ("a" <= ch <= "z" or "0" <= ch <= "9") else " " for ch in token
    )
ans = ans.strip().split()

# Stop words are blanked (not removed) so positions stay aligned.
ans = ["" if token in l else token for token in ans]

finalword = []
for token in ans:
    if token != "" and token not in finalword:
        finalword.append(token)

ans_new = [token for token in ans if token != ""]

counts = [ans.count(token) for token in finalword]

bow = sorted([token, total] for token, total in zip(finalword, counts))

if check_hashing:
    hh = do_hashing(ans_new)
print("-------------------")
print(f'char count = {countall}')
print(f'alphanumeric count = {countcn}')
print(f'line count = {lines}')
print(f'word count = {len(l1)}')
print(f'BoW = {hh}' if check_hashing else f'BoW = {bow}')
fin.close()
stopwords.close()
# 6330483921 (26.00) 307 (2021-03-22 22:21)

# Character classes used by the counters below.
alpha = list('abcdefghijklmnopqrstuvwxyz')
number = list('0123456789')
a = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
file = open(a, 'r')
stopw = open('stopwords.txt', 'r')
# stw collects every whitespace-separated token of the stop-word file.
stw = []
for line in stopw:
    sw = line.split()
    stw.extend(sw)
def ccount(s) :
    """Return the number of characters in *s*."""
    size = len(s)
    return size
def acount(s) :
    """Count the characters of *s* found in the ``alpha`` or ``number`` lists."""
    return sum(1 for ch in s if ch in alpha or ch in number)
def wcount(s) :
    """Return how many whitespace-separated words *s* contains."""
    words = s.split()
    return len(words)
def nfh(s) :
    """Return ``[word, count]`` pairs for the words of *s* not listed in ``stw``.

    Pairs appear in the order each word is first seen.
    """
    kept = [token for token in s.split() if token not in stw]
    seen = []
    tally = []
    for token in kept:
        if token in seen:
            tally[seen.index(token)] += 1
        else:
            seen.append(token)
            tally.append(1)
    return [[token, total] for token, total in zip(seen, tally)]
def fh(s,M) :
    """Return sum(ord(ch) * 37**position) over *s*, modulo *M*."""
    total = 0
    for pos, ch in enumerate(s):
        total += ord(ch) * (37 ** pos)
    return total % M
def yfh(s,M) :
    """Return ``[bucket, count]`` pairs for the hashed non-stop words of *s*.

    Words listed in ``stw`` are skipped; the rest are hashed with ``fh`` and
    tallied in first-seen order.
    """
    hashes = [fh(token, M) for token in s.split() if token not in stw]
    seen = []
    tally = []
    for bucket in hashes:
        if bucket in seen:
            tally[seen.index(bucket)] += 1
        else:
            seen.append(bucket)
            tally.append(1)
    return [[bucket, total] for bucket, total in zip(seen, tally)]
def de(s) :
    """Return *s* with every character outside ``alpha``/``number`` replaced by a space."""
    return ''.join(
        ch if (ch in alpha or ch in number) else ' ' for ch in s
    )
# Report stage.  The two answer branches used to be copies of each other;
# they are merged here so both share the same counting and output code.
while b not in ['n','y','N','Y'] :
    print('Try again.')
    b=input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = b in ['y','Y']
if use_hashing :
    M=int(input('M = '))
print('-------------------')
cc=0  # characters read, newlines included
ac=0  # alphanumeric characters
lc=0  # lines
wc=0  # words
newlines=0
aline=''
for line in file :
    l=de(line.lower())
    cc=cc+ccount(l)
    ac=ac+acount(l)
    lc=lc+1
    wc=wc+wcount(l)
    # Count real newline characters: subtracting the line count was off by
    # one when the last line has no trailing newline.
    newlines=newlines+line.count('\n')
    aline=aline+l
# Both handles were opened earlier in the script and never released.
file.close()
stopw.close()
if use_hashing :
    BoW=yfh(aline,M)
else :
    BoW=nfh(aline)
# Sort in both modes; previously only the hashed bag was sorted.
BoW.sort()
print('char count =',cc-newlines)
print('alphanumeric count =',ac)
print('line count =',lc)
print('word count =',wc)
print('BoW =',BoW)
# 6330485121 (30.00) 308 (2021-03-21 01:01)
def char_count(file_name):
    """Return the number of characters in *file_name*, newlines excluded.

    The file is opened in a ``with`` block so it is closed even if reading
    raises.
    """
    with open(file_name) as fn:
        return sum(1 for line in fn for e in line if e != '\n')
def alphanumeric_count(file_name):
    """Return how many characters of *file_name* are ASCII letters or digits.

    Letters are matched case-insensitively.  The file is opened in a ``with``
    block so it is closed even if reading raises.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    with open(file_name) as fn:
        return sum(1 for line in fn for e in line if e.lower() in allowed)
def line_count(file_name):
    """Return the number of lines in *file_name*.

    The file is opened in a ``with`` block so it is closed even if reading
    raises.
    """
    with open(file_name) as fn:
        return sum(1 for _ in fn)
def list_of_words(file_name):
    """Return the lower-cased words of *file_name* as a list.

    Every character that is not an ASCII letter or digit (newlines included)
    acts as a separator, e.g. ``['it', 'was', 'the', 'best', 'of', ...]``.
    The file is opened in a ``with`` block so it is closed even if reading
    raises.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    with open(file_name) as fn:
        words = fn.read()
    listwords = ''.join(
        e.lower() if e.lower() in allowed else ' ' for e in words
    )
    return listwords.split()
def bag_of_words(file_name):
    """Return the sorted ``[word, count]`` pairs of *file_name*.

    Words come from ``list_of_words``; those also present in
    ``stopwords.txt`` are left out.
    """
    listwords = list_of_words(file_name)
    sw = list_of_words('stopwords.txt')
    kept = [e for e in listwords if e not in sw]
    word = []
    fr = []
    for e in kept:
        key = e.lower()
        if key in word:
            fr[word.index(key)] += 1
        else:
            word.append(key)
            fr.append(int(1))
    bow = [[name, total] for name, total in zip(word, fr)]
    bow.sort()
    return bow
def fhashing(w,m):
    """Return sum(ord(ch) * 37**position) over *w*, modulo *m*."""
    g = 37
    return sum(ord(ch) * (g ** pos) for pos, ch in enumerate(w)) % m
def feature_hashing(file_name):
    """Return the sorted ``[bucket, count]`` pairs of *file_name*.

    Stop words (from ``stopwords.txt``) are removed, every remaining word is
    hashed with ``fhashing`` using the module-level ``m``, and the buckets
    are tallied.
    """
    listwords = list_of_words(file_name)
    sw = list_of_words('stopwords.txt')
    kept = [e.lower() for e in listwords if e.lower() not in sw]
    buckets = [fhashing(e, m) for e in kept]
    ordd = []
    fr = []
    for bucket in buckets:
        if bucket in ordd:
            fr[ordd.index(bucket)] += 1
        else:
            ordd.append(bucket)
            fr.append(int(1))
    bow = [[bucket, total] for bucket, total in zip(ordd, fr)]
    bow.sort()
    return bow
# Driver: ask for the file and the hashing choice, then print the report.
x = ['y', 'Y', 'n', 'N' ]
file_name = input('File name = ')
hashing = input('Use feature hashing ? (y,Y,n,N) ')
while hashing not in x:
    print('Try again.')
    hashing = input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = hashing in ('y', 'Y')
if use_hashing:
    # feature_hashing() reads this module-level value.
    m = int(input('M = '))
print('-------------------')
print('char count = '+str(char_count(file_name)))
print('alphanumeric count = '+str(alphanumeric_count(file_name)))
print('line count = '+str(line_count(file_name)))
print('word count = '+str(len(list_of_words(file_name))))
if use_hashing:
    print('BoW =',feature_hashing(file_name))
else:
    # The label used to read 'Bow =' here, unlike the hashed branch.
    print('BoW =',bag_of_words(file_name))
# 6330486821 (30.00) 309 (2021-03-22 09:27)

def remove_stopwords(text,sword):
    """Return the items of *text* that are not in *sword*, sorted ascending."""
    kept = []
    for e in text:
        if e in sword:
            continue
        kept.append(e)
    kept.sort()
    return kept
def fhash(text,m):
    """Return the sorted hash buckets of the words in *text*.

    Each word hashes to sum(ord(ch) * 37**position) modulo *m*.
    """
    buckets = []
    for w in text:
        total = 0
        for pos, ch in enumerate(w):
            total += ord(ch) * 37 ** pos
        buckets.append(total % m)
    buckets.sort()
    return buckets
def repeat_word(text):
    """Return ``[item, count]`` for the first item of every run in *text*.

    An entry is emitted whenever an item differs from its predecessor, so
    the result has one entry per distinct item only when *text* is sorted.
    """
    ntext = []
    for pos, item in enumerate(text):
        starts_run = pos == 0 or item != text[pos - 1]
        if starts_run:
            ntext.append([item, text.count(item)])
    return ntext
def show_features():
    """Print the separator and the four counters held in module-level names."""
    print('-------------------')
    chars_without_newlines = countc - n
    print('char count =', chars_without_newlines)
    print('alphanumeric count =', counta)
    print('line count =', countl)
    print('word count =', countw)
 
#------------------info from file------------------
f = open(input('File name = '),'r')
countc = counta = countl = n = 0
text = ''
for line in f:
    countl += 1
    line = line.lower()
    for ch in line:
        countc += 1
        if ch == '\n':
            n += 1
        if ch.isalnum():
            counta += 1
            text += ch
        else:
            # Anything that is not a letter or digit separates words.
            text += ' '
text = text.split()
countw = len(text)
f.close()
#-----------------stopwords import-----------------
s = open('stopwords.txt','r')
stopwords = []
for line in s:
    stopwords.extend(line.strip().split())
s.close()
#----------------------Input-----------------------
h = input('Use feature hashing ? (y,Y,n,N) ').lower()
while h not in ('y', 'n'):
    print('Try again.')
    h = input('Use feature hashing ? (y,Y,n,N) ').lower()
#----------------------Output----------------------
stext = remove_stopwords(text,stopwords)
if h == 'y':
    m = int(input('M = '))
    show_features()
    print('BoW =',repeat_word(fhash(stext,m)))
elif h == 'n':
    show_features()
    print('BoW =',repeat_word(stext))
# 6330487421 (22.99) 310 (2021-03-22 23:39)

def iinput():
    """Prompt for the run options.

    Returns ``(file_name, wantfhash, M)`` where ``wantfhash`` is a bool and
    ``M`` is the modulus typed by the user, or -1 when hashing is declined.
    """
    file_name = input('File name = ')
    question = 'Use feature hashing ? (y,Y,n,N) '
    answer = input(question)
    while answer not in ['y', 'Y', 'n', 'N']:
        print('Try again.')
        answer = input(question)
    wantfhash = answer in ['y', 'Y']
    M = int(input('M = ')) if wantfhash else -1
    print('-------------------')

    return file_name, wantfhash, M
def sstopwords():
    """Return the distinct lower-cased words of ``stopwords.txt``.

    Words keep the order in which they first appear.  The file is opened in
    a ``with`` block so it is closed even if reading raises.
    """
    x = []
    with open('stopwords.txt', 'r') as stopWordsFile:
        for line in stopWordsFile:
            for word in line.split():
                word = word.lower()
                if word not in x:
                    x.append(word)

    return x
def wwords(file_name):
    """Scan *file_name* and return ``(chars, alnum, lines, words)``.

    ``chars`` counts every character except newlines, ``alnum`` counts ASCII
    letters and digits, ``lines`` is the number of lines and ``words`` lists
    the maximal runs of ASCII letters/digits in reading order (original
    case).
    """
    q = 0
    p = 0
    lines = 0
    words = []

    with open(file_name, 'r') as wordsFile:
        for line in wordsFile:
            lines += 1
            w = ''
            for y in line:
                if y != '\n':
                    q += 1
                if ('A' <= y <= 'Z') or ('a' <= y <= 'z') or ('0' <= y <= '9'):
                    p += 1
                    w += y
                else:
                    if len(w) != 0:
                        words.append(w)
                    w = ''
            # Flush a word that runs up to the end of the line; without this
            # the last word of a file lacking a trailing newline was lost.
            if len(w) != 0:
                words.append(w)

    return q, p , lines, words
def fhash(w, M):
    """Return sum(ord(ch) * 37**position) over *w*, modulo *M*."""
    G = 37
    weighted = [ord(ch) * (G ** pos) for pos, ch in enumerate(w)]
    return sum(weighted) % M
def bbow(words, stopWords, wantfhash, M):
    """Return ``[key, count]`` pairs for *words*, in first-seen order.

    Words are lower-cased and skipped when present in *stopWords*.  The key
    is the word itself, or ``fhash(word, M)`` when *wantfhash* is true.
    """
    r = []
    for raw in words:
        y = raw.lower()
        if y in stopWords:
            continue
        key = fhash(y, M) if wantfhash else y
        for entry in r:
            if entry[0] == key:
                entry[1] += 1
                break
        else:
            # No existing entry matched: start a new one.
            r.append([key, 1])

    return r

#----------------------------------------------------------------------------
# Driver: gather the options, analyse the file and print the report.
(file_name, wantfhash, M) = iinput()
stopWords = sstopwords()
(q, p, lines, words) = wwords(file_name)
print('char count =', q)
print('alphanumeric count =', p)
print('line count =', lines)
word_total = len(words)
print('word count =', word_total)
bag = bbow(words, stopWords, wantfhash, M)
print('BoW =', bag)

# 6330488021 (19.92) 311 (2021-03-21 12:59)

def fhash(w,M):
    """Return sum(ord(ch) * 37**position) over *w*, modulo int(M)."""
    f = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return f % int(M)
def bow(word,q):
    """Return the sorted bag of words of the text *word*.

    The text is lower-cased and stripped of every character that is not an
    ASCII letter, a digit or a space; tokens found in the module-level
    ``stop_list`` are dropped.  With *q* equal to 'n'/'N' the keys are the
    words; with 'y'/'Y' they are ``fhash(word, M)`` using the module-level
    ``M``.  Any other *q* yields an empty list.
    """
    keep = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            + 'abcdefghijklmnopqrstuvwxyz'
            + '1234567890'
            + ' ')
    filtered = ''.join(ch for ch in word.lower() if ch in keep)
    # Tokens that survive the stop-word filter.
    tokens = [tok for tok in filtered.split() if tok not in stop_list]

    if q in ('N', 'n'):
        values = tokens
    elif q in ('Y', 'y'):
        values = [fhash(tok, M) for tok in tokens]
    else:
        return []

    distinct = []
    for value in values:
        if value not in distinct:
            distinct.append(value)
    pairs = [[value, values.count(value)] for value in distinct]
    pairs.sort()
    return pairs

file_name = input('File name = ')
q = input('Use feature hashing ? (y,Y,n,N) ')
# Compare against the four valid answers; the former substring test
# ("q not in 'yYnN'") also accepted '' and multi-letter input such as 'yY'.
while q not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    q = input('Use feature hashing ? (y,Y,n,N) ')
if q == 'y' or q == 'Y':
    M = input('M = ')  # kept as text; fhash() converts it with int()

# Read the whole stop-word file at once so stop_list is always defined,
# including when the file has a single line (or is empty).
with open('stopwords.txt', 'r') as f_stop:
    stop_list = f_stop.read().split()
#-------------------------------
with open(file_name, 'r') as f_file:
    fn = f_file.read()

# 3) line count: one per newline, plus a last line without a newline.  An
# empty file is reported as one line, as before.
l = fn.count('\n') + (0 if fn.endswith('\n') else 1)

# 1) char count: every character except the newlines.
kount = fn.count('\n')
char = fn.replace('\n', ' ')  # text handed to bow(), newlines become spaces
ch = len(char) - kount

# 2) letters and digits ('0' used to be left out of the digit test).
count = 0
for e in fn:
    if 'A' <= e <= 'Z' or 'a' <= e <= 'z':
        count += 1
    if e in '0123456789':
        count += 1

# 4) word count
f_list = fn.split()
word_c = len(f_list)

print('-------------------')
print('char count = '+str(ch))
print('alphanumeric count = '+str(count))
print('line count = '+str(l))
print('word count = '+str(word_c))
print('BoW =',bow(char,q))


# 6330489721 (23.35) 312 (2021-03-22 21:01)
file_name = input('File name = ')
ft = input('Use feature hashing ? (y,Y,n,N) ')
uh = False  # True when feature hashing was requested
while ft not in ['y','Y','n','N']:
    print('Try again.')
    ft = input('Use feature hashing ? (y,Y,n,N) ')
if ft in ['y','Y']:
    M=input('M = ')  # kept as text; fhash() converts it with int()
    uh = True
print('-------------------')

stopwords_list = []
line_count=0
char_count=0
alpha_count=0
word_count=0

with open('stopwords.txt', 'r') as stopwords_file:
    for line in stopwords_file:
        stopwords_list += line.strip().split()

# One pass fills all three counters (the file used to be reopened for each
# of them); 'with' guarantees the handle is closed.
with open(file_name, 'r') as file:
    for line in file:
        strip_line = line.strip().lower()
        line_count += 1
        char_count += len(strip_line)
        alpha_count += sum(1 for i in strip_line if i.isalnum())
def find_replace(t):
    """Return *t* with quotes, slashes, commas, periods, colons and semicolons turned into spaces."""
    separators = "\"\'/\\,.:;"
    return "".join(" " if c in separators else c for c in t)

# Tokenise the file once; the same pass used to be run twice, first only to
# count the words and then again to collect them.
all_words_list =[]
with open(file_name, 'r') as file:
    for line in file:
        strip_line = line.strip().lower()
        all_words_list += find_replace(strip_line).split()
word_count += len(all_words_list)

print('char count =',char_count)
print('alphanumeric count =',alpha_count)
print('line count =',line_count)
print('word count =',word_count)

all_words_withoutstopwords_list = []
for i in all_words_list:
    if not i in stopwords_list:
        all_words_withoutstopwords_list.append(i)

BoW = []
def add(BoW,d):
    """Count one occurrence of *d* in *BoW* (a list of ``[key, count]``) and return it.

    Every entry whose key equals *d* is incremented; when none matches a new
    ``[d, 1]`` entry is appended.  *BoW* is modified in place.
    """
    matched = False
    for entry in BoW:
        if entry[0] == d:
            entry[1] += 1
            matched = True
    if not matched:
        BoW.append([d,1])
    return BoW
def fhash(list_of_word,M):
    """Return the hash bucket of every word in *list_of_word*, in order.

    A word hashes to sum(ord(ch) * 37**position) modulo int(M).
    """
    return [
        sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(word)) % int(M)
        for word in list_of_word
    ]

if uh:
    # Hash first, then tally the buckets in ascending order.
    wordhash_list = fhash(all_words_withoutstopwords_list,M)
    BoWhash = []
    for i in sorted(wordhash_list):
        BoWhash = add(BoWhash,i)
    print('BoW =',BoWhash)
else:
    # This branch called the undefined name addwordToBoW and raised
    # NameError; the tally helper defined in this script is add().
    for i in all_words_withoutstopwords_list:
        BoW = add(BoW,i)
    print('BoW =',sorted(BoW))







# 6330491921 (25.20) 313 (2021-03-22 22:40)

#------------------------------------
file_name = input("File name = ")

while True:
    feature_hashing = input("Use feature hashing ? (y,Y,n,N) ")
    if feature_hashing.lower() in ['y', 'n']:
        break
    print("Try again.")

dash = '-'*19
if feature_hashing.lower() == 'y':
    while True:
        try:
            m = int(input("M = "))
        except ValueError:
            # Only a non-numeric answer is retried; the former bare except
            # also swallowed EOF/Ctrl-C and could loop forever.
            print("Try again.")
            continue
        print(dash)
        break
else:
    print(dash)

# Read the stop words here, where the file used to be opened (and then
# neither read nor closed).
with open("stopwords.txt","r") as stopwords_file:
    stopwords = [token for line in stopwords_file for token in line.split()]

with open(file_name,"r") as input_file:
    data = input_file.read().lower()
result_char = len(data)-data.count('\n')
print("char count = "+str(result_char))

alphabet = sum(1 for char in data if char.isalpha() or char.isdigit())
result_alpha = alphabet
print("alphanumeric count = "+str(result_alpha))

# A trailing newline produces an empty last piece that is not a line.
d = data.split("\n")
count = len(d)
if not d[-1]:
    count -= 1
result_count = count
print("line count = "+str(result_count))

word = data.replace("\n", " ").replace('"','').replace(".","").replace(",","").split(" ")
# Only purely alphabetic or purely numeric tokens are kept as words.
_word = [w for w in word if w.isalpha() or w.isdigit()]
result_word = len(_word)
print("word count = "+str(result_word))

new_input = [i for i in _word if i not in stopwords]

new_input.sort()
def fhash(w,M):
    """Return sum(ord(ch) * 37**position) over *w*, modulo *M*."""
    G = 37
    num = sum(ord(c) * (G ** index) for index, c in enumerate(w))
    return num % M

if feature_hashing.lower() == 'y':
    # Replace every word by its hash bucket, then restore ascending order.
    for index, w in enumerate(new_input):
        new_input[index] = fhash(w,m)
    new_input.sort()

# Pair every entry with its frequency, then keep the first copy of each pair.
result = [[i, new_input.count(i)] for i in new_input]
res = []
for x in result:
    if x not in res:
        res.append(x)
result_res_1 = res
print("BoW = "+str(result_res_1))
    
#------------------------------

# 6330492521 (21.40) 314 (2021-03-22 19:56)
def fhash(w,M):
  """Return sum(ord(ch) * 37**position) over *w*, modulo *M*."""
  G = 37
  return sum(ord(ch) * (G ** pos) for pos, ch in enumerate(w)) % M
def clean_word(ltext):
  """Print the text statistics of *ltext* and return its lower-cased words.

  *ltext* is a list of lines.  Every non-alphanumeric character acts as a
  word separator.  Lines are tokenised one at a time: they used to be glued
  together without any separator, which fused the last word of a line with
  the first word of the next one.  The printed char count is still the total
  length of the lines, with no extra separator counted.
  """
  char_total = 0
  alc = 0
  list_text = []
  for line in ltext:
    cleaned = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in line)
    char_total += len(cleaned)
    alc += sum(1 for ch in line if ch.isalnum())
    list_text.extend(cleaned.split())
  print('char count =',char_total)
  print('alphanumeric count =',alc)
  print('line count =',len(ltext))
  print('word count =',len(list_text))
  return list_text
def stop_words(text,stopline):
  """Return *text* without the words that occur in the lines of *stopline*."""
  stoplist = ' '.join(stopline).split()
  if not stoplist:
    # Nothing to filter: the input object itself is handed back.
    return text
  return [s for s in text if s not in stoplist]
def text_same(words):
    """Sort *words* in place and return ``[item, count]`` for each distinct item."""
    words.sort()
    bow = []
    seen = []
    for item in words:
        if item in seen:
            continue
        bow.append([item, words.count(item)])
        seen.append(item)
    return bow
file_name = input('File name = ')
fx = input('Use feature hashing ? (y,Y,n,N) ')
while fx not in ['y','Y','n','N']:
  print('Try again.')
  fx = input('Use feature hashing ? (y,Y,n,N) ')
if fx in ('Y', 'y'):
  M = int(input('M = '))

# Both files are read in full and their lines stripped of outer whitespace.
file=open(file_name,'r')
lines=[line.strip() for line in file.readlines()]
file.close()
file2=open('stopwords.txt','r')
lines2=[line.strip() for line in file2.readlines()]
file2.close()
print('-------------------')
all_text = clean_word(lines)
words= stop_words(all_text,lines2)
if fx in ('Y', 'y'):
  number = [fhash(token,M) for token in words]
  bow=text_same(number)
elif fx in ('N', 'n'):
  bow=text_same(words)
print('BoW =',bow)

# 6330494821 (26.00) 315 (2021-03-21 15:28)
def bow(word1,word123):
    """Return how many items of *word123* are equal to *word1*."""
    return sum(1 for item in word123 if word1 == item)
def fhash(word,m):
    """Return sum(ord(ch) * 37**position) over *word*, modulo *m*."""
    weighted = [ord(ch) * 37 ** pos for pos, ch in enumerate(word)]
    return sum(weighted) % m
x=input('File name = ')
file_name=open(x,'r')  # note: this name holds the open file, not its path
y=input('Use feature hashing ? (y,Y,n,N) ')
while y not in ('y','Y','n','N'):
    print('Try again.')
    y=input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = y in ('y','Y')
if use_hashing:
    M=int(input('M = '))
print('-------------------')
with open('stopwords.txt','r') as stopwords:
    stopword=stopwords.read().split()
charcount = 0
alphanumericcount = 0
linecount = 0
word=''
BoW=[]
for line in file_name:
    # Count the characters without the newline; "len(line)-1" was one short
    # for a last line that has no trailing newline.
    charcount+=len(line.rstrip('\n'))
    for c in line:
        if c.isalnum():
            alphanumericcount+=1
            word+=c
        else:
            # Anything that is not a letter or digit separates words.
            word+=' '
    linecount+=1
file_name.close()
print('char count =',charcount)
print('alphanumeric count =',alphanumericcount)
print('line count =',linecount)
word=word.lower().split()
wordcount = len(word)
print('word count =',wordcount)
if use_hashing:
    a=sorted(fhash(token,M) for token in word if token not in stopword)
    # Run-length count over the sorted buckets.
    c=1
    for i in range(1,len(a)):
        if a[i]==a[i-1]:
            c+=1
        else:
            BoW.append([a[i-1],c])
            c=1
    if len(a)>=1:
        BoW.append([a[-1],c])
else:
    for token in word:
        if token in stopword:
            continue
        pair=[token,bow(token,word)]
        if pair not in BoW:
            BoW.append(pair)
    # Sort like the hashed branch does; this branch used to print the words
    # in order of first appearance.
    BoW.sort()
print('BoW = ',BoW)
# 6330495421 (0.00) 316 (2021-03-21 18:54)
# Load the stop-word file and split it into a list of words.
with open("stopwords.txt", "r") as fn:
    stopwords = "".join(line for line in fn)
stopwords = stopwords.split()
    
def fhash_(w,M):
    """Return sum(ord(ch) * 37**position) over *w*, modulo int(M)."""
    total = sum(ord(ch) * 37 ** pos for pos, ch in enumerate(w))
    return total % int(M)
def words_(sentence):
    """Return *sentence* with quote, slash, bracket and punctuation marks removed.

    The filtered text used to be built and then discarded, so the function
    always returned None.
    """
    return "".join(c for c in sentence if c not in "\"\'/\\().,;:")
def clear_stopwords(sentence):
    """Return the lower-cased words of *sentence* not in ``stopwords``, joined by single spaces.

    Quotes, slashes, brackets and punctuation marks count as separators.
    """
    separators = "\"\'/\\().,;:"
    spaced = "".join(" " if c in separators else c for c in sentence)
    tokens = spaced.lower().split()
    return " ".join(e for e in tokens if e not in stopwords)
def word_count(sentence):
    """Return the number of whitespace-separated words in *sentence*."""
    pieces = sentence.strip().split()
    return len(pieces)
def char_count(sentence):
    """Return the length of *sentence* once surrounding whitespace is stripped."""
    trimmed = sentence.strip()
    return len(trimmed)
def alphanumeric_count(sentence):
    """Return the total length of the words of *sentence*.

    Quotes, slashes, brackets and punctuation marks are treated as spaces
    before splitting; every other non-whitespace character is counted.
    """
    separators = "\"\'/\\().,;:"
    spaced = "".join(" " if c in separators else c for c in sentence)
    return sum(len(str(e)) for e in spaced.strip().split())
        



a=input("File name = ")
b=input("Use feature hashing ? (y,Y,n,N) ")
# Compare against the four valid answers; the former substring test against
# "y,Y,n,N" also accepted '', ',' and fragments such as 'y,Y'.
while b not in ["y","Y","n","N"]:
    print ("Try again.")
    b=input("Use feature hashing ? (y,Y,n,N) " )
use_hashing = b in ["y","Y"]
if use_hashing:
    M=input("M = ")  # kept as text; fhash_() converts it with int()
charcount=0
alphanumericcount=0
linecount=0
wordcount=0
bow=[]
bowfinal=[]
# Open the file the user named (it was hard-coded to "sample.txt") and let
# 'with' close it (the old "file.close" lacked the call parentheses).
with open(a, "r") as file:
    for line in file:
        charcount+=char_count(line)
        wordcount+=word_count(line)
        alphanumericcount+=alphanumeric_count(line)
        linecount+=1
        for e in clear_stopwords(line).split():
            bow.append(fhash_(e,M) if use_hashing else e)

print ("-"*19)
print ("char count = " + str(charcount))
print ("alphanumeric count = " + str(alphanumericcount))
print ("line count = " + str(linecount))
print ("word count = " + str(wordcount))
# One tally pass serves both modes; the two former 'if b=="y"or"Y"' style
# tests were always true, so the same loop ran twice.
for item in bow:
    pair=[item,bow.count(item)]
    if pair not in bowfinal:
        bowfinal.append(pair)
bowfinal.sort()
print ("BoW =",bowfinal)



        
# 6330496021 (20.06) 317 (2021-03-22 23:07)
x = input('File name = ')
w = ''  # lower-cased text of the whole file
lines = 0
lens = 0  # characters, newlines excluded
with open(x,'r') as file:
    for line in file:
        lines = lines+1
        # "len(line)-1" was one short for a last line without a newline.
        lens = lens+len(line.rstrip('\n'))
        w = w+line.lower()
with open('stopwords.txt','r') as file:
    j = ''.join(line.lower() for line in file)  # lower-cased stop-word text
def fhash(w,m):
    """Return sum(ord(ch) * 37**position) over *w*, modulo int(m)."""
    total = sum(ord(ch) * (37 ** pos) for pos, ch in enumerate(w))
    return total % int(m)
def word_(x):
    """Return *x* reduced to its alphanumeric characters."""
    return ''.join(ch for ch in x if ch.isalnum())
def stop_(x,stop):
    """Return a list of the items of *x* that are not in *stop*."""
    return [e for e in x if e not in stop]
def howmany_(x):
    """Return ``[item, count]`` for each distinct item of *x*, in first-seen order."""
    pairs = []
    for item in x:
        pair = [item, sum(1 for other in x if item == other)]
        if pair not in pairs:
            pairs.append(pair)
    return pairs
        

list_word = w.split()
list_stopword = j.split()
# Every whitespace-separated token, reduced to its alphanumeric characters.
word = [word_(t) for t in list_word]
char_count = lens
alphanumeric_count = sum(len(h) for h in word)

BoWn = howmany_(stop_(word,list_stopword))
while True:
    u = input('Use feature hashing ? (y,Y,n,N) ')
    u = u.lower()
    if u not in ['y', 'n']:
        print('Try again.')
        continue
    if u == 'y':
        m = input('M = ')
        BoWy = howmany_([fhash(l,m) for l in stop_(word,list_stopword)])
        bag = BoWy
    else:
        bag = BoWn
    # One report for both answers: the 'n' branch printed an 18-dash
    # separator while the 'y' branch printed 19.
    print("-------------------")
    print('char count = ',char_count)
    print('alphanumeric count = ',alphanumeric_count)
    print('line count = ',lines)
    print('word count = ',len(word))
    print('BoW = ',bag)
    break
    


    
# 6330497721 (28.40) 318 (2021-03-18 22:44)
def char_count(file_name):
    """Return the number of characters in the file, not counting newlines.

    Only the line terminator is removed from each line; leading and trailing
    blanks are real characters and are counted.
    """
    with open(file_name,'r') as open_file:
        return sum(len(line.rstrip('\n')) for line in open_file)
def alphanu_count(file_name):
    """Return how many alphanumeric characters the file contains.

    The stripped lines are joined with blanks, normalised by adjust_text and
    squeezed of whitespace before the alphanumeric characters are counted.
    """
    with open(file_name,'r') as handle:
        stripped=[row.strip() for row in handle.readlines()]
    normalized=adjust_text(' '.join(stripped))
    compact=''.join(normalized.split())
    return len([ch for ch in compact if ch.isalnum()])
def line_count_and_word_count(file_name):
    """Return (line_count, word_count) for the file.

    A word is a maximal run of alphanumeric characters, the same tokenisation
    adjust_text applies before the bag of words is built, so "don't" counts
    as two words ("don" and "t") instead of one.
    """
    with open(file_name,'r') as open_file:
        lines=[line.strip() for line in open_file]
    line_count=len(lines)
    text=' '.join(lines)
    # Every non-alphanumeric character acts as a word separator.
    separated=''.join(ch if ch.isalnum() else ' ' for ch in text)
    word_cound=len(separated.split())
    return line_count,word_cound
def read_text(file_name):
    """Return the file's lines, each stripped of surrounding whitespace, joined by single blanks."""
    with open(file_name,'r') as handle:
        stripped=[]
        for row in handle.readlines():
            stripped.append(row.strip())
    return ' '.join(stripped)
def adjust_text(text):
    """Return text lower-cased with each non-alphanumeric character replaced by a blank.

    The text is first split on whitespace; the cleaned tokens are re-joined
    with single blanks.
    """
    cleaned_tokens=[
        ''.join(ch if ch.isalnum() else ' ' for ch in token)
        for token in text.lower().split()
    ]
    return ' '.join(cleaned_tokens)
def list_without_stopwords(adjust_text):
    """Return the words of the given text that are not listed in stopwords.txt.

    The argument is a string; it is split on whitespace and the surviving
    words are returned as a list in their original order.
    """
    banned=read_text('stopwords.txt').split()
    return [token for token in adjust_text.split() if token not in banned]
def BoW(list_without_stopwords):
    """Return the sorted list of [item, count] pairs for the distinct items of the input list."""
    distinct=[]
    for token in list_without_stopwords:
        if token not in distinct:
            distinct.append(token)
    pairs=[[token,list_without_stopwords.count(token)] for token in distinct]
    pairs.sort()
    return pairs
def fhash(word,M):
    """Return the base-37 polynomial hash of word (sum of ord(c) * 37**i) modulo M."""
    return sum(ord(ch)*(37**pos) for pos,ch in enumerate(word))%M
def un_BoW_fhash(list_without_stopwords,M):
    """Return the list of fhash values (modulo M), one per input word.

    Non-alphanumeric characters are removed from each word before hashing.
    """
    cleaned=[''.join(ch for ch in token if ch.isalnum()) for token in list_without_stopwords]
    return [fhash(token,M) for token in cleaned]
def print_info(file_name):
    """Print the separator line followed by the char, alphanumeric, line and word counts of the file."""
    counts=line_count_and_word_count(file_name)
    print('-------------------')
    print('char count =',char_count(file_name))
    print('alphanumeric count =',alphanu_count(file_name))
    # counts is the (line_count, word_count) pair
    print('line count =',counts[0])
    print('word count =',counts[1])

# Interactive driver: ask for the file and the hashing choice (repeating the
# question until the answer is valid), then print the statistics and the BoW.
file_name=input('File name = ')
while True:
    use_feature_hash=input('Use feature hashing ? (y,Y,n,N) ')
    if use_feature_hash in ['y','Y','n','N']:
        break
    print('Try again.')
if use_feature_hash in ['y','Y']:
    M=int(input('M = '))
    print_info(file_name)
    kept_words=list_without_stopwords(adjust_text(read_text(file_name)))
    BoW_fhash=un_BoW_fhash(kept_words,M)
    Bow_fhash=BoW(BoW_fhash)
    print('BoW =',Bow_fhash)
else:
    print_info(file_name)
    kept_words=list_without_stopwords(adjust_text(read_text(file_name)))
    # NOTE: this rebinds the name BoW from the function to its result.
    BoW=BoW(kept_words)
    print('BoW =',BoW)

# 6330498321 (30.00) 319 (2021-03-22 23:21)
# Lower-case alphabet constant and the stop-word list, filled from every
# whitespace-separated token of stopwords.txt.  The handle `inf` is left open
# here; it is closed at the end of the script.
alpha='abcdefghijklmnopqrstuvwxyz'
stop=[]
inf=open('stopwords.txt','r')
for line in inf:
    stop.extend(line.split())
def fhash(w,M):
  """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo int(M)."""
  weighted=[ord(ch)*(37**power) for power,ch in enumerate(w)]
  return sum(weighted)%int(M)
def counts(key,data):
  """Return how many items of data are equal to key."""
  return sum(1 for item in data if item==key)
# Interactive driver: prompts for the input file and the hashing choice,
# computes the statistics and prints the report.
file_name=input('File name = ') 
# Repeat the question until one of y/Y/n/N is entered.
while True:
  choice=input('Use feature hashing ? (y,Y,n,N) ')
  if choice=='n' or choice=='N' or choice=='y' or choice=='Y':
    break
  else:
    print('Try again.')
if choice=='y' or choice=='Y':
    # M is kept as a string here; fhash converts it with int().
    M=input('M = ')
infile=open(file_name,'r')
char_count=0
alnu_count=0
# Characters that count as alphanumeric (ASCII letters and digits only).
counter='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890'
list_of_line=[]
# Single pass over the file: remember each line, count alphanumeric
# characters and count every character except the newline.
for line in infile:
  list_of_line.append(line)
  for j in line:
    if j in counter:
      alnu_count+=1
    if j!='\n':
        char_count+=1
lines=len(list_of_line)
# Build a lower-cased copy of the text in which every non-alphanumeric
# character (newlines included) is a blank, so split() yields the words.
word_counter=''
for k in list_of_line:
  for m in k:
    if m in counter:
      word_counter+=m.lower()
    else:
      word_counter+=' '
list_of_word2=word_counter.split()
# list_of_word: the words that are not stop words.
list_of_word=[]
for i in list_of_word2:
    if i not in stop:
        list_of_word.append(i)
# The word count includes stop words.
words=len(list_of_word2)
BoW=[]
uniq=[]
list_of_fhash=[]
# Without hashing: one [word, count] entry per distinct word, in order of
# first appearance.
if choice=='n' or choice=='N':
  for k in list_of_word:
    if k not in uniq:
      uniq.append(k)
  for i in uniq:
    BoW.append([i,counts(i,list_of_word)])
# With hashing: hash every word, then one [hash, count] entry per distinct
# hash value, in order of first appearance.
if choice=='y' or choice=='Y':
    for i in list_of_word:
      list_of_fhash.append(fhash(i,M))
    for k in list_of_fhash:
        if k not in uniq:
          uniq.append(k)
    for i in uniq:
        BoW.append([i,counts(i,list_of_fhash)])
print('-------------------')
print('char count =',char_count)
print('alphanumeric count =',alnu_count)
print('line count =',lines)
print('word count =',words)
print('BoW =',BoW)
# Close the stop-word file (opened before this section) and the input file.
inf.close()
infile.close()
# 6330499021 (30.00) 320 (2021-03-22 19:18)

# Read the whole input document into `data`, and the stop words into a list.
# `stop_words` stays open; it is closed near the end of the script.
note=input('File name = ')
with open(note,'r') as file:
    data=file.read()
stop_words=open('stopwords.txt','r')
stop_words_data=stop_words.read().split()
def fhash(w,M) :
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo int(M)."""
    total=0
    for position,ch in enumerate(w):
        total=total+ord(ch)*(37**position)
    return total % int(M)

def file_len(fname):
    """Return the number of lines in the file named fname (0 for an empty file).

    The previous enumerate-based version left its loop variable unbound for an
    empty file and raised UnboundLocalError.
    """
    with open(fname) as f:
        return sum(1 for _ in f)

# Ask for the hashing choice until a valid answer is given; M is only
# requested when hashing is enabled.
while True:

    f_hashing = input('Use feature hashing ? (y,Y,n,N) ')
    if f_hashing =='y' or f_hashing=='Y':
        M=int(input('M = '))
        m_check=True
        break
    elif f_hashing =='n' or f_hashing=='N':
        m_check=False
        break
    else:
        print('Try again.')
print('-------------------')

# Lower-case the text and turn every non-alphanumeric character into a blank
# so that split() yields the words.
words=''.join(ch if ch.isalnum() else ' ' for ch in data.lower())

word_list=words.split()
words_lower=word_list
number_of_word=len(words_lower)

# Words that are not stop words.
word_real=[word for word in words_lower if word not in stop_words_data]

# BoW keys are the words themselves, or their hash values when hashing is on.
if m_check:
    keys=[fhash(word, M) for word in word_real]
else:
    keys=word_real

# One [key, count] entry per DISTINCT key.  The non-hashing branch used to
# append an entry for every occurrence, producing duplicates.
BoW=[]
seen=set()
for key in keys:
    if key not in seen:
        seen.add(key)
        BoW.append([key, keys.count(key)])
if m_check:
    # Hashed output is reported in ascending hash order; sorting once here
    # replaces the sort that ran on every loop iteration.
    BoW.sort()

# Every character except the newline counts towards the character total.
number_of_characters = sum(1 for ch in data if ch != "\n")

alphanumeric_count = sum(1 for char in data if char.isalnum())

file.close()
stop_words.close()

print('char count = {}'.format(number_of_characters))
print('alphanumeric count = {}'.format(alphanumeric_count))
print('line count = {}'.format(file_len(note)))
print('word count = {}'.format(number_of_word))
print('BoW = {}'.format(BoW))




# 6330500921 (24.40) 321 (2021-03-22 00:14)

def fhash(w,M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M."""
    c=0
    for i,ch in enumerate(w):
        c=c+ord(ch)*(37**i)
    return c%M

# Interactive driver.  Both modes share one code path; only the BoW keys
# differ (words vs. hash values).
file_name=input('File name = ' )
a=input('Use feature hashing ? (y,Y,n,N) ')
while a.lower() not in ('y','n') :
    print('Try again.')
    a=input('Use feature hashing ? (y,Y,n,N) ')
use_hashing=a.lower() == 'y'
if use_hashing:
    M=int(input('M = '))

# Stop words: every alphanumeric run in stopwords.txt.
with open('stopwords.txt','r') as read:
    g=''.join(ch if ch.isalnum() else ' ' for ch in read.read())
stop_words=g.split()

char_count=0
alpha_count=0
line_count=0
e=[]
with open(file_name,'r') as file:
    for line in file:
        line_count+=1
        # Only the newline is excluded from the character count; surrounding
        # blanks are real characters.
        text=line.rstrip('\n')
        char_count+=len(text)
        alpha_count+=sum(1 for ch in text if ch.isalnum())
        # Tokenise line by line so the last word of one line can never merge
        # with the first word of the next.
        e+=''.join(ch.lower() if ch.isalnum() else ' ' for ch in text).split()
word_count=len(e)

# Words that are not stop words.
stopp=[i for i in e if i not in stop_words]
if use_hashing:
    keys=sorted(fhash(i,M) for i in stopp)
else:
    keys=sorted(stopp)

# Count runs of equal keys in the sorted list.  Comparing with the previous
# ENTRY (not with keys[i-1], which wraps to the last element at i == 0) keeps
# the first key even when every key is identical.
Bow_y= []
for key in keys:
    if Bow_y and Bow_y[-1][0] == key:
        Bow_y[-1][1]+=1
    else:
        Bow_y.append([key,1])

print('-------------------')
print('char count = '+str(char_count))
print('alphanumeric count = '+str(alpha_count))
print('line count = '+str(line_count))
print('word count = '+str(word_count))
print('BoW = '+str(Bow_y))

    
# 6330501521 (30.00) 322 (2021-03-21 18:10)
# Ask for the file name, then repeat the hashing question until the answer is
# one of y/Y/n/N.
file_name=input('File name = ')
while True:
    use=input('Use feature hashing ? (y,Y,n,N) ')
    if use in ['y','Y','n','N']:
        break
    print('Try again.')
def fhash(w,M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M."""
    return sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))%M
def read(file_name):
    """Return the lower-cased words of the file as a list.

    Every character other than a-z, 0-9 and the blank is replaced by a blank
    before the text is split on whitespace.
    """
    allowed='abcdefghijklmnopqrstuvwxyz0123456789 '
    pieces=[]
    with open(file_name,'r') as file:
        for line in file:
            pieces.append(''.join(ch if ch in allowed else ' ' for ch in line.lower()))
    return ''.join(pieces).split()
def bow(ming,unique):
    """Return the sorted list of [item, ming.count(item)] for every item of unique."""
    pairs=[[item,ming.count(item)] for item in unique]
    pairs.sort()
    return pairs

# All lower-cased words of the document (stop words included).
ming=read(file_name)

# Character and line counts.  Only the newline of each line is excluded, so
# the total is right whether or not the file ends with a newline.
charcount=0
linecount=0
with open(file_name,'r') as file:
    for line in file:
        charcount+=len(line.rstrip('\n'))
        linecount+=1

# The words consist of alphanumeric characters only, so their total length is
# the alphanumeric count.
al=sum(len(i) for i in ming)

word=len(ming)

# Read the stop-word file once instead of once per word.
stop_words=read('stopwords.txt')
yum=[i for i in ming if i not in stop_words]

if use in ['Y','y']:
    M=int(input('M = '))
    f=[fhash(i,M) for i in yum]
    u=[]
    for i in f:
        if i not in u:
            u.append(i)
    vee=bow(f,u)
else:
    unique=[]
    for i in yum:
        if i not in unique:
            unique.append(i)
    vee=bow(yum,unique)


print('-------------------')
print('char count = '+str(charcount))
print('alphanumeric count = '+str(al))
print('line count = '+str(linecount))
print('word count = '+str(word)) 
print('BoW =',vee)
print(' ')


# 6330502121 (30.00) 323 (2021-03-22 00:16)
# Single-character lookup lists: lower-case letters and decimal digits.
alp=list('abcdefghijklmnopqrstuvwxyz')
num=list('1234567890')
def spt():
    """Return the document's lower-cased words with stop words removed.

    Reads the file named by the global c and stopwords.txt.  A word is a
    maximal run of characters found in the globals alp / num.  Both files are
    closed before returning.  Filtering (rather than list.remove) also copes
    with a stop word that is listed on more than one line of stopwords.txt,
    which previously raised ValueError.
    """
    with open(c,'r') as b:
        word=b.read().lower()
    a=''.join(ch if ch in alp or ch in num else ' ' for ch in word).split()
    stop_words=[]
    with open('stopwords.txt','r') as stop:
        for x in stop:
            stop_words.extend(x.split())
    return [token for token in a if token not in stop_words]
    
def bow(word):
    """Return [[item, count], ...] for the distinct items of word.

    Entries appear in order of first occurrence.  The list passed in is used;
    the earlier version ignored its parameter and read the global `a`.
    """
    seen=[]
    r=[]
    for token in word:
        if token not in seen:
            seen.append(token)
            r.append([token,word.count(token)])
    return r
    
def fh(word):
    """Prompt for M and return [[bucket, count], ...] for the hashed words.

    Each word is hashed as sum(ord(c) * 37**i) modulo int(M).  Buckets
    0 .. M-1 are reported in ascending order; empty buckets are omitted.
    """
    g=37
    m=input('M = ')
    hashed=[]
    for token in word:
        total=0
        for power,ch in enumerate(token):
            total+=ord(ch)*g**power
        hashed.append(total%int(m))
    c=[]
    for bucket in range(int(m)):
        hits=hashed.count(bucket)
        if hits!=0:
            c.append([bucket,hits])
    return c
def end(bow):
    """Print the report for the file named by the global c.

    Prints the separator, char / alphanumeric / line / word counts and the
    given bag of words.  Letters are matched case-insensitively against the
    globals alp / num.  The file is closed before printing, and newlines are
    excluded from the char count directly, so the total no longer depends on
    whether the file ends with a newline.
    """
    i=0   # line count
    z=0   # alphanumeric count
    o=0   # word count
    n=0   # char count (newlines excluded)
    with open(c,'r') as a:
        for x in a:
            i+=1
            n+=len(x.rstrip('\n'))
            x=x.lower()
            e=''
            for ch in x:
                if ch in alp or ch in num:
                    z+=1
                    e+=ch
                else:
                    # any other character (newline included) separates words
                    e+=' '
            o+=len(e.split())
    print('-------------------')
    print('char count = ',n)
    print('alphanumeric count = ',z)
    print('line count = ',i)
    print('word count = ',o)
    print('BoW = ',bow)
        
# Interactive driver: ask for the file, repeat the hashing question until the
# answer is valid, build the bag of words and print the report.
c=input('File name = ')
x=True
while x:
    b=input('Use feature hashing ? (y,Y,n,N) ')
    if b in ('y','Y','n','N'):
        a=spt()
        # NOTE: the name bow is rebound from the function to its result here.
        bow=fh(a) if b in ('y','Y') else bow(a)
        x=False
    else:
        print('Try again.')
end(bow)
# 6330503821 (22.99) 324 (2021-03-22 14:59)
def feature_hashing(word, m):
    """Return the base-37 polynomial hash of word (sum of ord(c) * 37**i) modulo m."""
    return sum(ord(character) * (37 ** position) for position, character in enumerate(word)) % m

def is_alphanumeric(character):
    """Return True when character lies in one of the ranges a-z, A-Z or 0-9."""
    ranges = (('a', 'z'), ('A', 'Z'), ('0', '9'))
    return any(low <= character <= high for low, high in ranges)


word_file_name = input("File name = ")
# Read both files up front and close them immediately.
with open(word_file_name, "r") as word_file:
    document_lines = word_file.readlines()
with open("stopwords.txt", "r") as stopwords_file:
    stopword_lst = stopwords_file.read().split()

is_feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()
while is_feature_hashing not in ['y', 'n']:
    print("Try again.")
    is_feature_hashing = input("Use feature hashing ? (y,Y,n,N) ").lower()

if is_feature_hashing == 'y':
    m = int(input("M = "))

char_count = 0
alphanum_count = 0
line_count = 0
word_lst = []
word_count = 0
for line in document_lines:
    # Only the newline is excluded, so the total is right with or without a
    # trailing newline at the end of the file.
    char_count += len(line.rstrip("\n"))
    line_count += 1
    alphanum_count += sum(1 for character in line if is_alphanumeric(character))

    word = ""
    # The appended blank is a sentinel: it flushes a word that ends exactly at
    # the end of the line, which matters for a last line without a newline
    # (that word used to be dropped).
    for character in line + " ":
        if is_alphanumeric(character):
            word += character
        elif word != "":
            word_count += 1
            lowered = word.lower()
            if lowered not in stopword_lst:
                word_lst.append(lowered)
            word = ""

# BoW keys are the words themselves or, with hashing, their hash values.
if is_feature_hashing == 'n':
    keys = word_lst
else:
    keys = [feature_hashing(word, m) for word in word_lst]

bow = []
position_of = {}  # key -> index of its entry in bow
for key in keys:
    if key in position_of:
        bow[position_of[key]][1] += 1
    else:
        position_of[key] = len(bow)
        bow.append([key, 1])
bow.sort()
print("-------------------")
print(f"char count = {char_count}")
print(f"alphanumeric count = {alphanum_count}")
print(f"line count = {line_count}")
print(f"word count = {word_count}")
print(f"BoW = {bow}")
# 6330504421 (30.00) 325 (2021-03-21 22:18)
def fhash(w,M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M."""
    total = 0
    for position, ch in enumerate(w):
        total += ord(ch)*(37**position)
    return total % M
# Interactive driver: prompts, loads the stop words, scans the document and
# prints the report piece by piece as the values become available.
file_name = input('File name = ')
feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')
# Repeat the question until one of y/Y/n/N is entered.
while feature_hashing not in ['y','Y','N','n']:
    print('Try again.')
    feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')
if feature_hashing == 'y' or feature_hashing == 'Y':
    M = int(input('M = '))
# Build the stop-word list: keep only a-z, 0-9 and blanks from each line of
# stopwords.txt (every other character is dropped), then split on whitespace.
stopwords_file = open('stopwords.txt','r')
stopwords = ''
list_stopwords = []
for i in stopwords_file:
    for e in range(0,len(i),1):
            if 'a' <= i[e] <= 'z' or '0' <= i[e] <= '9' or i[e] == ' ':
                stopwords += i[e]
    stopwords += ' '
    # Re-split after every line; the value after the last line is the one used.
    list_stopwords = stopwords.split()
stopwords_file.close()
my_file = open(file_name,'r')
print('-------------------')
count1 = 0 #char count
count2 = 0 #alphanumeric count
count3 = 0 #line count
count_bow = 0
bow = []
words_list = []
count_list = []
fhash_list = []
# Lower-cased copy of the document with every non-alphanumeric character
# replaced by a blank; split into words below.
alphanumeric = '' 
for i in my_file:
    i = i.lower()
    # Count every character of the line except the newline.
    for z in range(len(i)):
        if i[z] == '\n':
            count1 += 0
        else:
            count1 += 1
    count3 += 1
    for e in range(0,len(i),1):
        if 'a' <= i[e] <= 'z' or '0' <= i[e] <= '9' or i[e] == ' ':
            alphanumeric += i[e]
            # Blanks are copied through but are not alphanumeric.
            if i[e] != ' ':
                count2 += 1
        else:
            alphanumeric += ' '
    # Separator so words on adjacent lines never merge.
    alphanumeric += ' '
print('char count =',count1)
print('alphanumeric count =',count2)
print('line count =',count3)
list_alphanumeric = alphanumeric.split()
list_alphanumeric.sort()
# The word count includes stop words.
print('word count =',len(list_alphanumeric))
for i in list_alphanumeric:
    if i not in list_stopwords:
        words_list += [i]
words_list.sort()
if feature_hashing == 'n' or feature_hashing == 'N':
    # words_list is sorted, so a distinct word is one that differs from its
    # predecessor; count_list collects the distinct words in sorted order.
    for a in range(len(words_list)):
        if a == 0:
            count_list += [words_list[0]]
        else:
            if words_list[a] != words_list[a-1] :
                count_list += [words_list[a]]
    # Count the occurrences of each distinct word.
    for b in count_list:
        for c in words_list:
            if b == c:
                count_bow += 1
        bow += [[b,count_bow]]
        count_bow = 0
else:
    # Same procedure on the sorted hash values of the words.
    for d in words_list:
        fhash_list += [fhash(d,M)]
    fhash_list.sort()
    for a in range(len(fhash_list)):
        if a == 0:
            count_list += [fhash_list[0]]
        else:
            if fhash_list[a] != fhash_list[a-1] :
                count_list += [fhash_list[a]]
    for e in count_list:
        for f in fhash_list:
            if e == f:
                count_bow += 1
        bow += [[e,count_bow]]
        count_bow = 0
print('BoW =',bow)
my_file.close()


















# 6330505021 (17.75) 326 (2021-03-21 15:16)
# prog-08: Bag-of-words
# # 6330505021 (17.75) Sarun Punsuvon
def remove_new_tab(s):
    """Return s with every newline character removed."""
    return "".join(ch for ch in s if ch != "\n")

def keep_char_int(s):
    """Return only the characters of s that are ASCII letters (either case) or digits."""
    kept = [ch for ch in s if ch.lower() in "1234567890abcdefghijklmnopqrstuvwxyz"]
    return "".join(kept)

def fhash(w, M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M, as an int."""
    G = 37
    total = sum(ord(ch) * (G ** power) for power, ch in enumerate(w))
    return int(0 + total % M)

def char_count(s):
    """Return the length of s once newlines have been removed."""
    return len(remove_new_tab(s))

def alphan_count(s):
    """Return how many characters of s survive keep_char_int (letters and digits)."""
    return len(keep_char_int(s))

def word_list(s):
    """Return the lower-cased words of s as a list.

    A word is a maximal run of ASCII letters or digits; every other character
    (blank, newline, punctuation) separates words.  Words are lower-cased so
    they can be compared against the lower-case stop-word list.  Using
    split() without arguments never yields empty strings, which the previous
    remove-while-iterating loop could leave behind after repeated blanks.
    """
    allowed = "1234567890abcdefghijklmnopqrstuvwxyz"
    normalized = "".join(
        ch.lower() if ch.lower() in allowed else " " for ch in s
    )
    return normalized.split()

def word_count(s):
    """Return the number of words word_list finds in s."""
    return len(word_list(s))

def bow(s, n, m=0):
    """Return the bag of words of s as a list of [key, count] pairs.

    n == 0: keys are the words themselves.
    n == 1: keys are fhash(word, m); every OCCURRENCE of a word adds to its
            bucket (the earlier version added 1 per distinct word only).
    Any other n returns None.  Stop words (fixed list below) are skipped.
    Entries appear in order of each key's first occurrence.
    """
    stop = ['it', 'they', 'the', 'a', 'an', 'of', 'on',
            'in', 'at', 'is', 'am', 'are', 'was', 'were']
    list_word = word_list(s)
    if n not in (0, 1):
        return None
    key = []
    list_out = []
    for word in list_word:
        if word in stop:
            continue
        token = fhash(word, m) if n == 1 else word
        if token in key:
            list_out[key.index(token)][1] += 1
        else:
            list_out.append([token, 1])
            key.append(token)
    return list_out

def all_text(file):
    """Return (text, count): the items of file concatenated, and how many items there were."""
    chunks = list(file)
    return "".join(chunks), len(chunks)

def displayed(file, m, n):
    """Print the separator, the four counts and the bag of words for an open file.

    m and n are passed on to bow() as its modulus and mode arguments.
    """
    content, line_total = all_text(file)
    print("-"*19)
    print("char count =", char_count(content))
    print("alphanumeric count =", alphan_count(content))
    print("line count =", line_total)
    print("word count =", word_count(content))
    print("BoW = ", bow(content, n, m))

def main():
    """Prompt for the file and the hashing choice, then print the report.

    The hashing question is repeated until the answer is y/Y/n/N.  The file is
    opened once, after a valid answer, and closed when the report is done
    (it used to be re-opened on every pass of the prompt loop and never
    closed).  Prompt texts follow the exercise's wording.
    """
    file_name = input("File name = ")
    while True:
        user_input = input("Use feature hashing ? (y,Y,n,N) ").lower()
        if user_input in ("y", "n"):
            break
        print("Try again.")

    if user_input == "y":
        user_m = int(input("M = "))
        mode = 1
    else:
        user_m = 0
        mode = 0

    with open(file_name, "r") as file:
        displayed(file, user_m, mode)


# Run the interactive program.
main()

# 6330507321 (21.40) 327 (2021-03-18 21:59)
def fhash(w,M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo int(M)."""
    total=sum(ord(ch)*(37**(power)) for power,ch in enumerate(w))
    return total%int(M)
x=input('File name = ',)
y=input('Use feature hashing ? (y,Y,n,N) ',)
while y not in ['y','Y','n','N']:
    print('Try again.')
    y=input('Use feature hashing ? (y,Y,n,N) ',)
if y.lower()=='y':
    # Kept as entered; fhash converts it with int().
    M=input('M = ',)

# Scan the document: a holds a copy in which every character that is not an
# ASCII letter or digit is a blank.
a=''
lc=0
chc=0
with open(x,'r') as File:
    for line in File:
        # Only the newline is excluded from the character count.
        text=line.rstrip('\n')
        for e in text:
            if (('a'<=e.lower() and e.lower()<='z') or ('0'<= e<='9')):
                a+=e
            else:
                a+=' '
        # Line separator: without it the last word of a line merged with the
        # first word of the next line.
        a+=' '
        chc+=len(text)
        lc+=1
a=a.lower().split()
wc=len(a)

with open('stopwords.txt','r') as stop:
    b=stop.read().split()

# Words contain alphanumeric characters only, so their total length is the
# alphanumeric count.
alm=sum(len(e) for e in a)

# Non-stop words in sorted order.
B=sorted(e for e in a if not e in b)
if y in['N','n']:
    k=B
else:
    k=sorted(fhash(e,M) for e in B)

# Count runs of equal keys.  No sentinel value is needed (the old sentinel
# 111 was appended on every iteration and ended up counted as a hash value).
j=[]
for key in k:
    if j and j[-1][0]==key:
        j[-1][1]+=1
    else:
        j.append([key,1])
print('-------------------')        
print('char count =',chc)        
print('alphanumeric count =',alm)
print('line count =',lc)
print('word count =',wc)
print('BoW =',j)
# 6330508021 (19.40) 328 (2021-03-21 23:12)

# Prompts: file name, hashing choice (repeated until valid), optional M, then
# the separator line that starts the report.
file_name = input('File name = ')
while True:
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    if fh in ['y','n','N','Y']:
        break
    print('Try again.')
if fh == 'y' or fh == 'Y':
    M = int(input('M = '))
print('-------------------')
#input
    
def word_count(word,list):
    """Return how many items of list are equal to word."""
    return sum(1 for item in list if word == item)
def BoW(w_list,stopwords):
    """Return the sorted [word, count] list for the words of w_list that are not in stopwords."""
    kept = [w for w in w_list if w not in stopwords]
    seen = []
    pairs = []
    for w in kept:
        if w in seen:
            continue
        seen.append(w)
        pairs.append([w,word_count(w,kept)])
    return sorted(pairs)
def fhash(w,M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M."""
    return sum(ord(ch)*(37**(pos)) for pos,ch in enumerate(w))%M
def BoW2(w_list,stopwords,M):
    """Return the sorted [hash, count] list for the non-stop words of w_list.

    Each distinct word contributes its frequency to the bucket fhash(word, M);
    words that collide share one entry.
    """
    kept = [w for w in w_list if w not in stopwords]
    distinct = []
    for w in kept:
        if w not in distinct:
            distinct.append(w)
    buckets = []   # hash values in order of first appearance
    merged = []    # [hash, count] entries, parallel to buckets
    for w in distinct:
        bucket = fhash(w,M)
        frequency = word_count(w,kept)
        if bucket in buckets:
            slot = buckets.index(bucket)
            merged[slot] = [merged[slot][0],merged[slot][1]+frequency]
        else:
            buckets.append(bucket)
            merged.append([bucket,frequency])
    merged.sort()
    return merged

#def_____________________________________________________

# Stop words: every whitespace-separated token of stopwords.txt, lower-cased.
stopwords = []
with open('stopwords.txt','r') as file:
    for line in file:
        stopwords += line.lower().split()

# Scan the document once, line by line.
line_count = 0
character_count = 0
alphanumeric_count = 0
newword = ''
with open(file_name,'r') as file:
    for line in file:
        line_count += 1
        # Only the newline is excluded from the character count.
        text = line.rstrip('\n')
        character_count += len(text)
        for i in text:
            if i.isalnum():
                alphanumeric_count += 1
                newword += i.lower()
            else:
                newword += ' '
        # Line separator: keeps the last word of a line from merging with the
        # first word of the next one.
        newword += ' '

print('char count =',character_count)
print('alphanumeric count =',alphanumeric_count)       
print('line count =',line_count)

# split() without arguments never yields empty strings, so no clean-up of ''
# entries is needed (remove('') raised ValueError when there was none and
# removed only one when there were several).
w_list = newword.split()
word_list = len(w_list)
print('word count =',word_list)

if fh in ['y','Y']:
    print('BoW =',str(BoW2(w_list,stopwords,M)))
elif fh in ['n','N']:
    print('BoW =',str(BoW(w_list,stopwords)))
# 6330509621 (11.08) 329 (2021-03-22 17:42)

# Stop words: every whitespace-separated token of stopwords.txt.
with open('stopwords.txt','r') as file:
    la = file.read().split()
file_name = input("File name = ")
# c: the document with each line prefixed by one blank and its newline
# removed.  rstrip('\n') leaves a final line without a newline intact;
# slicing with [:-1] chopped its last character.
c = ''
line_c = 0   #line count
with open(file_name,'r') as file_:
    for line in file_:
        c += ' ' + line.rstrip('\n')
        line_c +=1
# Every non-alphanumeric character separates words, not just a fixed list of
# punctuation marks.
lc = ''.join(ch if ch.isalnum() else ' ' for ch in c.lower()).split()
wc= len(lc) #word count
cc = len(c) - line_c #char count: drop the one blank added per line
ac = sum(len(i) for i in lc) #alpha count: words hold alphanumerics only
#BoW

# d: the words that are not stop words.
d = [e for e in lc if not(e in la)]
        
def word_frequency():
    """Return [[word, count], ...] for the global word list d, in order of first occurrence."""
    seen = []
    pairs = []
    for token in d:
        if token in seen:
            for slot, known in enumerate(seen):
                if token == known:
                    pairs[slot][1] += 1
        else:
            pairs.append([token,1])
            seen.append(token)
    return pairs
def fhash(w,M): # big,4
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M."""
    return sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))%M
def BoW():
    """Return [[bucket, count], ...] for the hashed words of the global list d.

    Words are hashed with fhash modulo the global M; buckets 0 .. M-1 are
    reported in ascending order and empty buckets are omitted.
    """
    hashed = [fhash(token,M) for token in d]
    result = []
    for bucket in range(M):
        hits = hashed.count(bucket)
        if hits != 0:
            result.append([bucket,hits])
    return result

#file_name = input("File name =")
# Ask for the hashing choice until the answer is valid, then print the report.
hh = input('Use feature hashing ? (y,Y,n,N) ')
ans = ['y','Y','n','N']
while not(hh in ans):
    print('Try again.')
    hh = input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = hh == 'y' or hh == 'Y'
if use_hashing:
    M = int(input('M = '))
print('-------------------')
print('char count = '+ str(cc))
# Label spelling corrected (was 'aplhanumaric').
print('alphanumeric count = '+str(ac))
print('line count = '+str(line_c))
print('word count = '+str(wc))
if use_hashing:
    print('BoW = '+str(BoW()))
else:
    print('BoW = '+ str(word_frequency()))

    
    
    
    


# 6330510121 (13.00) 330 (2021-03-22 23:58)
# Prompts.  file_name is the open document handle used further down; c
# records the hashing choice as 'yes' / 'no'.
Filename =input('File name = ')
file_name=open(Filename,"r" )
a=input('Use feature hashing ? (y,Y,n,N) ')
while a not in ['y','Y','n','N']:
    # Retry message now ends with a period, as the exercise's wording does.
    print('Try again.')
    a=input('Use feature hashing ? (y,Y,n,N) ')
if a in ['Y','y']:
    # Kept as entered; converted with int() where it is used.
    M=input('M = ')
    c='yes'
else:
    c='no'
def fhash(w,M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M."""
    total=0
    for position,ch in enumerate(w):
        total+=((ord(ch))*(37**position))
    return total%M
def stopword(sentence):
    """Return sentence without its stop words.

    The stop words are the whitespace-separated tokens of stopwords.txt;
    matching is done on the lower-cased word.  The result is a string in
    which every kept word is preceded by one blank ('' when nothing is kept).
    The stop-word file is closed before returning.
    """
    with open('stopwords.txt','r') as handle:
        STOPWORD=handle.read().split()
    x=''
    for i in sentence.split():
        if  i.lower() not in STOPWORD:
            x+=' '+i
    return  x
# Read the document once from the handle opened at the prompts and close it
# (it used to be opened three times and never closed).
raw_text=file_name.read()
file_name.close()

# Text without newlines: its length is the character count.
file_name2=raw_text.replace('\n','')
# Letters and ALL ten digits ('0' was missing from this list before).
ALPHAMERIC=list('abcdefghijklmnopqrstuvwxyz0123456789')
alnum_total=sum(1 for ch in file_name2 if ch.lower() in ALPHAMERIC)
# A final line without a newline still counts as a line.
line_total=raw_text.count('\n')
if raw_text and not raw_text.endswith('\n'):
    line_total+=1
# Words: lower-cased runs of alphanumeric characters; every other character
# (not only a fixed punctuation list) separates words.
word=''.join(ch.lower() if ch.lower() in ALPHAMERIC else ' ' for ch in raw_text).split()

# Report: separator first, lower-case labels, BoW labelled like the counts.
print('-------------------')
print('char count =',len(file_name2))
print('alphanumeric count =',alnum_total)
print('line count =',line_total)
print('word count =',len(word))

# Words that are not stop words (stopword() returns a blank-separated string).
kept=stopword(' '.join(word)).split()
if c == 'yes':
    x=sorted(fhash(i,int(M)) for i in kept)
else:
    x=sorted(kept)
# Count runs of equal keys in the sorted list.
BoW = []
for i in x:
    if BoW and BoW[-1][0]==i:
        BoW[-1][1]+=1
    else:
        BoW.append([i,1])
print('BoW =',BoW)
    
# 6330511821 (25.15) 331 (2021-03-20 00:28)



# Module-level state shared by the functions below:
#   the_list       stop words
#   string         text accumulator; word_count replaces it with the word list
#   list_of_fhash  hash values of the kept words
the_list, list_of_fhash = [], []
string = ''
def fhash(w,M):
    """Return the base-37 polynomial hash of w (sum of ord(c) * 37**i) modulo M."""
    return sum(ord(ch)*(37)**pos for pos,ch in enumerate(w))%M
def char_count(file):
    """Return the number of characters in file, not counting newlines.

    file is any iterable of lines (e.g. an open text file).  Newlines are
    skipped directly, so the result is correct whether or not the last line
    ends with one, and the file no longer has to be re-opened through a
    global name just to count its lines.
    """
    return sum(1 for line in file for ch in line if ch != '\n')
def alphanumeric_count(file):
    """Return how many characters across the lines of file satisfy str.isalnum()."""
    return sum(1 for row in file for ch in row if ch.isalnum())
def line_count(file):
    """Return the number of items (lines) produced by iterating file."""
    return sum(1 for _ in file)
def word_count(file):
    """Return the number of words in file and store the word list in the global `string`.

    Each line is appended to the global `string` with every non-alphanumeric
    character turned into a blank; `string` is then replaced by its
    whitespace-split word list.
    """
    global string
    for row in file:
        string += ''.join(ch if ch.isalnum() else ' ' for ch in row)
    string = string.split()
    return len(string)
def BoW_not_hashing():
    """Return [[word, count], ...] for the non-stop words in the global `string`.

    Words are lower-cased before both the stop-word test and the counting, so
    'Big' and 'big' share one entry (previously only the stop-word test was
    case-insensitive).  Entries appear in order of first occurrence.
    """
    founded_word = []
    BoW = []
    for i in string:
        word = i.lower()
        if word in the_list:
            continue
        if word not in founded_word:
            founded_word.append(word)
            BoW.append([word,1])
        else:
            BoW[founded_word.index(word)][1] += 1
    return BoW
def remove_punctual():
    """Append every whitespace-separated token of stopwords.txt to the global the_list.

    Despite its name this loads the stop words.  The file is closed when the
    function returns.
    """
    global the_list
    with open('stopwords.txt') as file:
        for i in file:
            the_list.extend(i.split())
            
def BoW_but_hashing():
    """Return the hashed bag of words as a sorted list of [hash, count] pairs.

    Reads the module-level word list `string`, stop-word list `the_list` and
    modulus `M`. Each non-stop word is lower-cased and hashed with fhash().
    The hash values are also stored in the module-level `list_of_fhash`,
    which is rebuilt on every call so repeated calls do not double count.
    """
    global list_of_fhash
    list_of_fhash = [
        fhash(word.lower(), M) for word in string
        if word.lower() not in the_list
    ]
    counts = {}
    for value in list_of_fhash:
        counts[value] = counts.get(value, 0) + 1
    return sorted([value, n] for value, n in counts.items())
    
    

# Load the stop words, read the user's choices, then print the statistics.
remove_punctual()
file_name = input("File name = ")
asking = input("Use feature hashing ? (y,Y,n,N) ")
# Compare against the four exact answers: a substring test on 'yYnN' would
# also accept an empty answer or multi-letter input such as 'yY'.
while asking not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    asking = input("Use feature hashing ? (y,Y,n,N) ")
if asking in 'yY':
    M = int(input("M = "))
print('-------------------')

# Each counter consumes the whole file, so it is reopened for every pass and
# closed again as soon as that pass is finished.
with open(file_name,'r') as file:
    print('char count =',char_count(file))
with open(file_name,'r') as file:
    print('alphanumeric count =',alphanumeric_count(file))
with open(file_name,'r') as file:
    print('line count =',line_count(file))
with open(file_name,'r') as file:
    print('word count =',word_count(file))

if asking in 'nN':
    print('BoW =',BoW_not_hashing())
elif asking in 'yY':
    print('BoW =',BoW_but_hashing())
# 6330512421 (24.90) 332 (2021-03-21 21:54)
def fhash(w,M) :
    """Return the base-37 polynomial hash of w modulo int(M).

    M may be an int or a string of digits.
    """
    total = 0
    for position, char in enumerate(w) :
        total += ord(char) * 37 ** position
    return total % int(M)
# Read the user's choices, load the stop words and the input text, then print
# the four counts. The bag of words itself is built further down the script.
file = input("File name = ", )   
file_name = open(file,"r")
use_fhash = input("Use feature hashing ? (y,Y,n,N) ",) 
# The answer is wrapped in commas and searched for inside ",y,Y,n,N," so that
# only one of the listed items (or a comma-joined run of them) is accepted.
if ","+use_fhash+"," not in ",y,Y,n,N," :
    while ","+use_fhash+"," not in ",y,Y,n,N," :
        print("Try again.")
        use_fhash = input("Use feature hashing ? (y,Y,n,N) ",)
# M is kept as the raw input string; fhash() converts it with int().
if ","+use_fhash+"," in ",y,Y," :
    M = input("M = ",)
else :
    pass
#--------------------------------
# stop words: every whitespace-separated word of stopwords.txt
stop_word = open("stopwords.txt",)
list_stopword = [line.rstrip('\n') for line in stop_word]
stopword_content = []
for e in list_stopword :
    stopword_content += e.split()
stop_word.close()
#--------------------------------------
# list_word: the input lines without their newline
# list_content: the whitespace-separated tokens of those lines
list_word = [line.rstrip('\n') for line in file_name]
list_content = []
for e in list_word :
    list_content += e.split()
#---------------------------------
# list_content1: each token reduced to its lower-cased a-z letters and 0-9
# digits; every other character is dropped, so a token made only of
# punctuation becomes an empty string that is still kept in the list.
list_content1 = []
for e in list_content :
    word = ""
    for k in e :
        if "a"<=k.lower()<="z" :
            word += k.lower()
        elif "0"<=k<="9" :
            word += k
    list_content1 += [word]
# list_content2: list_content1 without the stop words
list_content2 = []
for e in list_content1 :
    if e in stopword_content :
        # Adding an empty string extends the list by nothing: the word is skipped.
        list_content2 += ""
    else :
        list_content2 += [e]
file_name.close()
#--------------------------------------
print("-------------------")
# character count: characters of every line, newline excluded
character_count = 0
for e in list_word :
    for k in e :
        character_count += 1
print("char count = ",character_count)
# alphanumeric count: total length of the cleaned tokens
alphanumeric_count = 0
for e in list_content1 :
    for k in e :
        alphanumeric_count += 1
print("alphanumeric count = ",alphanumeric_count)
# line count: the file is opened a second time and its lines are counted
line_count = 0
for line in open(file,"r") :
    line_count += 1
print("line count = ",line_count)
# word count: number of cleaned tokens (empty ones included)
word_count = 0
for e in list_content1 :
    word_count += 1
print("word count = ",word_count)
#--------------------------------------
#Bag_of_word
def repeat(N,data) :
    """Return how many items of data are equal to N."""
    return sum(1 for item in data if N == item)
# Build the bag of words from list_content2 (the words without stop words).
if ","+use_fhash+"," in ",y,Y," :
    # Hashing branch: replace every word by its hash value.
    data = []
    for e in list_content2 :
        data += [fhash(e,M)]
# Worked example:
#data = [3,0,3,2,1,2,3,3]
#BoW = [[0, 1], [1, 1], [2, 2], [3, 4]]
    # data1: the distinct hash values in ascending order
    data1 = []
    for e in data :
        if e not in data1 :
            data1 += [e]
    data1 = sorted(data1)
#data1 = [0,1,2,3]
    Bow = [[e,repeat(e,data)] for e in data1]
elif ","+use_fhash+"," in ",n,N," :
    # Plain branch: distinct words with their counts, sorted.
    data_ = []
    for e in list_content2 :
        if e not in data_ :
            data_ += [e]
    Bow = sorted([[e,repeat(e,list_content2)] for e in data_])    
#--------------------------------------
print("Bow = ",Bow)


    

        
        
        
    

        


        

# 6330513021 (17.85) 333 (2021-03-21 22:33)
def fhash(w,M):
    """Return the polynomial hash of w with base G = 37, reduced modulo M."""
    G=37
    return sum(ord(ch)*G**power for power,ch in enumerate(w))%M
def lookBow(s):
    """Return the sorted [item, count] pairs for the items of s.

    s is sorted in place, then runs of equal neighbours are counted.
    An empty list gives an empty result and a single item gives one pair.
    """
    s.sort()
    ans_true = []
    for item in s:
        if ans_true and ans_true[-1][0] == item:
            # Same item as the previous one: extend the current run.
            ans_true[-1][1] += 1
        else:
            ans_true.append([item, 1])
    return ans_true
#-----------------------------------------#
# Read the user's choices, open the input file and load the stop words.
file_name=input('File name = ')
f_hash=input('Use feature hashing ? (y,Y,n,N) ')
# The input file stays open: it is read by the branch that runs below.
file=open(file_name,'r')
# Stop words: every whitespace-separated word of stopwords.txt, lower-cased.
with open('stopwords.txt','r') as stopwords:
    str_stop=stopwords.read()
str_stop=str_stop.lower()
str_stop=str_stop.split()
#general case
# Compare against the four exact answers: a substring test on 'yYnN' would
# accept an empty answer, which matches both 'nN' and 'yY' further down.
while f_hash not in ('y','Y','n','N'):
    print('Try again.')
    f_hash=input('Use feature hashing ? (y,Y,n,N) ')

#case n or N
# Branch without feature hashing: print the four counts and the bag of words.
if f_hash in'nN':
    print('-------------------')
    # all_sentence: the stripped lines of the input file; c: number of lines
    all_sentence=[]
    c=0
    for line in file:
        all_sentence.append(line.strip())   
        c+=1
    # words: all lines joined, each followed by one space
    words=''
    for i in all_sentence :
        words+=i+' '
    all_words=words.split()
    # punctuation: the same text with the listed punctuation marks blanked out
    punctuation=''
    for i in words:
        if i in "\"\'/\\,.:;()[]{}":
                punctuation += " "
        else:
            punctuation += i
    # One space was added per line above, so c is subtracted again.
    char_count=len(punctuation)-c 
    print('char count = ',char_count )
    punctuation_lower=punctuation.lower()
    punctuation_words=punctuation_lower.split()
    count_words=(len(punctuation_words)) # number of words after cleaning
    # alphanumeric count: total length of the words once all whitespace is removed
    only_words=punctuation.split()
    alphanumric=''
    for i in only_words:
        alphanumric+=i
    count_alphanumric=len(alphanumric) 
    print('alphanumeric count = ',count_alphanumric )
    print('line count = ',c )
    print('word count = ',count_words )
    # Drop the stop words, then count the remaining lower-cased words.
    no_stopwords=[]
    punctuation_low=punctuation.lower()
    punctuation_split=punctuation_low.split()
    for i in punctuation_split:
        if not i in str_stop:
            no_stopwords.append(i)
    bow=lookBow(no_stopwords)
    print('BoW =',bow)
    
#case y or Y
# Branch with feature hashing: print the four counts and the hashed bag of words.
if f_hash in'yY':
    M=int(input('M = '))
    print('-------------------')
    # all_sentence: the stripped lines of the input file; c: number of lines
    all_sentence=[]
    c=0
    for line in file:
        all_sentence.append(line.strip())
        c+=1
    # words: all lines joined, each followed by one space
    words=''
    for i in all_sentence :
        words+=i+' '
    # punctuation: the same text with the listed punctuation marks blanked out
    punctuation=''
    for i in words:
        if i in "\"\'/\\,.:;()[]{}":
            punctuation += " "
        else:
            punctuation += i
    # One space was added per line above, so c is subtracted again.
    char_count=len(punctuation)-c
    print('char count = ',char_count )
    punctuation_split=punctuation.lower().split()
    count_words=len(punctuation_split)
    # alphanumeric count: total length of the words once all whitespace is removed
    count_alphanumric=len(''.join(punctuation.split()))
    print('alphanumeric count = ',count_alphanumric )
    print('line count = ',c )
    print('word count = ',count_words )
    # Drop the stop words and hash what is left.
    no_stopwords=[i for i in punctuation_split if not i in str_stop]
    num_fhash=[fhash(i,M) for i in no_stopwords]
    order_fhash=sorted(num_fhash)
    # Count runs of equal neighbours in the sorted hashes. Extending the last
    # pair (instead of looking one index back) also works when there are no
    # hashes at all or only one.
    bow_y=[]
    for value in order_fhash:
        if bow_y and bow_y[-1][0] == value:
            bow_y[-1][1] += 1
        else:
            bow_y.append([value, 1])
    print('BoW =',bow_y)

file.close()
# 6330514721 (30.00) 334 (2021-03-22 01:41)
# Read the user's choices, scan the input file once to gather the counts and
# the word list, then load the stop words and filter them out.
a=input('File name = ')
b=input('Use feature hashing ? (y,Y,n,N) ')
c=open(a,'r')
# d first collects the cleaned text, then becomes the list of words.
d=''
alnum='0123456789abcdefghijklmnopqrstuvwxyz'
l=['n','N','y','Y']
charcount=0
alnumcount=0
linecount=0
wordcount=0
for line in c:
    linecount+=1
    line=line.lower()
    for i in line:
        charcount+=1
        # Characters outside `alnum` (newlines included) become spaces in d.
        if i in alnum:
            alnumcount+=1
            d+=i
        else:
            d+=' '
d=d.split()
wordcount=len(d)
# charcount included the newlines; this removes one per line except the last,
# i.e. it assumes the last line has no trailing newline.
charcount=charcount-linecount+1
c.close()
# The answer is only validated here, after the file has been processed.
while b not in l:       
   print('Try again.')
   b=input('Use feature hashing ? (y,Y,n,N) ')
# y: every whitespace-separated word of stopwords.txt
u=open('stopwords.txt','r')
y=[]
for line in u:
    line=line.strip().split()
    y+=line
u.close()
# nostop: the words of the text that are not stop words
nostop=[]
for i in d:
    if i not in y:
        nostop+=[i]
def fhash(nostop,m):
    """Return the list of base-37 polynomial hashes (modulo int(m)) of the words in nostop."""
    hashed=[]
    for w in nostop:
        value=sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))
        hashed.append(value%int(m))
    return hashed
def kumsum(p):
    """Return the sorted [item, count] pairs for the items of p.

    p is sorted in place and nothing else is added to it, so the same list
    can safely be passed again.
    """
    p.sort()
    noks=[]
    for item in p:
        if noks and noks[-1][0]==item:
            # Same item as the previous one: extend the current run.
            noks[-1][1]+=1
        else:
            noks.append([item,1])
    return noks
# Build the bag of words (hashed or plain), then print the report.
nono='-'*19
if b=='y' or b=='Y':
    m=input('M = ')
    f=kumsum(fhash(nostop,m))
else:
    # b has already been validated, so this is the 'n' / 'N' case.
    f=kumsum(nostop)
print(nono)
print('char count =',charcount)
print('alphanumeric count =',alnumcount)
print('line count =',linecount)
print('word count =',wordcount)
print('BoW =',f)

# 6330515321 (7.37) 335 (2021-03-22 23:58)
#Prog-08: Bag-of-words
# 6330515321 (7.37) Siriphon Chitkham
# Read the user's choices and load the stop words.
fileName = input("File name = ")
fHashOption = input("Use feature hashing ? (y,Y,n,N) ").lower()
while fHashOption not in ["y", "n"] :
    print("Try again.")
    fHashOption = input("Use feature hashing ? (y,Y,n,N) ").lower()
useFHash = fHashOption == "y"
# M is only asked for when hashing is requested. The placeholder 1 keeps M
# defined for code that calls fHash(word, M) regardless of the option.
M = int(input("M = ")) if useFHash else 1

# Stop words: every whitespace-separated, lower-cased word of stopwords.txt.
stopWordList = []
with open("stopwords.txt", "r") as stopwordsFile :
    for line in stopwordsFile :
        stopWordList += line.strip().lower().split()
def fHash(word,M):
    """Return the polynomial hash of word with base G = 37, reduced modulo M."""
    G = 37
    return sum(ord(char) * ( G ** p ) for p, char in enumerate(word)) % M
def sentenceClean(sentence, stopWordList):
    """Split sentence into lower-cased alphanumeric words.

    Returns a tuple (cleanWordList, wordCount, alphanumericCount):
    cleanWordList     -- the words that are not in stopWordList, in order
    wordCount         -- number of words found, stop words included
    alphanumericCount -- total length of all words found, stop words included
    """
    cleanWordList = []
    wordCount = 0
    alphanumericCount = 0
    word = ""
    # The extra trailing space ends the last word, so a sentence that finishes
    # with a letter or digit does not lose its final word.
    for char in sentence + " " :
        if char.isalpha() or char.isdigit() :
            word += char.lower()
        elif word != "" :
            alphanumericCount += len(word)
            wordCount += 1
            if word not in stopWordList :
                cleanWordList.append(word)
            word = ""
    return cleanWordList , wordCount , alphanumericCount


# Scan the input file line by line, accumulate the counts and the bag of
# words, then print the report.
wordList = []       # distinct words seen so far (plain mode)
wordCodeList = []   # distinct hash values seen so far (hashing mode)
bagOfWord = []      # [key, count] pairs, in the same order as the list above
charCount = 0
alphanumericCount = 0
lineCount = 0
wordCount = 0
with open(fileName, "r") as f :
    for line in f :
        lineCount += 1
        sentence = line.strip()
        charCount += len(sentence)
        CleanWordList , wc , alnum = sentenceClean(sentence, stopWordList)
        wordCount += wc
        alphanumericCount += alnum
        for word in CleanWordList :
            # The hash is computed only when hashing was requested.
            if useFHash :
                key = fHash(word,M)
                seen = wordCodeList
            else :
                key = word
                seen = wordList
            if key not in seen :
                seen.append(key)
                bagOfWord.append([key, 1])
            else :
                bagOfWord[seen.index(key)][1] += 1
print("-------------------")
print("char count =" , charCount )
print("alphanumeric count =", alphanumericCount )
print("line count =", lineCount )
print("word count =", wordCount )
bagOfWord.sort()
print("BoW =", bagOfWord)
# 6330516021 (20.80) 336 (2021-03-21 18:33)

def blank(t) :
    """Return t with each of the listed punctuation characters replaced by a space."""
    marks = "\"\'\\/,.<>:;[]{}()-_"
    return "".join(" " if ch in marks else ch for ch in t)
def fhash(w,M) :
    """Return the base-37 polynomial hash of w, reduced modulo M."""
    return sum(ord(ch)*(37**k) for k,ch in enumerate(w)) % M

#--------------------------------
# Load the stop words, read the user's choices, scan the input file and print
# the four counts followed by the bag of words.
# NOTE(review): confirm the stop-word file really is named 'stopword.txt' (singular).
stopfile = open('stopword.txt','r')
stop = []
for line in stopfile :
    stop += line.split()
stopfile.close()
infile = open(input('File name = '),'r')
order = input('Use feature hashing ? (y,Y,n,N) ')
while order != 'y' and order != 'Y' and order != 'n' and order != 'N' :
    print("Try again.")
    order = input('Use feature hashing ? (y,Y,n,N) ')
if order == 'Y' or order == 'y' :
    M = int(input('M = '))
print('-------------------')
# words and sample receive the same lower-cased words; words is used for the
# word count, sample for the bag of words.
sample = []
words = []
char_count = 0
alphanum_count = 0
line_count = 0
for lines in infile :
    # char count: every character except the newline
    for ec in lines :
        if ec != '\n' :
            char_count += 1
    # After blank() has replaced the punctuation, everything that is neither a
    # space nor a newline is counted as alphanumeric.
    alphanum = blank(lines)
    for ea in alphanum :
        if ea != ' ' and ea != '\n':
            alphanum_count += 1
    words += alphanum.lower().split()
    sample += alphanum.lower().split()
    line_count += 1
print('char count = ' + str(char_count))
print('alphanumeric count = ' + str(alphanum_count))
print('line count = ' + str(line_count))
# samples: the words that are not stop words
samples = []
for e in sample :
    if not e in stop :
        samples.append(e)
infile.close()
# Plain bag of words: text[i] is the word counted in Bow[i].
text = []
Bow = []
for i in range(len(samples)) :
    if samples[i] in text :
        Bow[text.index(samples[i])][1] += 1
    else :
        text += [samples[i]]
        Bow += [[samples[i],1]]
Bow.sort()
word_count = len(words)
print('word count = ' + str(word_count))
if order == 'Y' or order == 'y' :
    # Hashed bag of words: contxt[i] is the hash value counted in bowl[i].
    bowl = []
    contxt = []
    for i in range(len(samples)) :
        bow = fhash(samples[i],M)
        if bow in contxt :
            bowl[contxt.index(bow)][1] += 1
        else :
            contxt += [bow]
            bowl += [[bow,1]]
    bowl.sort()
    print('BoW = ' + str(bowl))
else :
    print('BoW = ' + str(Bow))
# 6330517621 (25.00) 337 (2021-03-22 22:39)

#----------------------------------------------------------------
# The characters treated as part of a word: ASCII digits, then upper-case
# letters, then lower-case letters.
alnum = list('0123456789'
             'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
             'abcdefghijklmnopqrstuvwxyz')
def Re_space(s):
    """Split s at every single space character and return the pieces."""
    separator = ' '
    pieces = s.split(separator)
    return pieces
def lowandre(b):
    """Return the words of the text file named b, in order and in original case.

    Every character that is not in the module-level `alnum` list is treated
    as a separator. The file is opened here and closed before returning.
    """
    scon = []
    with open(b, 'r') as a :
        for line in a :
            # Blank out every non-alphanumeric character, then split on the blanks.
            cleaned = ''.join(ch if ch in alnum else ' ' for ch in line)
            scon.extend(cleaned.split())
    return scon
#-----------------------------------------------------------------
def lowresam(b):
    """Return the lower-cased words of file b that are not in the stop-word list stw."""
    kept = []
    for token in lowandre(b) :
        lowered = token.lower()
        if lowered not in stw :
            kept.append(lowered)
    return kept
#-----------------------------------------------------------------
def charcount(b):
    """Return the number of characters in file b, not counting the newline that ends a line."""
    total = 0
    fn = open(b, 'r')
    for row in fn :
        total += len(row) - 1 if row[len(row)-1] == '\n' else len(row)
    fn.close()
    return total

#-----------------------------------------------------------------
def alnumcount(b):
    """Return the total length of all words that lowandre() finds in file b."""
    return sum(len(word) for word in lowandre(b))
#-----------------------------------------------------------------
def linecount(b):
    """Return the number of lines in file b."""
    fn = open(b, 'r')
    total = sum(1 for _ in fn)
    fn.close()
    return total
#-----------------------------------------------------------------
def wordcount(b):
    """Return the number of words that lowandre() finds in file b."""
    return len(lowandre(b))

#-----------------------------------------------------------------
def BoWNn(b):
    """Return the bag of words of file b as a sorted list of [word, count] pairs.

    The words come from lowresam(b) (lower-cased, stop words removed). They
    are sorted and each run of equal words is counted once, so a word that
    occurs any number of times gets exactly one entry with its full count.
    """
    z = []
    for word in sorted(lowresam(b)) :
        if z and z[-1][0] == word :
            # Same word as the previous one: extend the current run.
            z[-1][1] += 1
        else :
            z.append([word, 1])
    return z

#------------------------------------------------------------------
def BoWYy(b) :
    """Return the hashed bag of words of file b as a sorted list of [hash, count] pairs.

    Each word from lowresam(b) is hashed with the base-37 polynomial hash and
    reduced modulo int(M), where M is the module-level value read from the
    user. Runs of equal hashes are counted directly, so the work does not
    depend on the size of M, and a text without words gives an empty list.
    """
    modulus = int(M)
    hashes = sorted(
        sum(ord(ch)*(37**pos) for pos, ch in enumerate(word)) % modulus
        for word in lowresam(b)
    )
    dz = []
    for value in hashes :
        if dz and dz[-1][0] == value :
            dz[-1][1] += 1
        else :
            dz.append([value, 1])
    return dz
        
    
#stwbuilding------------------------------------------------------

# Stop words: stwt keeps the words of each line of stopwords.txt, stw is the
# flat list of all of them. split() without an argument copes with empty
# lines, repeated spaces and a last line that ends in a space.
stwt = []
stw = []
# The name `file` stays bound to the (closed) file object after the block.
with open('stopwords.txt','r') as file:
    for line in file:
        line_words = line.split()
        stwt.append(line_words)
        stw.extend(line_words)


#----------------------------------------------------------------

# Read the user's choices, then print the four counts and the bag of words.
file_name = input('File name = ')
YN = input('Use feature hashing ? (y,Y,n,N) ')
while YN not in ['y','Y','n','N'] :
    print('Try again.')
    YN = input('Use feature hashing ? (y,Y,n,N) ')
hashing_wanted = YN == 'y' or YN == 'Y'
if hashing_wanted :
    # M stays a string; BoWYy converts it with int().
    M = input('M = ')
print('-------------------')
x = charcount(file_name)
print('char count = '+str(x))
x = alnumcount(file_name)
print('alphanumeric count = '+str(x))
x = linecount(file_name)
print('line count = '+str(x))
x = wordcount(file_name)
print('word count = '+str(x))
if hashing_wanted :
    x = BoWYy(file_name)
else :
    x = BoWNn(file_name)
print('BoW = '+ str(x))


# 6330518221 (30.00) 338 (2021-03-22 15:03)
def charcount(m):
    """Return the total length of the lines in m after stripping surrounding whitespace from each."""
    return sum(len(line.strip()) for line in m)

def alphacount(m):
    """Return how many characters in the lines of m are letters or digits."""
    return sum(1 for line in m for ch in line.strip() if ch.isalnum())

def create(m):
    """Return every whitespace-separated token of the lines in m, in order."""
    return [token for line in m for token in line.strip().split()]

def createall(m):
    """Return the alphanumeric words of the lines in m, in order.

    Every non-alphanumeric character acts as a word separator.
    """
    n = list()
    for line in m:
        for token in line.strip().split():
            # Blank out the punctuation inside the token, then split it.
            spaced = ''.join(ch if ch.isalnum() else ' ' for ch in token)
            n.extend(spaced.split())
    return n
    
def fhash(word, M):
    """Return the polynomial hash of word with base G = 37, reduced modulo M."""
    G = 37
    total = 0
    for power, ch in enumerate(word):
        total += ord(ch) * (G ** power)
    return total % M

def BOW (w,d,M):
    """Return [key, count] pairs for the words in w, in order of first appearance.

    w -- list of words; it is not modified
    d -- 'y' to count fhash(word, M) values instead of the words themselves
    M -- modulus passed to fhash(); ignored unless d == 'y'
    """
    if d == 'y':
        keys = [fhash(word, M) for word in w]
    else:
        keys = w
    # A dict keeps insertion order, so the pairs come out in first-seen order.
    counts = dict()
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return [[key, n] for key, n in counts.items()]


# Read the user's choices, load the stop words and the text (both lower-cased),
# then print the four counts and the sorted bag of words.
file_name = input('File name = ')
while True:
    useh = input('Use feature hashing ? (y,Y,n,N) ').lower()
    if useh in ['y','n']:
        break
    print('Try again.')
# M stays at -1 when hashing is not used.
M = int(input('M = ')) if useh == 'y' else -1
print('-'*19)
all_stop2 = open('stopwords.txt', 'r')
all_stop = [line.lower() for line in all_stop2]
lstop = create(all_stop)
datao = open(file_name, 'r')
data = [line.lower() for line in datao]
print('char count =',charcount(data))
print('alphanumeric count =',alphacount(data))
print('line count =',len(data))
word = createall(data)
print('word count =',len(word))
wordnostop = [w for w in word if w not in lstop]
print('BoW =',sorted(BOW(wordnostop,useh,M)))
all_stop2.close()
datao.close()
# 6330520421 (22.44) 339 (2021-03-22 22:28)
#Prog-08 : Bag-of-words
#6330520421 (22.44) Supakorn Na kalasin
def hashing(w,m) :
    """Return the base-37 polynomial hash of w modulo int(m)."""
    total = sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))
    return total%int(m)
    
# Read the user's choices, load the stop words, scan the input file and print
# the four counts followed by the bag of words. Both answers share one code
# path; only the keys that are counted differ (hash values or words).
file_name = open(input('File name = '),"r")
a = input("Use feature hashing ? (y,Y,n,N)")
while a not in ["y","Y","N","n"] :
    print("Try again.")
    a = input("Use feature hashing ? (y,Y,n,N)")
use_hash = a == "Y" or a == "y"
if use_hash :
    # M stays a string; hashing() converts it with int().
    M = input("M = ")

# ws: every whitespace-separated word of stopwords.txt
ws = []
with open('stopwords.txt','r') as stopwords:
    for line2 in stopwords:
        ws += line2.split()

# Characters that separate words.
punctuation = ['?','.','!','/',';',':',',','"',"'",'@','#','$','%','^','&','฿','*','(',')','_','-','+','=','|',"[","]",'<','>','*']
lc=0
cc=0
wc=0
anc=0
bb = []
with file_name:
    for line in file_name:
        lc += 1
        # Only an actual newline is left out, so a last line without one is
        # counted in full.
        cc += len(line.rstrip('\n'))
        for e in punctuation:
            line=line.replace(e,' ')
        b = line.split()
        bb += b
        wc += len(b)
        for i in b:
            anc += len(i)
print('char count =',cc)
print('alphanumeric count =',anc)
print('line count =',lc)
print('word count =',wc)
# wb: the lower-cased words that are not stop words
wb = []
for j in bb:
    if j.lower() not in ws:
        wb.append(j.lower())
if use_hash :
    keys = [hashing(i,M) for i in wb]
else :
    keys = wb
# Count every key once, then sort the [key, count] pairs.
entries = {}
bow = []
for key in keys:
    if key in entries:
        entries[key][1] += 1
    else:
        entries[key] = [key,1]
        bow.append(entries[key])
bow.sort()
print('BoW =',bow)


# 6330521021 (26.00) 340 (2021-03-22 12:35)
def gettext(file,t):
    """Join the lines in t into one lower-cased string and count their newlines.

    file -- not used; kept so existing callers keep working
    t    -- list of lines as returned by readlines()

    Returns (text, ncount). In text the last character of every line except
    the final one is replaced by a space; the final line is appended as it
    is. ncount is the number of lines that end with a newline. An empty
    list gives ('', 0).
    """
    if not t:
        return '',0
    ncount=sum(1 for line in t if line[-1:]=='\n')
    text=''.join(line[:-1]+' ' for line in t[:-1])+t[-1]
    return text.lower(),ncount
def replacepunc(text):
    """Return text with every non-alphanumeric character replaced by a space."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)
def stopw():
    """Return the lower-cased, whitespace-separated words of 'stopwords.txt'.

    The file is closed as soon as its lines have been read.
    """
    with open('stopwords.txt','r') as stopwords:
        s=stopwords.readlines()
    # gettext() only uses the list of lines; the file object is passed for
    # signature compatibility.
    text,nc=gettext(stopwords,s)
    return text.split()
def fhash(w,m):
    """Return the base-37 polynomial hash of w, reduced modulo m."""
    return sum(ord(ch)*(37**pos) for pos,ch in enumerate(w))%m
def bow(textlist):
    """Return the sorted [word, count] pairs for the non-stop words of textlist.

    Stop words are read with stopw(). When nothing is left after removing
    them the result is an empty list.
    """
    allstopw=stopw()
    bagofword=[]
    for word in sorted(w for w in textlist if w not in allstopw):
        if bagofword and bagofword[-1][0]==word:
            # Same word as the previous one: extend the current run.
            bagofword[-1][1]+=1
        else:
            bagofword.append([word,1])
    return bagofword
def bowm(textlist,m):
    """Return the sorted [hash, count] pairs for the non-stop words of textlist.

    Stop words are read with stopw(); every remaining word is hashed with
    fhash(word, m). The hash values are sorted as integers, so 2 comes
    before 10. When nothing is left after removing the stop words the result
    is an empty list.
    """
    allstopw=stopw()
    hashes=sorted(fhash(w,m) for w in textlist if w not in allstopw)
    bagofword=[]
    for value in hashes:
        if bagofword and bagofword[-1][0]==value:
            # Same hash as the previous one: extend the current run.
            bagofword[-1][1]+=1
        else:
            bagofword.append([value,1])
    return bagofword
# Read the user's choices, then print the four counts and the bag of words.
fn=input("File name = ")
# m stays 0 when hashing is not requested; the final test below relies on it.
m=0
while True:
    fh=input("Use feature hashing ? (y,Y,n,N) ")
    if fh=='y' or fh=='Y' or fh=='n' or fh=='N':
        if fh=='y' or fh=='Y':
            m=int(input("M = "))
        break
    else:
        print("Try again.")
# The file is closed as soon as its lines have been read.
with open(fn,'r') as file:
    t=file.readlines()
ccount=0
alnum=0
uniqueword=[]
text,ncount=gettext(file,t)
ccount=len(text)
# text still holds one character per newline, so ncount is subtracted.
print('char count =',ccount-ncount)
lcount=len(t)
for i in text:
    if i.isalnum():
        alnum+=1
print('alphanumeric count =',alnum)
print('line count =',lcount)
newtext=replacepunc(text)
textlist=newtext.split()
wcount=len(textlist)
print('word count =',wcount)
if m==0:
    print("BoW =",bow(textlist))
else:
    print("BoW =",bowm(textlist,m))
# 6330522721 (30.00) 341 (2021-03-22 22:58)

def main():
    """Print the four statistics: char, alphanumeric, line and word count."""
    for report in (char_count, alphanumeric_count, line_count, word_count):
        report()
def char_count():
    """Print the number of non-newline characters in the file named by `start`."""
    file = open(start)
    c = sum(1 for line in file for e in line if e != '\n')
    file.close()
    return print('char count =', c)
def alphanumeric_count() :
    """Print the number of letters and digits in the file named by `start`."""
    file = open(start)
    c = sum(1 for line in file for e in line if e.isalnum())
    file.close()
    return print('alphanumeric count =' ,c)
def line_count():
    """Print the number of lines in the file named by `start`.

    Iterating a file never yields an empty string, so every line is simply
    counted; no early-exit test is needed.
    """
    with open(start) as file:
        c = sum(1 for line in file)
    return print('line count =',c)
def remove_punc(p):
    """Return p with every non-alphanumeric character replaced by a space."""
    return ''.join(e if e.isalnum() else ' ' for e in p)
   
def word_count():
    """Print the number of alphanumeric words in the file named by `start`."""
    file = open(start)
    c = sum(len(remove_punc(line).split()) for line in file)
    file.close()
    return print('word count =',c)
#-------------------------
# Stop words: every whitespace-separated word of stopwords.txt. The file is
# closed as soon as it has been read.
stopwords = []
with open('stopwords.txt') as stop_file:
    for line in stop_file:
        stopwords.extend(line.split())
#---------------------------
def line_normalize(oldline):
    """Return the items of oldline that are not in the module-level stopwords list."""
    return [e for e in oldline if e not in stopwords]
    
def histogram_bin(data):
    """Return the distinct items of data in order of first appearance."""
    his_bin = []
    for item in data :
        if item in his_bin :
            continue
        his_bin.append(item)
    return his_bin

def count(data,element):
    """Return how many items of data are equal to element."""
    return sum(1 for e in data if e == element)
def fhash(word,m):
    """Return the base-37 polynomial hash of word, reduced modulo m."""
    return sum(ord(ch)*37**pos for pos,ch in enumerate(word)) % m

def change_data(data):
    """Reset the count of every [key, count] entry to 0 and return the distinct entries.

    data is modified in place. The returned list holds the first entry object
    for each distinct value, in order of first appearance.
    """
    for entry in data:
        entry[1] = 0
    new_data = []
    for entry in data:
        if entry in new_data :
            continue
        new_data.append(entry)
    return new_data
def count_fhash(data,element):
    """Add element[1] to the count of the entries in data whose key equals element[0].

    data is a list of [key, count] entries and is modified in place.
    """
    for entry in data :
        if element[0] != entry[0]:
            continue
        # index() returns the first entry equal to this one.
        target = data[data.index(entry)]
        target[1] += element[1]
#------------------------------------------------------------
# Read the user's choices, print the separator and then the four statistics.
yynn = ['y','Y','n','N']
start = input('File name = ').strip()
second = input('Use feature hashing ? (y,Y,n,N) ')
while second not in yynn :
    print('Try again.')
    second = input('Use feature hashing ? (y,Y,n,N) ')
if second in ['y','Y']:
    m = int(input('M = '))
print('-------------------')

main()
# -------------------------
# Build and print the bag of words for the file named by `start`.
file = open(start)
norm = []   # the words of the text without stop words
bow_1 = []  # [word, count] pairs
new_line = ''
# Lower-case every line and blank out its punctuation; the newline itself
# becomes a space, so consecutive lines stay separated.
for line in file:
    new_line += remove_punc(line.lower())
normalize_word =line_normalize(new_line.split())
file.close()
for e in normalize_word:
    norm.append(e)
# normal: the distinct words in order of first appearance
normal = histogram_bin(norm)
for e in normal :
    bow_1.append([e, count(norm,e) ])

if second in ['y','Y']:
    bow_fhash = []
    new_bow2 = []
    # Replace every word by its hash; different words may share a hash value.
    for [word,f] in bow_1 :
        bow_fhash.append([fhash(word,m) , f])
    # Work on copies so that bow_fhash keeps the original counts.
    bow_f2 = []
    for e in bow_fhash:      
        bow_f2.append(list(e))
    # One zeroed entry per distinct hash value ...
    zerotoreal_bow = change_data(bow_f2)
    # ... to which the count of every word with that hash is added.
    for k in bow_fhash:
        count_fhash(zerotoreal_bow ,k)
    
if second in ['n','N']:
    print('BoW =',sorted(bow_1) )
else:
    print('BoW =',sorted(zerotoreal_bow))           
            



# 6330523321 (24.75) 342 (2021-03-21 19:13)
# Read the user's choices and load the stop words.
filename = input('File name = ')
while True:
    feature = input('Use feature hashing ? (y,Y,n,N) ')
    if feature in ['y','Y','n','N']:
        break
    print('Try again.')
usehash = False
if feature in ['y','Y']:
    M = int(input('M = '))
    usehash = True
print('-------------------')
# stopwordslist: every whitespace-separated word of stopwords.txt
stopwordslist = []
stopwords_file = open('stopwords.txt', 'r')
for line in stopwords_file:
    stopwordslist.extend(line.strip().split())
stopwords_file.close()
def find_replace(t):
    """Return t with each of the listed punctuation characters replaced by a space."""
    marks = "\"\'/\\,.:;"
    return "".join(" " if c in marks else c for c in t)

# The input file is read once per statistic; every pass strips and lower-cases
# each line before using it.

# char count: total length of the stripped lines
charcount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    charcount += len(strip_line)
file.close()
print('char count =',charcount)

# alphanumeric count: characters for which isalnum() is true
alphanumericcount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    for i in strip_line:
        isalnum = i.isalnum()
        if isalnum == True:
            alphanumericcount +=1
file.close()
print('alphanumeric count =',alphanumericcount)

# line count
linecount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    linecount +=1
file.close()
print('line count =',linecount)

# word count: words left after find_replace() has blanked the punctuation
wordcount = 0
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    words = find_replace(strip_line)
    strip_words = words.strip()
    split_strip_words = strip_words.split()    
    wordcount += len(split_strip_words)
file.close()
print('word count =',wordcount)

# all_words_list: the same words, collected instead of counted
all_words_list =[]
file = open(filename, 'r')
for line in file:
    strip_line = line.strip().lower()
    words = find_replace(strip_line)
    strip_words = words.strip()
    split_strip_words = strip_words.split()
    all_words_list += split_strip_words 
file.close()

# all_words_withoutstopwords_list: the words that are not stop words
all_words_withoutstopwords_list = []
a = []
for i in all_words_list:
    if not i in stopwordslist:
        all_words_withoutstopwords_list.append(i)

# Bag of words for the plain (non-hashing) mode, filled further down.
BoW = []
def addwordToBoW(BoW,newword):
    """Record one occurrence of newword in BoW and return BoW.

    BoW is a list of [item, count] pairs and is modified in place: the count
    of the first pair whose item equals newword is incremented, or a new
    [newword, 1] pair is appended when there is none.
    """
    for entry in BoW:
        if entry[0] == newword:
            entry[1] += 1
            return BoW
    BoW.append([newword,1])
    return BoW
# Plain bag of words: count every remaining word, then print the pairs sorted.
if usehash == False:
    for plain_word in all_words_withoutstopwords_list:
        # addwordToBoW updates the list in place and hands the same list back.
        BoW = addwordToBoW(BoW,plain_word)
    print('BoW =',sorted(BoW))
def fhash(word,M):
    """Return the feature hash of word: the sum of ord(ch) * 37**position over
    its characters, reduced modulo M."""
    G = 37
    total = sum(ord(ch) * G ** position for position, ch in enumerate(word))
    return total % M

# Hashed bag of words: hash every remaining word, then count the hash values
# in ascending order so the printed pairs come out sorted.
if usehash == True:
    wordhash_list = [fhash(word,M) for word in all_words_withoutstopwords_list]
    BoWhash = []
    for hashed in sorted(wordhash_list):
        BoWhash = addwordToBoW(BoWhash,hashed)
    print('BoW =',BoWhash)
    

# 6330524021 (26.00) 343 (2021-03-22 16:30)

def flash(w,M):
    """Take one word (a string) and a modulus M; return the word's integer hash.

    The hash is the sum of ord(ch) * 37**position over the characters, modulo M.
    """
    total=0
    for position,ch in enumerate(w):
        total+=ord(ch)*37**position
    return total%M
def clear(messages):
    """Take a text string and return it lower-cased, with every
    non-alphanumeric character replaced by a space."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in messages.lower())
def BoW(m):
    """Take a list of cleaned items (words or hash values, stop words already
    removed) and return a list of [item, count] pairs.

    Pairs appear in the order each item is first seen in m. Counting uses a
    dict (insertion-ordered) so the work is linear in len(m) rather than the
    quadratic list search / index lookup used before.
    """
    counts={}
    for item in m:
        counts[item]=counts.get(item,0)+1
    return [[item,count] for item,count in counts.items()]
def cut_stop(w):
    """Take a list of words and return a new list without the stop words.

    Stop words are looked up in the module-level ``stopwords`` list.
    """
    kept=[]
    for candidate in w:
        if candidate not in stopwords:
            kept.append(candidate)
    return kept
#start
file_name=input('File name = ').strip()
# Ask until the (case-insensitive, trimmed) answer is 'y' or 'n'.
while True:
    feature=input('Use feature hashing ? (y,Y,n,N) ').lower().strip()
    if feature=='n':
        break
    if feature=='y':
        M=int(input('M = ').strip())
        break
    print('Try again.')
print('-------------------')
# Read both files up front; the context managers close them even on error.
with open(file_name,'r') as file:
    alllines=file.readlines()
with open('stopwords.txt','r') as file2:
    stoplines=file2.readlines()
# char count: every character except the newline terminators.
c=sum(len(i)-i.count('\n') for i in alllines)
print('char count = '+str(c))
# alphanumeric count: whatever clear() did not turn into a space.
cleaned=[clear(i) for i in alllines]
c1=sum(len(i)-i.count(' ') for i in cleaned)
print('alphanumeric count = '+str(c1))
print('line count = '+str(len(alllines)))
allw=[]
for i in cleaned:
    allw.extend(i.split())
allw.sort()
print('word count =',len(allw))
# cut_stop() reads this module-level list, so it must be filled before the call.
stopwords=[]
for i in stoplines:
    stopwords.extend(clear(i).split())
allw=cut_stop(allw)
if feature=='y':
    newallw=sorted(flash(i,M) for i in allw)
    print('BoW =',BoW(newallw))
if feature=='n':
    print('BoW =',BoW(allw))
# 6330525621 (19.00) 344 (2021-03-22 22:27)

# Open the input file straight away; the counting loop further down consumes it.
file_name = open(input("File Name = "),"r")
read = input("Use feature hashing ? (y,Y,n,N) ")
# Keep asking until the answer is y or n in either case.
while read.lower() not in ('y', 'n'):
    print("Try again.")
    read = input("Use feature hashing ? (y,Y,n,N) ")
if read.lower() == "y":
    M = int(input("M = "))
# NOTE(review): the file opened here is "stopword.txt" (singular) - confirm
# this is the intended stop-word file name.
stopword = open("stopword.txt","r")
print("-------------------")
def remove_punctuation(s):
    """Return s lower-cased, with every character outside 0-9 / a-z replaced
    by a single space."""
    keep = '0123456789abcdefghijklmnopqrstuvwxyz'
    pieces = []
    for ch in s:
        lowered = ch.lower()
        pieces.append(lowered if lowered in keep else ' ')
    return ''.join(pieces)
# One pass over the open input file: characters (newlines excluded),
# alphanumeric characters, lines, and the cleaned word list.
char = 0
line_count = 0
alpha = 0
word_file = []
for line in file_name:
    line_count += 1
    char += len(line) - line.count("\n")
    alpha += sum(1 for e in line.lower() if e.isalnum())
    word_file.extend(remove_punctuation(line).split())
# Every token produced by remove_punctuation().split() is a real word, so the
# word count is simply the number of tokens collected.
word_count = len(word_file)
def word_in_file(file):
    """Return all words found in file, an iterable of text lines.

    Each line is cleaned with remove_punctuation() and split on whitespace;
    the words are returned in reading order.
    """
    return [k for lin in file for k in remove_punctuation(lin).split()]
def BoW_no_fea(words,stop):
    """Return the sorted bag of words for words, ignoring stop words.

    words -- list of words; as before it is updated in place so that it ends
             up holding the sorted words with stop words removed.
    stop  -- iterable of stop-word lines, tokenised with word_in_file().

    Returns a list of [word, count] pairs in ascending word order; an empty
    list when no word is left.
    """
    stop = set(word_in_file(stop))
    # Filter into a fresh list: removing items from a list while iterating
    # over it skips the element after each removal, so runs of consecutive
    # stop words used to survive.
    words[:] = sorted(w for w in words if w not in stop)
    bow = []
    for w in words:
        if bow and bow[-1][0] == w:
            bow[-1][1] += 1
        else:
            bow.append([w,1])
    return bow
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) over the characters of w, modulo M."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(w)) % M
def BoW_w_fea(words,M,stop):
    """Return the sorted bag of feature hashes for words, ignoring stop words.

    words -- list of words; as before it is updated in place so that it ends
             up holding the sorted words with stop words removed.
    M     -- modulus passed to fhash().
    stop  -- iterable of stop-word lines, tokenised with word_in_file().

    Returns a list of [hash, count] pairs in ascending hash order; an empty
    list when no word is left.
    """
    stop = set(word_in_file(stop))
    # Filter into a fresh list: removing items while iterating skips the
    # element after each removal, so consecutive stop words used to survive.
    words[:] = sorted(k for k in words if k not in stop)
    fh = sorted(fhash(k,M) for k in words)
    bow = []
    for h in fh:
        if bow and bow[-1][0] == h:
            bow[-1][1] += 1
        else:
            bow.append([h,1])
    return bow
# Print the statistics, then the bag of words in the requested flavour.
# The handles opened at the top of the script are closed even if building
# the bag of words fails.
try:
    print("char count =",char)
    print("alphanumeric count =",alpha)
    print("line count =",line_count)
    print("word count =",word_count)
    if read.lower() == "n":
        print("BoW =",BoW_no_fea(word_file,stopword))
    if read.lower() == "y":
        print("BoW =",BoW_w_fea(word_file,M,stopword))
finally:
    stopword.close()
    file_name.close()
# 6330526221 (21.40) 345 (2021-03-21 23:57)
def fhash(w,M) :
    """Hash the word w: add up ord(character) * 37**index, then take the
    remainder modulo M."""
    acc = 0
    weight = 1          # 37**index, built up step by step
    for ch in w :
        acc += ord(ch) * weight
        weight *= 37
    return acc % M
def remove_punctuation(s):
  """Lower-case s and turn every character that is not 0-9 or a-z into a space."""
  allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
  return ''.join(c.lower() if c.lower() in allowed else ' ' for c in s)

# Read the input file; keep each line without its trailing newline.
with open(input('File name = '),'r') as File_name :
    lines = [line[:-1] if line.endswith('\n') else line for line in File_name]
snow = sum(len(line) for line in lines)    # char count, newlines excluded
plus = len(lines)                          # line count

# Join the lines with a space before tokenising, so the last word of one line
# is not glued to the first word of the next line.
s = remove_punctuation(' '.join(lines))
listt = s.split()
# remove_punctuation leaves only 0-9 / a-z inside words, so the alphanumeric
# count is the total length of all words.
alnum_count = sum(len(token) for token in listt)

with open('stopwords.txt','r') as stop_words :
    xxx = set(stop_words.read().split())

# Words that survive stop-word removal, in reading order.
J = [token for token in listt if token not in xxx]

# Plain bag of words: [word, count] pairs in ascending word order.
word_counts = {}
for token in J :
    word_counts[token] = word_counts.get(token,0) + 1
AAA = sorted([token,count] for token,count in word_counts.items())

# Ask until one of the four accepted answers is given; for y/Y also read M and
# build the hashed bag of words.
while True :
    x = input('Use feature hashing ? (y,Y,n,N) ')
    if x in ('y','Y') :
        a = int(input('M = '))
        hash_counts = {}
        for token in J :
            key = fhash(token,a)
            hash_counts[key] = hash_counts.get(key,0) + 1
        solution = sorted([key,count] for key,count in hash_counts.items())
        break
    elif x in ('n','N') :
        break
    else :
        print('Try again.')

print('-------------------')
print('char count = '+ str(snow))
print('alphanumeric count = ' + str(alnum_count))
print('line count = ' + str(plus))
print('word count = ' + str(len(listt)))
# NOTE(review): the label is kept exactly as written ('Bow = ' plus print's own
# separator) - confirm it against the expected output format.
if x in ('n','N') :
    print('Bow = ',AAA)
else :
    print('Bow = ',solution)
# 6330527921 (26.00) 346 (2021-03-22 20:23)
def fhash(w,M) :
    """Return the hash of w: characters weighted by powers of 37 (first
    character has weight 37**0), summed and reduced modulo M."""
    terms = [ord(ch) * (37 ** i) for i, ch in enumerate(w)]
    return sum(terms) % M

file_name = str(input('File name = '))
# Ask until one of y/Y/n/N is entered; M stays the string 'false' when feature
# hashing is not wanted.
while True :
    a = input('Use feature hashing ? (y,Y,n,N) ')
    if a == 'y' or a == 'Y' :
        M = int(input('M = '))
        break
    elif a == 'n' or a == 'N' :
        M = 'false'
        break
    else:
        print('Try again.')
with open('stopwords.txt','r') as stopwords :
    lstopwords = stopwords.read().split()
# The context manager closes the input file, which was left open before.
with open(file_name,'r') as readfile :
    llines = [line.strip() for line in readfile]
linecount = len(llines)
charcount = len(''.join(llines))
s = ' '.join(llines)
# Lower-case the alphanumeric characters and blank out everything else.
words = ''.join(ch.lower() if ch.isalnum() else ' ' for ch in s)
alnumcount = sum(1 for ch in s if ch.isalnum())
lwords = words.split()
wordcount = len(lwords)
print('-------------------')
print('char count = '+str(charcount))
print('alphanumeric count = '+str(alnumcount))
print('line count = '+str(linecount))
print('word count = '+str(wordcount))
stop_set = set(lstopwords)
lwords = sorted(word for word in lwords if word not in stop_set)
# Keys to count: the words themselves, or their hash values.
if M == 'false' :
    keys = lwords
else :
    keys = sorted(fhash(word,M) for word in lwords)
# Run-length count over the sorted keys. This also copes with zero or one
# remaining key, which used to fail on the [-2] index.
BoW = []
for key in keys :
    if BoW and BoW[-1][0] == key :
        BoW[-1][1] += 1
    else :
        BoW += [[key,1]]
print('BoW = ' + str(BoW))

# 6330528521 (20.50) 347 (2021-03-22 15:52)
file_name = input('File name = ')
use = input('Use feature hashing ? (y,Y,n,N)')
# Read the whole input file up front; the handle is closed right afterwards.
with open(file_name,'r') as read:
    lines = read.readlines()
# Compare against the four accepted answers one by one. A substring test on
# 'y,Y,n,N' would also accept '', ',' or 'y,Y'.
while use not in ['y','Y','n','N']:
    print('Try again.')
    use = input('Use feature hashing ? (y,Y,n,N)')
if use in ['y','Y'] :
    M = input('M = ')
with open('stopwords.txt','r') as stopword:
    stop_text = stopword.read()

separators = ',."\n'     # characters treated as word separators
count_line = len(lines)
# Characters excluding newline terminators, whether or not the last line has one.
count_char = sum(len(line) - line.count('\n') for line in lines)
alpha = sum(1 for line in lines for i in line
            if 'A'<=i<='Z' or 'a'<=i<='z' or '0'<=i<='9')
b = ''.join(' ' if i in separators else i.lower() for line in lines for i in line)
a = b.split(' ')
c = [i for i in a if i != '']
print('-------------------')
print('char count = '+str(count_char))
print('alphanumeric count = '+str(alpha))
print('line count = '+str(count_line))
print('word count = '+str(len(c)))

m = ''.join(' ' if i in separators else i for i in stop_text)
v = set(m.split(' '))
# Sort once, after filtering, instead of re-sorting after every append.
p = sorted(i for i in c if i not in v)

# Keys to count: the words themselves, or their feature hashes modulo M.
if use in ['y','Y']:
    modulus = int(M)
    keys = sorted(sum(ord(ch)*37**x for x, ch in enumerate(e)) % modulus for e in p)
else :
    keys = p

# Run-length count over the sorted keys. This handles any number of
# repetitions and keeps the last key, which the pairwise comparison did not.
rrr = []
for key in keys:
    if rrr and rrr[-1][0] == key:
        rrr[-1][1] += 1
    else :
        rrr.append([key,1])
print('BoW =',rrr)
# 6330529121 (30.00) 348 (2021-03-22 22:52)
M = 1
file_name = input('File name = ')

# Ask until the answer is y or n (either case); only 'y' reads a modulus M.
fhashenable = True
fhash = input('Use feature hashing ? (y,Y,n,N) ')
while fhash.lower() not in ('y', 'n') :
    print('Try again.')
    fhash = input('Use feature hashing ? (y,Y,n,N) ')
if fhash.lower() == 'y' :
    fhashenable = True
    M = int(input('M = '))
else :
    fhashenable = False


def filetolist(x) :
    """Return the lower-cased words of the text file at path x.

    Every non-alphanumeric character (newlines included) acts as a word
    separator. The file is opened in a context manager so the handle is
    released even if reading fails.
    """
    with open(x ,'r') as stop_words :
        sentence = stop_words.read().lower()
    sentencenobobo = ''.join(ch if ch.isalnum() else ' ' for ch in sentence)
    return(sentencenobobo.split())

# Read the input file once; all three counts come from the same list of lines.
with open(file_name , 'r') as filename1 :
    filename2 = filename1.readlines()

# char count: lower-cased text without the newline that ends each line.
charcounttttt = ''.join(ch[:-1].lower() if ch[-1] == '\n' else ch.lower()
                        for ch in filename2)
charcount = len(charcounttttt)

numberandalpha = sum(1 for ch in filename2 for e in ch if e.isalnum())

linecount = len(filename2)
def fhashing(w,M) :
    """Return the sum of ord(w[i]) * 37**i over all characters of w, modulo M."""
    total = 0
    for position, letter in enumerate(w) :
        total += ord(letter) * (37**position)
    return(total%M)

def bagofword(x,y) :
    """Build the bag of words for the file at path x.

    Words come from filetolist(x); words listed in stopwords.txt are dropped.
    When y is True each word is replaced by fhashing(word, M), with M read
    from the module level. Returns [item, count] pairs in ascending order.
    """
    candidates = filetolist(x)
    blocked = filetolist('stopwords.txt')
    items = [w for w in candidates if w not in blocked]
    if y == True :
        items = [fhashing(w,M) for w in items]
    items.sort()

    # Equal items are adjacent after sorting, so one run-length pass counts them.
    FINAL = []
    for item in items :
        if FINAL and FINAL[-1][0] == item :
            FINAL[-1][1] += 1
        else :
            FINAL.append([item,1])
    return(FINAL)
        
# Report the statistics, then the bag of words (hashed or plain).
print('-------------------')
print(f'char count = {charcount}')
print(f'alphanumeric count = {numberandalpha}')
print(f'line count = {linecount}')
total_words = len(filetolist(file_name))
print(f'word count = {total_words}')
bag = bagofword(file_name,fhashenable)
print(f'BoW = {bag}')
# 6330530721 (28.00) 349 (2021-03-20 22:18)
# Ask for the input file and open it for reading; the counting loop further
# down reads it and closes it.
fname = input('File name = ')


file_name = open(fname,'r')
#----------------------------------------------------------
def count_words(s):
    """Return the number of words in s, where a word is a maximal run of
    ASCII letters and digits."""
    allowed = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    cleaned = ''.join(ch if ch in allowed else ' ' for ch in s)
    return len(cleaned.split())

def sep_words(s):
    """Return the lower-cased words of s as a list.

    A word is a maximal run of ASCII letters and digits; every other character
    is a separator. The text is always lower-cased. Previously that only
    happened once a separator was met, so a string without any separator
    (e.g. a last line with no newline) kept its capitals.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    cleaned = ''.join(ch if ch in allowed else ' ' for ch in s.lower())
    return cleaned.split()

def fhash(w,M):
      """Return the hash of w: sum of ord(w[i]) * 37**i, taken modulo M."""
      weighted = (ord(ch)*(37**i) for i, ch in enumerate(w))
      return sum(weighted) % M
    


#-----------------------------------------------

# --- statistics over the input file (opened earlier as file_name) ---
line_count = 0
number_of_characters = 0
number_of_newlines = 0
number_of_alnum = 0
number_of_word = 0
sep_w = []
for line in file_name.readlines():
    line_count +=1
    number_of_characters += len(line)
    number_of_newlines += line.count('\n')
    number_of_alnum += sum(1 for c in line if c.isalnum())
    number_of_word += count_words(line)
    sep_w += sep_words(line)

file_name.close()

# --- stop words (the handle is now closed once they are read) ---
with open('stopwords.txt','r') as stopwords:
    sep_st = set()
    for li in stopwords:
        sep_st.update(sep_words(li))

# Remaining words in ascending order; counting them in that order yields the
# plain bag of words already sorted.
pre_bow = sorted(i for i in sep_w if i not in sep_st)
word_freq = {}
for word in pre_bow:
    word_freq[word] = word_freq.get(word,0) + 1
bow = [[word,freq] for word,freq in word_freq.items()]

# Characters without newline terminators. Counting the newlines themselves is
# right whether or not the last line ends with one.
number_of_characters_edit = number_of_characters-number_of_newlines

choice = input('Use feature hashing ? (y,Y,n,N) ')
while choice not in ['y','Y','n','N']:
    print('Try again.')
    choice = input('Use feature hashing ? (y,Y,n,N) ')

if choice in ['y','Y']:
    M = int(input('M = '))
    hash_freq = {}
    for word in pre_bow:
        key = fhash(word,M)
        hash_freq[key] = hash_freq.get(key,0) + 1
    result = sorted([key,freq] for key,freq in hash_freq.items())
else:
    result = bow

print('-------------------')
print('char count =', number_of_characters_edit)
print('alphanumeric count =', number_of_alnum)
print('line count =',line_count)
print('word count =',number_of_word)
print('BoW =',result)
        

# 6330531321 (22.00) 350 (2021-03-21 22:33)

# Strip the name once so that every later open() uses the same path; before,
# only the first open used the stripped name.
file_name2 = input("File name = ").strip()
G = 37                  # hashing base, read again further down the script
char_count = 0
linecount = 0
alphanumeric = 0
kite = []               # one cleaned string per line
with open(file_name2,"r") as file_name:
    for line in file_name:
        linecount += 1
        x = line.strip("\n")
        char_count += len(x)
        alphanumeric += sum(1 for e in x if e.isalnum())
        kite.append("".join(e if e.isalnum() else " " for e in x))
# Join the lines with a space so the last word of one line and the first word
# of the next are counted as two words, not glued into one.
wordcount = len(" ".join(kite).split())
#---------------------------------------------------------------------------#
def BoW(file_name):
    """Return the bag of words of the file at path file_name.

    The text is lower-cased and split on non-alphanumeric characters; words
    that appear in stopwords.txt (tokenised the same way, case kept) are
    dropped. The result is a list of [word, count] pairs in order of first
    appearance.

    Both files are read inside a context manager so they are closed even on
    error, and counting uses a dict instead of a nested loop.
    """
    with open("stopwords.txt","r") as stopword, open(str(file_name),"r") as source:
        text = source.read().lower()
        stop_text = stopword.read()
    beta = "".join(e if e.isalnum() else " " for e in text).split()
    delete1 = set("".join(e if e.isalnum() else " " for e in stop_text).split())

    counts = {}
    for e in beta:
        if e not in delete1:
            counts[e] = counts.get(e,0) + 1
    return [[e,total] for e,total in counts.items()]
#-------------------------------------------------------------#





# Ask until one of the four accepted answers is entered.
hashing = input("Use feature hashing ? (y,Y,n,N) ")
while hashing not in ("y","Y","n","N"):
    print("Try again.")
    hashing = input("Use feature hashing ? (y,Y,n,N) ")
use_hashing = hashing in ("y","Y")
if use_hashing:
    M = input("M = ")

print("-------------------")
print("char count =",char_count)
print("alphanumeric count =",alphanumeric)
print("line count =",linecount)
print("word count =",wordcount)

# BoW() already tokenises the file and removes the stop words, so the hashed
# variant only has to hash each distinct word and merge the counts of words
# that land on the same hash value.
word_counts = BoW(file_name2)
if use_hashing:
    modulus = int(M)
    merged = {}
    for e, count in word_counts:
        som = sum(ord(ch)*(37**i) for i, ch in enumerate(e)) % modulus
        merged[som] = merged.get(som,0) + count
    bowwie = [[som,summation] for som,summation in merged.items()]
else:
    bowwie = word_counts
# Both variants print the same "BoW =" label.
print("BoW =",sorted(bowwie))
        
        
        
        
        
        
        



    





 
    
# 6330532021 (30.00) 351 (2021-03-22 22:45)

def bow_n(x):
    """Return the sorted list of [item, count] pairs for the items of x."""
    distinct = []
    for item in x:
        if item not in distinct:
            distinct.append(item)
    pairs = [[item, x.count(item)] for item in distinct]
    return sorted(pairs)
def fhash(x):
    """Return the unreduced hash of x: sum of ord(x[i]) * 37**i."""
    return sum(ord(ch)*(37**i) for i, ch in enumerate(x))
def bow_y(x,y):
    """Return the sorted list of [hash, count] pairs for the words in x.

    Each word is hashed with fhash() and reduced modulo int(y).
    """
    modulus_hashes = [fhash(word)%int(y) for word in x]
    distinct = []
    for value in modulus_hashes:
        if value not in distinct:
            distinct.append(value)
    pairs = [[value, modulus_hashes.count(value)] for value in distinct]
    return sorted(pairs)

file_name = input('File name = ')
with open(file_name,'r') as r:
    line = r.readlines()
with open('stopwords.txt') as p:
    q = p.readlines()

# Ask until one of y/Y/n/N is given; M is only requested for y/Y.
solu = input('Use feature hashing ? (y,Y,n,N) ')
while solu not in ('y','Y','n','N'):
    print('Try again.')
    solu = input('Use feature hashing ? (y,Y,n,N) ')
if solu in ('y','Y'):
    m = input('M = ')
print('-------------------')

stripped = [i.strip() for i in line]
# Lower-cased text with one space after every line, used for all word work.
sen_ = ''.join(i.lower()+' ' for i in stripped)
print('char count = '+str(len(''.join(stripped))))
print('alphanumeric count = '+str(sum(1 for i in sen_ if i.isalnum())))
print('line count = '+str(len(line)))
y = ''.join(i if i.isalnum() else ' ' for i in sen_).split()
print('word count = '+str(len(y)))

# Stop words: stripped, lower-cased lines joined by single spaces, split again
# on single spaces.
q___ = ' '.join(i.strip().lower() for i in q).split(' ')
new_sen = [i for i in y if i.lower() not in q___]

if solu in ('y','Y'):
    ans_bow = bow_y(new_sen,m)
else:
    ans_bow = bow_n(new_sen)
print('BoW = '+str(ans_bow))
# 6330533621 (10.00) 352 (2021-03-20 14:45)

In=str(input('File name = '))
# Ask until one of y/Y/n/N is entered; only y/Y reads the modulus M.
hashing=input('Use feature hashing ? (y,Y,n,N) ')
while hashing not in ('y','Y','n','N'):
  print('Try again.')
  hashing=input('Use feature hashing ? (y,Y,n,N) ')
if hashing in ('y','Y'):
  M=int(input('M = '))
print('-------------------')
def char_count(sentence):
  """Return how many characters sentence contains."""
  return sum(1 for _ in sentence)
def al_count(sentence):
  """Return how many characters of sentence are alphanumeric."""
  return sum(1 for ch in sentence if ch.isalnum())
def line_count(path=None):
  """Return the number of lines in a text file.

  path -- file to count. When omitted, the file name entered by the user
          (module-level ``In``) is used instead of the previously hard-coded
          'sample.txt', so the count matches the file being analysed.
  """
  if path is None:
    path=In
  with open(path,'r') as file_name:
    return sum(1 for _ in file_name)
def word_count(a):
  """Return the number of words in a, where words are runs of alphanumeric
  characters and everything else separates them."""
  cleaned=''.join(ch if ch.isalnum() else ' ' for ch in a)
  return len(cleaned.split())
def fhash(word,M):
  """Return sum(ord(word[n]) * 37**n) over the characters of word, modulo M."""
  total=0
  for n,ch in enumerate(word):
    total+=ord(ch)*(37**n)
  return total%M
def new_words(sentence,stopwords):
  """Split sentence into words (runs of alphanumeric characters) and return
  those that are not in stopwords, in their original order."""
  cleaned=''.join(ch if ch.isalnum() else ' ' for ch in sentence)
  return [w for w in cleaned.split() if w not in stopwords]

# Read both files up front; the context managers close them even on error.
with open(In,'r') as file_name:
  sen=file_name.read()
with open('stopwords.txt','r') as file:
  stop=file.read()

# The character statistics ignore the newline characters.
sentence=sen.replace('\n','')

print('char count = '+str(char_count(sentence)))
print('alphanumeric count = '+str(al_count(sentence)))
print('line count = '+str(line_count()))
# Words are taken from the text that still has its newlines. With them deleted,
# the last word of a line would fuse with the first word of the next line.
print('word count = '+str(word_count(sen)))

stopwords=(stop.lower()).split()

# Keys to count: the remaining lower-cased words, or their hashes.
keys=new_words(sen.lower(),stopwords)
if hashing=='y' or hashing=='Y':
  keys=[fhash(e,M) for e in keys]
keys.sort()

# Equal keys are adjacent after sorting, so one pass counts each run.
BOW=[]
for key in keys:
  if BOW and BOW[-1][0]==key:
    BOW[-1][1]+=1
  else:
    BOW.append([key,1])
print('BoW = {}'.format(BOW))
# 6330534221 (19.40) 353 (2021-03-21 20:47)
# Load the stop words: every whitespace-separated token of stopwords.txt.
# The context manager closes the file, which was previously left open.
with open("stopwords.txt","r") as stop:
    stopw=[]
    for line in stop:
        stopw+=line.split()
#---------------------------------------------------
def bow(sen):
    """Return [word, count] pairs for the whitespace-separated words of sen
    that are not in the module-level stop-word list ``stopw``.

    Pairs are ordered by the first appearance of each word.
    """
    kept=[e for e in sen.split() if e not in stopw]
    seen=[]
    BoW=[]
    for ch in kept:
        if ch not in seen:
            seen.append(ch)
            BoW.append([ch,kept.count(ch)])
    return BoW
#---------------------------------------------------
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M); M may be an int or a
    numeric string."""
    total=sum(ord(e)*37**i for i,e in enumerate(w))
    return total%int(M)
#---------------------------------------------------
def sum_bow(BOW):
    """Merge [key, count] pairs that share a key by adding their counts.

    Returns a new list of [key, total] pairs ordered by the first appearance
    of each key in BOW.
    """
    BoW=[]
    for e in BOW:
        for merged in BoW:
            if merged[0]==e[0]:
                merged[1]+=e[1]
                break
        else:
            # first time this key shows up
            BoW.append([e[0],e[1]])
    return BoW
#---------------------------------------------------            
file_name=input("File name = ")
# Ask until one of the four accepted answers is given; y/Y also reads M.
while True:
    hashing=input("Use feature hashing ? (y,Y,n,N) ")
    if hashing in ["y","Y","n","N"]:
        break
    print("Try again.")
if hashing in ["y","Y"]:
    M=input("M = ")
#---------------------------------------------------
# Clean each stripped line: alphanumerics are lower-cased, everything else
# becomes a space.
cleaned=[]
lin=0
with open(file_name,"r") as file:
    for line in file:
        lin+=1
        cleaned.append("".join(e.lower() if e.isalnum() else " " for e in line.strip()))
charco=sum(len(part) for part in cleaned)
alco=sum(1 for part in cleaned for e in part if e.isalnum())
# Join the lines with a space so words at a line break stay separate; the
# character count above is taken before that separator is added.
sent=" ".join(cleaned)
wc=sent.split()
#---------------------------------------------------
print("-------------------")
print("char count =",charco)
print("alphanumeric count =",alco)
print("line count =",lin)
print("word count =",len(wc))
#---------------------------------------------------
BOW=bow(sent)
if hashing in ["y","Y"]:
    # Hash every distinct word, then merge words that share a hash value.
    for e in BOW:
        e[0]=fhash(e[0],M)
    BOW=sum_bow(BOW)
BOW.sort()

print("BoW =", BOW)
#---------------------------------------------------



# 6330535921 (24.67) 354 (2021-03-22 02:46)

def remove_punctuation(s):
  """Return s in lower case with each character other than 0-9 / a-z replaced
  by a space."""
  allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
  cleaned = []
  for ch in s:
    small = ch.lower()
    if small in allowed:
      cleaned.append(small)
    else:
      cleaned.append(' ')
  return ''.join(cleaned)
def count_alpha(w):
    """Return how many characters of w are digits or lower-case ASCII letters."""
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    return sum(1 for ch in w if ch in allowed)
def fhash(word,m):
    """Return sum(ord(word[i]) * 37**i) over the characters of word, modulo m."""
    weighted = [ord(ch)*37**i for i, ch in enumerate(word)]
    return sum(weighted)%m
def bow(lis,ans):
    """Append to lis one [item, count] pair per distinct item of the list ans
    and return lis.

    count is the number of occurrences of item in ans. A pair that is already
    in lis is not added again; pairs keep the order of first appearance.
    """
    for item in ans:
        pair = [item, ans.count(item)]
        if pair not in lis:
            lis.append(pair)
    return lis

# Read the input file: whitespace-separated words, line count, and characters
# of the stripped lines.
word = []
n_ = 0
n_char = 0
with open(input('File name = ')) as file:
    for line in file:
        stripped = line.strip()
        word.extend(stripped.split())
        n_ += 1
        n_char += len(stripped)
w = len(word)
word_str = remove_punctuation(' '.join(word))

# Ask until exactly one of y/Y/n/N is typed. Comparing against the individual
# answers avoids the substring test that accepted '' and 'yY' as "yes" and
# re-asked without a message on 'Yn'.
while True:
    x = input('Use feature hashing ? (y,Y,n,N) ')
    if x in ('y','Y'):
        x = 'y'
        break
    if x in ('n','N'):
        x = 'n'
        break
    print('Try again.')

# NOTE(review): the stop-word file is opened as 'stopword.txt' (singular) -
# confirm this is the intended file name.
with open('stopword.txt') as stop_f:
    sf = stop_f.read().split()

# Remaining words, ordered by length first and alphabetically second.
bow_ = [i for i in word_str.split(' ') if i != '' and i not in sf]
bow_ = sorted(bow_, key=lambda i: (len(i), i))
if x=='y':
    m = int(input('M = '))
    bow_ = sorted([fhash(i,m) for i in bow_])
ans = bow([],bow_)

print('-------------------')
print('char count = ',n_char)
print('alphanumeric count = ',count_alpha(word_str))
print('line count = ',n_)
print('word count = ',w)
print('BoW =',ans)
# 6330536521 (30.00) 355 (2021-03-22 20:13)
def stop(a) :
    """Return the alphanumeric words of the text file at path *a*.

    Every character for which ``ch.lower().isalnum()`` is false (punctuation,
    whitespace, newlines) acts as a separator; the original letter case is
    kept.  The file is read inside a ``with`` block so the handle is always
    closed (it used to be left open).
    """
    with open(a , "r") as src:
        text = src.read()
    return "".join(ch if ch.lower().isalnum() else " " for ch in text).split()
def stop2(a) :
    """Return the lower-cased words of file *a* that are not stop words.

    Words come from ``stop(a)``; the stop words are the whitespace-separated
    tokens of ``stopwords.txt``.  Order and duplicates of the kept words are
    preserved.
    """
    n=stop(a)
    # Context manager closes the stop-word file (it used to be leaked); a set
    # makes each membership test O(1).
    with open("stopwords.txt","r") as stop_file:
        excluded = set(stop_file.read().split())
    return [w.lower() for w in n if w.lower() not in excluded]
def fhash(a,b):
    """Return sum(ord(a[i]) * 37**i) % b for the string *a*."""
    acc = 0
    # Horner evaluation: process the highest power (last character) first.
    for ch in reversed(a):
        acc = acc * 37 + ord(ch)
    return acc % b
def line(a):
    """Print ``line count = N`` for the text file at path *a*; returns None."""
    # The context manager closes the handle, which used to be leaked.
    with open(a , "r") as src:
        n = sum(1 for _ in src)
    print("line count = "+str(n))
def char(a):
    """Print ``char count = N``: characters in file *a*, newlines excluded.

    The previous version started at 1 and dropped the last character of every
    line, which is only right when the final line has no trailing newline; it
    over-counted by one otherwise (and printed 1 for an empty file).  Only the
    newline itself is now excluded, and the file handle is closed.
    """
    with open(a , "r") as src:
        n = sum(len(row.rstrip("\n")) for row in src)
    print("char count = "+str(n))
def word(a) :
    """Print ``word count = N`` where N is the number of words ``stop(a)`` returns."""
    total = len(stop(a))
    print(f"word count = {total}")
    
def alpha(a) :
    """Print ``alphanumeric count = N``: the total length of the words from ``stop(a)``."""
    total = sum(len(token) for token in stop(a))
    print(f"alphanumeric count = {total}")
    

# --- main script: prompt, print the four statistics, then the bag of words ---
file_name=input("File name = ")
# Repeat the question until the answer is y/Y or n/N.
while True:
    al=input("Use feature hashing ? (y,Y,n,N) ")
    if al.lower()=="y" :
        m=int(input("M = "))
        print("""-------------------""")
        # Each helper re-reads the file and prints one statistic.
        char(file_name)
        alpha(file_name)
        line(file_name)
        word(file_name)
        n=stop2(file_name)
        # Hash every remaining word, then sort so equal hashes are adjacent.
        l=[]
        for e in n:
            l.append(fhash(e,m))
        l=sorted(l)
        bow=[]
        co=1
        # Run-length count over the sorted hashes: co is the size of the
        # current run and is flushed when the value changes or the list ends.
        for i in range(len(l)):
            if i==int(len(l))-1 :
                bow+=[[l[i],co]]
            elif l[i]==l[i+1]:
                co+=1
            else:
                bow+=[[l[i],co]]
                co=1
        print("Bow = "+str(bow) )
        break
    elif al.lower()=="n":
        print("""-------------------""")
        char(file_name)
        alpha(file_name)
        line(file_name)
        word(file_name)
        # Same run-length counting as above, applied to the sorted words.
        n=sorted(stop2(file_name))
        bow=[]
        co=1
        for i in range(len(n)):
            if i==int(len(n))-1 :
                bow+=[[n[i],co]]
            elif n[i]==n[i+1]:
                co+=1
            else:
                bow+=[[n[i],co]]
                co=1
        print("Bow = "+str(bow) )
        break
    else :
        print("Try again.")

# 6330537121 (18.58) 356 (2021-03-22 16:05)
def fhash(w,M):
    """Polynomial hash of *w*: sum of ord(char) * 37**position, modulo *M*."""
    return sum(ord(ch) * 37**pos for pos, ch in enumerate(w)) % M


 





# --- main script: statistics and bag of words for one text file ---
file_name = open(input('File name = '),'r')

# Ask until the answer is exactly y/Y/n/N; M is read only when hashing is on.
g = input('Use feature hashing ? (y,Y,n,N) ')
while g not in ['y','Y','n','N']:
    print('try again')
    g = input('Use feature hashing ? (y,Y,n,N)')
x = g.lower() == 'y'
if x:
    M = int(input('M ='))
print('-------------------')

# Characters treated as part of a word; everything else separates words.
ALNUM = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'

j= ''
line_count = 0
alphanumeric_count = 0
char_count = 0
word_count = 0

for i in file_name:
    line_count += 1
    # Characters on the line, not counting its newline.
    char_count += len(i)-1 if "\n" in i else len(i)
    # b is the line with every non-alphanumeric character blanked.  The
    # alphanumeric count now uses the same test; it used to count every
    # character missing from a short punctuation list (so '!' counted).
    b = ''
    for c in i:
        if c in ALNUM:
            alphanumeric_count += 1
            b += c
        else:
            b += ' '
    j += b
    word_count += len(b.split())
file_name.close()

# Stop words as a list of tokens.  The old test `x in A or B or C` was always
# true, so the raw file text was kept and words were checked with a substring
# test ("he" was dropped because it occurs inside "the").
with open('stopwords.txt','r') as fn:
    s = ''.join(c if c in ALNUM else ' ' for c in fn.read()).split()

# Lower-case the kept words so 'The' and 'the' share one BoW entry.
o = [i.lower() for i in j.split() if i.lower() not in s]

# Count equal neighbours in the sorted keys; both modes yield a sorted BoW.
keys = [fhash(i,M) for i in o] if x else o
BoW3 = []
for e in sorted(keys):
    if BoW3 and BoW3[-1][0] == e:
        BoW3[-1][1] += 1
    else:
        BoW3.append([e,1])

print('char count =',char_count)
print('alphanumeric count =',alphanumeric_count )
# The number of lines iterated is printed as-is (it used to be line_count-1).
print('line count =',line_count)
print('word count =',word_count)
print('BoW =',BoW3)

    
    
    

# 6330538821 (16.20) 357 (2021-03-20 16:06)
# Hard-coded stop words that bow() and flash_bow() leave out of the bag of
# words (tokens are compared in lower case).
stop = ['it', 'they', 'the', 'a', 'an', 'of', 'on',
        'in', 'at', 'is', 'am', 'are', 'was', 'were']
def remove_punctuation(w):
    """Lower-case *w* and replace each non-alphanumeric character by a space.

    "Alphanumeric" means ASCII digits and letters only.
    """
    alnum = '0123456789abcdefghijklmnopqrstuvwxyz'
    pieces = []
    for ch in w:
        lowered = ch.lower()
        pieces.append(lowered if lowered in alnum else ' ')
    return ''.join(pieces)
def character_count(s):
    """Return the number of characters in *s* that are not newlines."""
    return sum(1 for ch in s if ch != '\n')
def af_count(s):
    """Return how many characters of *s* are ASCII letters or digits (any case)."""
    alnum = '0123456789abcdefghijklmnopqrstuvwxyz'
    kept = ''
    for ch in s:
        if ch.lower() in alnum:
            kept += ch
    return len(kept)
def word_c(s):
    """Return the number of words in *s*.

    A word is a maximal run of ASCII letters/digits, the same tokenisation
    remove_punctuation() produces for the bag of words.  The earlier
    ``len(s.split(' '))`` counted the empty strings between consecutive
    spaces, returned 1 for an empty or blank line, and never split on
    punctuation.
    """
    alnum = '0123456789abcdefghijklmnopqrstuvwxyz'
    cleaned = ''.join(ch if ch.lower() in alnum else ' ' for ch in s)
    return len(cleaned.split())
def bow(s):
    """Return the sorted bag of words of *s* as ``[word, count]`` pairs.

    The text is normalised with remove_punctuation(); empty tokens and words
    listed in ``stop`` are skipped.
    """
    counts = {}
    for token in remove_punctuation(s).split(" "):
        if token == "" or token.lower() in stop:
            continue
        counts[token] = counts.get(token, 0) + 1
    return sorted([token, total] for token, total in counts.items())
def flash(w,m):
    """Feature hash of *w*: sum(ord(w[i]) * 37**i) modulo *m*."""
    g = 37
    total = sum(ord(ch) * g**pos for pos, ch in enumerate(w))
    return int(total) % m
def flash_bow(s,m):
    """Return the sorted hashed bag of words of *s* as ``[hash, count]`` pairs.

    Tokens come from remove_punctuation(); empty tokens and words in ``stop``
    are skipped and every remaining word is hashed once with ``flash(word, m)``.
    The previous filter ``i != '' or '\\n' or " "`` was always true, and the
    hash was recomputed up to four times per word.
    """
    counts = {}
    for token in remove_punctuation(s).split(' '):
        if token == '' or token in stop:
            continue
        hashed = flash(token, m)
        counts[hashed] = counts.get(hashed, 0) + 1
    return sorted([hashed, total] for hashed, total in counts.items())

# --- main script ---
file_name = input('File name = ')
feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')
# Tuple membership: `in 'nyNY'` is a substring test that let '' and 'ny'
# through, after which neither branch ran and nothing was printed.
while feature_hashing not in ('n', 'y', 'N', 'Y'):
        print('Try again.')
        feature_hashing = input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = feature_hashing.lower() == 'y'

# One pass over the file collects every statistic plus the full text (z).
with open(file_name, 'r') as file:
    if use_hashing:
        m = int(input('M = '))
    c = 0   # characters, newlines excluded
    a = 0   # alphanumeric characters
    l = 0   # lines
    w = 0   # words
    z = ''
    for line in file:
        c += character_count(line)
        l += 1
        a += af_count(line)
        w += word_c(line)
        z += line
print('-------------------')
print('char count =', c)
print('alphanumeric count =', a)
print('line count =', l)
print('word count =', w)
print('BoW =', flash_bow(z,m) if use_hashing else bow(z))
# 6330539421 (26.00) 358 (2021-03-22 22:35)
#function
#----------------------------------------
def wordddd(line,i):
    """Tally the words of *line*, scanning from index *i*, into module globals.

    Each maximal alphanumeric run is lower-cased; the global ``word`` is
    incremented for every run, and runs not in ``stopwords`` are counted in
    ``Bow`` (keyed by ``hashing(token, M)`` when ``feature`` is true, by the
    token itself otherwise).
    """
    global word
    # Padding guarantees a trailing separator so the last word is flushed.
    padded = '***' + line + '***'
    current = ''
    for pos in range(i, len(padded)):
        ch = padded[pos]
        if ch.isalnum():
            current += ch
            continue
        if current == '':
            continue
        token = current.lower()
        if token not in stopwords:
            key = hashing(token,M) if feature else token
            seen = False
            for entry in Bow:
                matches = (key == entry[0]) if feature else (token in entry)
                if matches:
                    seen = True
                    entry[1] += 1
            if not seen:
                Bow.append([key,1])
        current = ''
        word += 1
def countword(line):
    """Find the first alphanumeric character of *line* and pass that index to wordddd()."""
    start = 0
    for ch in line:
        if ch.isalnum():
            break
        start += 1
    wordddd(line,start)
def hashing(wordddddd,M):
    """Return sum(ord(c_i) * 37**i) % M over the characters of the word."""
    G = 37
    total = 0
    for position, ch in enumerate(wordddddd):
        total += ord(ch) * G**position
    return total % M
#----------------------------------------------
# Path of the text file to analyse, as typed by the user (surrounding whitespace removed).
file_name = input('File name = ').strip()
def choicee():
    """Ask whether to use feature hashing and record the answer in globals.

    Sets ``choice`` (the stripped answer), ``feature`` (True for y/Y, False
    for n/N) and, when hashing is requested, ``M`` (int read from a second
    prompt).  Any other answer prints 'Try again.' and asks again.

    The old test ``choice in 'YyNn'`` was a substring test, so an empty answer
    (or 'Yy') was accepted and treated as yes.  Retries now loop instead of
    recursing.
    """
    global feature
    global choice
    global M
    while True:
        choice = input('Use feature hashing ? (y,Y,n,N) ').strip()
        if choice in ('Y', 'y', 'N', 'n'):
            break
        print('Try again.')
    feature = choice in ('Y', 'y')
    if feature:
        M = int(input('M = ').strip())
choicee()

# Load the distinct stop words (whitespace-separated tokens of stopwords.txt).
stopwords = []
with open('stopwords.txt','r') as stop:
    for line in stop:
        for i in line.strip().split():
            if i not in stopwords:
                stopwords.append(i)

count = 0    # characters, newlines excluded
alnum = 0    # alphanumeric characters
word = 0     # words; incremented by wordddd()
lenght = 0   # lines
Bow = []     # [key, count] pairs; filled by wordddd()
# The input file is now closed when the block ends (it was left open).
with open(file_name,'r') as file:
    for line in file:
        lenght += 1
        for i in line:
            if i != '\n':
                count += 1
            if i.isalnum():
                alnum += 1
        countword(line.strip())
print('-------------------')
print('char count =',count)
# Label fixed: it was misspelled 'alphanumer count ='.
print('alphanumeric count =',alnum)
print('line count =',lenght)
print('word count =',word)
Bow.sort()
print('BoW =',Bow)
# 6330540021 (23.90) 359 (2021-03-21 21:03)
def fhash(w, M, G=37):
    """Return sum(ord(w[i]) * G**i) % M.

    *G* defaults to 37.  It used to be read from a module-level ``G`` that is
    only assigned later in the file, so the function failed if it was called
    before that assignment or used on its own.
    """
    return sum(ord(ch) * (G ** i) for i, ch in enumerate(w)) % M
def addToBoW(word, BoW, M):
    """Count one occurrence of *word* in *BoW*, a list of ``[key, count]`` pairs.

    With ``M == -1`` the key is the word itself; otherwise it is
    ``fhash(word, M)``.  An existing pair is incremented in place, else a new
    ``[key, 1]`` pair is appended.  Returns None.
    """
    key = word if M == -1 else fhash(word, M)
    for entry in BoW:
        if key == entry[0]:
            entry[1] += 1
            return
    BoW.append([key, 1])

# --- main script ---
print("File name = ", end="")
file_name = input()
hashing_enable = ""
stopword_list = []
stopword_file_name = "stopwords.txt"
G = 37
M = -1   # -1 tells addToBoW() to count plain words instead of hashes

# Repeat until the answer is y/Y or n/N.
while hashing_enable == "":
    print("Use feature hashing ? (y,Y,n,N) ", end="")
    u_input = input().lower()
    if u_input == "y":
        hashing_enable = True
    elif  u_input == "n":
        hashing_enable = False
    else:
        print("Try again.")

if hashing_enable:
    print("M = ", end="")
    M = int(input())

# Every whitespace-separated token is a stop word.  The previous
# `len(words) > 1` guard silently dropped lines holding a single stop word.
with open(stopword_file_name, "r") as stopword_file:
    for line in stopword_file:
        stopword_list.extend(line.split())

# Single pass over the input (it used to be reopened once per statistic).
character_count = 0
alphanumeric_count = 0
line_count = 0
word_count = 0
BoW = []
with open(file_name, "r") as input_file:
    for line in input_file:
        line_count += 1
        character_count += len(line) - line.count("\n")
        alphanumeric_count += sum(1 for c in line if c.isalnum())
        # Lower-case the line and blank non-alphanumerics, then split into words.
        no_special = "".join(c.lower() if c.isalnum() else " " for c in line).split()
        word_count += len(no_special)
        for word in no_special:
            if word not in stopword_list:
                addToBoW(word, BoW, M)

print("-------------------")
print("char count =", character_count)
print("alphanumeric count =", alphanumeric_count)
print("line count =", line_count)
print("word count =", word_count)
BoW.sort()
print("BoW =", BoW)
# 6330541621 (11.83) 360 (2021-03-22 20:56)

def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) modulo int(M); *M* may be an int or a numeric string."""
    G = 37
    total = 0
    power = 1
    for ch in w:
        total += ord(ch) * power
        power *= G
    return total % int(M)

# --- main script ---
fn = input('File name = ')
ufh = input('Use feature hashing ? (y,Y,n,N) ')
# Tuple membership: `ufh not in 'yYnN'` was a substring test that accepted ''
# and strings such as 'yY', after which no result was printed.
while ufh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = ufh in ('y', 'Y')
if use_hashing:
    M = input('M = ')   # kept as text; fhash() converts it with int()

print('-------------------')
# Stop words: every whitespace-separated token of stopwords.txt.
with open('stopwords.txt','r') as file2:
    file24 = file2.read().split()

# Lines without their newline.  rstrip('\n') replaces `[:-1]`, which chopped a
# real character off a last line that has no trailing newline.
with open(fn,'r') as file1:
    cutn1 = [row.rstrip('\n') for row in file1]
charcount = sum(len(row) for row in cutn1)
alphanumeric = sum(
    1 for row in cutn1 for ch in row
    if ch in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890')
linecount = len(cutn1)

# NOTE(review): only the symbols listed here separate words; other
# punctuation (e.g. '!' or '?') stays inside the token - confirm intended.
symbol = [ '(', ')', '-', '_', '[', ']' ,'"' ,"'" ,';', ':', '>', '<','.',',' ]
cutn4 = []
for row in cutn1:
    for mark in symbol:
        row = row.replace(mark, ' ')
    cutn4.extend(row.split())
wordcount = len(cutn4)

# Lower-cased words that are not stop words, sorted.
cutn8 = sorted(tok.lower() for tok in cutn4 if tok.lower() not in file24)
keys = sorted(fhash(tok, M) for tok in cutn8) if use_hashing else cutn8

# Count equal neighbours in the sorted keys.
bow = []
for key in keys:
    if bow and bow[-1][0] == key:
        bow[-1][1] += 1
    else:
        bow.append([key, 1])

print('char count =',charcount)
print('alphanumeric count =',alphanumeric)
print('line count =',linecount)
print('word count =',wordcount)
print('BoW =',bow)
# 6330542221 (30.00) 361 (2021-03-22 21:58)
def sam(file_name):
    """Return the lines of *file_name*, each stripped of surrounding whitespace."""
    with open(file_name,'r') as filesim:
        return [row.strip() for row in filesim]
def sentencechar(file_name):
    """Return all stripped lines of *file_name* concatenated without separators."""
    stripped_lines = sam(file_name)
    joined = ''.join(stripped_lines)
    return joined
def sentencealpha(file_name):
    """Return the file text lower-cased with non-alphanumerics replaced by spaces.

    Each stripped line is prefixed with one space before joining, so words
    from different lines stay separated.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    text = ''.join(' ' + row.lower() for row in sam(file_name))
    return ''.join(ch if ch in allowed else ' ' for ch in text)
def alphanumeric_count(file_name):
    """Return the number of non-space characters in ``sentencealpha(file_name)``."""
    cleaned = sentencealpha(file_name)
    return len(cleaned) - cleaned.count(' ')
def BoW(file_name):
    """Return the sorted bag of words of *file_name* as ``[word, count]`` pairs.

    Words come from ``sentencealpha(file_name)``; the lower-cased tokens of
    ``stopwords.txt`` are excluded.
    """
    sample = sorted(sentencealpha(file_name).split())
    with open('stopwords.txt','r') as filestp:
        stop = filestp.read().lower().split()
    kept = [token for token in sample if token not in stop]
    # kept is sorted, so equal words are adjacent: count each run.
    pairs = []
    for token in kept:
        if pairs and pairs[-1][0] == token:
            pairs[-1][1] += 1
        else:
            pairs.append([token, 1])
    return pairs
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) % M."""
    G = 37
    return sum(ord(ch) * (G**pos) for pos, ch in enumerate(w)) % M
def feature_hashing(file_name):
    """Return the sorted hashed bag of words of *file_name* as ``[hash, count]`` pairs.

    Words come from ``sentencealpha(file_name)``; the lower-cased tokens of
    ``stopwords.txt`` are excluded and each remaining word is hashed with
    ``fhash(word, M)``, where ``M`` is the module-level modulus set by the
    script.  The hash list is sorted once; it used to be re-sorted after every
    append.
    """
    sample = sentencealpha(file_name).split()
    with open('stopwords.txt','r') as filestp:
        stop = set(filestp.read().lower().split())
    hashes = sorted(fhash(token, M) for token in sample if token not in stop)
    # Equal hashes are adjacent after sorting: count each run.
    FH = []
    for value in hashes:
        if FH and FH[-1][0] == value:
            FH[-1][1] += 1
        else:
            FH.append([value, 1])
    return FH
    
# --- main script ---
file_name = input('File name = ')
# Loop until a valid answer fixes the mode; M is requested only for hashing.
YorN = None
while YorN is None:
    fh = input('Use feature hashing ? (y,Y,n,N) ')
    answer = fh.lower()
    if answer == 'y':
        M = int(input('M = '))
        YorN = 'yes'
    elif answer == 'n':
        YorN = 'no'
    else :
        print('Try again.')
print('-------------------')
print('char count =',len(sentencechar(file_name)))
print('alphanumeric count =',alphanumeric_count(file_name))
print('line count =',len(sam(file_name)))
print('word count =',len(sentencealpha(file_name).split()))
if YorN == 'yes':
    print('BoW =', feature_hashing(file_name))
else:
    print('BoW =',BoW(file_name))
# 6330543921 (18.90) 362 (2021-03-21 14:05)

def read_file(file_name):
    """Return the UTF-8 text file's lines with surrounding whitespace stripped."""
    with open(file_name, encoding='utf-8') as file:
        return [row.strip() for row in file]
def read_stopwords():
    """Return the stop words from ``stopwords.txt``, each passed through alpha().

    Lines are stripped, joined with single spaces and split on single spaces.
    """
    with open('stopwords.txt', encoding='utf-8') as file:
        stripped = [row.strip() for row in file]
    return [alpha(token) for token in ' '.join(stripped).split(' ')]
def alpha(s):
    """Return *s* with every character that is not an ASCII letter/digit removed.

    Letter case is preserved.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    kept = ''
    for ch in s:
        if ch.lower() in allowed:
            kept += ch
    return kept
def alpha_joinlist(j):
    """Replace every item of list *j* by ``alpha(item)`` in place and return *j*."""
    for position, item in enumerate(j):
        j[position] = alpha(item)
    return j
def fhash(ws, m):
    """Return sum(ord(ws[i]) * 37**i) modulo int(m); *m* may be an int or a numeric string."""
    total = sum(ord(ch) * (37**pos) for pos, ch in enumerate(ws))
    return total % int(m)
def remove_stopwords(joinlists, stop_words):
    """Return the items of *joinlists* that are not in *stop_words*, sorted."""
    return sorted(item for item in joinlists if item not in stop_words)
def bow_word(join_list):
    """Return the sorted bag of words of *join_list* as ``[word, count]`` pairs.

    Tokens are cleaned with alpha_joinlist() (which modifies *join_list* in
    place) and stop words from read_stopwords() are removed.

    Fixes: the old ``words[i] != words[i-1]`` compared the first word with the
    last one at ``i == 0``, so a list whose words were all equal produced an
    empty result.  Tokens left empty by the cleaning step are also skipped
    instead of being counted as a word.
    """
    words = remove_stopwords(alpha_joinlist(join_list), read_stopwords())
    bow = []
    # words is sorted, so equal words are adjacent: count each run.
    for word in words:
        if not word:
            continue
        if bow and bow[-1][0] == word:
            bow[-1][1] += 1
        else:
            bow.append([word, 1])
    return bow
def bow_fhash(join_list, m):
    """Return the sorted hashed bag of words as ``[hash, count]`` pairs.

    Tokens are cleaned with alpha_joinlist() (in place), stop words are
    removed, and each remaining non-empty word is hashed with ``fhash(word, m)``.

    Fixes: the old ``tbow[i] != tbow[i-1]`` wrapped around to the last element
    at ``i == 0``, losing the whole result when every hash was equal; empty
    tokens are no longer hashed and counted.
    """
    removes = remove_stopwords(alpha_joinlist(join_list), read_stopwords())
    tbow = sorted(fhash(remove, m) for remove in removes if remove)
    bow = []
    for value in tbow:
        if bow and bow[-1][0] == value:
            bow[-1][1] += 1
        else:
            bow.append([value, 1])
    return bow
def output(file_name, hash, m):
    """Print the four statistics of *file_name* and return its bag of words.

    *hash* is the user's y/Y/n/N answer; *m* is the hash modulus (only used
    when hashing).  Words are now obtained with ``split()``: ``split(' ')``
    produced empty strings for blank lines and repeated spaces, and those were
    included in the word count.
    """
    listfromfile = read_file(file_name)
    joinlist = ''.join(listfromfile)
    join_list = ' '.join(listfromfile).split()
    ap = alpha(joinlist)
    print('-------------------')
    print('char count =', len(joinlist))
    print('alphanumeric count =', len(ap))
    print('line count =', len(listfromfile))
    print('word count =', len(join_list))
    if hash.lower() == 'n':
        bow = bow_word(join_list)
    else:
        bow = bow_fhash(join_list, m)
    return bow
def name():
    """Run the interactive program: prompt for the file and mode, then print the BoW."""
    file_name = input('File name = ')
    while True:
        choice = input('Use feature hashing ? (y,Y,n,N) ')
        answer = choice.lower()
        if answer not in ('y', 'n'):
            print('Try again.')
            continue
        # The modulus stays an empty string when hashing is not requested.
        m = input('M = ') if answer == 'y' else ''
        bow = output(file_name, choice, m)
        print('BoW =', bow)
        return

name()
# 6330544521 (22.90) 363 (2021-03-21 10:58)

def fhash(ws, m):
    """Return sum(ord(ws[i]) * 37**i) modulo int(m)."""
    acc = 0
    # Horner's rule, starting from the highest power (last character).
    for ch in reversed(ws):
        acc = acc * 37 + ord(ch)
    return acc%int(m)
def alpha(s):
    """Return the ASCII letters and digits of *s*, lower-cased; other characters are dropped."""
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    return ''.join(ch.lower() for ch in s if ch.lower() in allowed)
def ajoinlist(jlist):
    """Apply alpha() to every element of *jlist* in place; return the same list."""
    for index, token in enumerate(jlist):
        jlist[index] = alpha(token)
    return jlist
def read_file(file_name):
    """Read *file_name* as UTF-8 and return its lines, stripped of surrounding whitespace."""
    with open(file_name, encoding='utf-8') as file:
        raw_lines = file.readlines()
    return [raw.strip() for raw in raw_lines]
def read_stopwords():
    """Return the tokens of ``stopwords.txt`` after cleaning each one with alpha().

    Stripped lines are joined with single spaces and split on single spaces.
    """
    with open('stopwords.txt', encoding='utf-8') as file:
        joined = ' '.join(raw.strip() for raw in file.readlines())
    return [alpha(token) for token in joined.split(' ')]
def remove_stopwords(joinlists, stop_words):
    """Return a new sorted list of the entries of *joinlists* not found in *stop_words*."""
    kept = [entry for entry in joinlists if entry not in stop_words]
    kept.sort()
    return kept
def bow_word(join_list):
    """Return ``[word, count]`` pairs, sorted by word, for the tokens of *join_list*.

    Tokens are normalised in place by ajoinlist() and stop words from
    read_stopwords() are removed.

    Fixes: at index 0 the old ``rmv[i] != rmv[i-1]`` compared against the last
    element, so an input whose remaining words were all identical yielded an
    empty result.  Tokens that are empty after normalisation are skipped.
    """
    rmv = remove_stopwords(ajoinlist(join_list), read_stopwords())
    bow = []
    previous = None
    for word in rmv:
        if not word:
            continue
        if word == previous:
            bow[-1][1] += 1
        else:
            bow.append([word, 1])
            previous = word
    return bow
def bow_fhash(join_list, m):
    """Return ``[hash, count]`` pairs, sorted by hash, for the tokens of *join_list*.

    Tokens are normalised in place by ajoinlist(), stop words are removed and
    each remaining non-empty word is hashed with ``fhash(word, m)``.

    Fixes: at index 0 the old ``bw[i] != bw[i-1]`` compared against the last
    element, so the result was empty whenever all hashes were equal; empty
    tokens are no longer hashed.
    """
    rmvs = remove_stopwords(ajoinlist(join_list), read_stopwords())
    bw = sorted(fhash(rmv, m) for rmv in rmvs if rmv)
    bow = []
    for value in bw:
        if bow and bow[-1][0] == value:
            bow[-1][1] += 1
        else:
            bow.append([value, 1])
    return bow
def inputfirst():
    """Run the interactive program: prompts, the four statistics, then the BoW.

    Words are now obtained with ``split()``: ``split(' ')`` produced empty
    strings for blank lines and repeated spaces, and those were included in
    the word count.
    """
    file_name = input('File name = ')
    # Ask until the answer is y/Y or n/N.
    while True:
        hash = input('Use feature hashing ? (y,Y,n,N) ')
        if hash.lower() in ('y', 'n'):
            break
        print('Try again.')
    use_hashing = hash.lower() == 'y'

    listfromfile = read_file(file_name)
    joinlist = ''.join(listfromfile)
    join_list = ' '.join(listfromfile).split()
    a = alpha(joinlist)
    if use_hashing:
        m = input('M = ')
    print('-------------------')
    print('char count =', len(joinlist))
    print('alphanumeric count =', len(a))
    print('line count =', len(listfromfile))
    print('word count =', len(join_list))
    if use_hashing:
        bow = bow_fhash(join_list, m)
    else:
        bow = bow_word(join_list)
    print('BoW =', bow)

inputfirst()
# 6330545121 (24.00) 364 (2021-03-22 02:49)

def initial(n):
    """Load file *n* and the stop words, ask for the mode, then call main().

    Fixes: an invalid answer used to call ``initial()`` without its argument,
    raising TypeError instead of asking again; the prompt now loops.
    OprLine() is evaluated once instead of twice and both files are closed via
    context managers.
    """
    with open(n,'r') as file:
        lines = file.readlines()
    numLine = len(lines)

    Word, nonAlnum = OprLine(lines)

    # Stop words: every whitespace-separated token of stopwords.txt.
    with open('stopwords.txt','r') as filestop:
        WordStop = filestop.read().split()

    while True:
        print('Use feature hashing ? (y,Y,n,N)',end = '')
        a = input()
        if a in ['y','Y']:
            Opr = 'y'
            break
        if a in ['n','N']:
            Opr = 'n'
            break
        print('Try again.')
    main(numLine,Word,Opr,nonAlnum,WordStop)
        
        
def OprLine(lines):
    """Join *lines* and blank out every non-alphanumeric character.

    Each line loses its leading/trailing newlines and is prefixed with one
    space before joining.  Returns ``[text, count]`` where *text* is the
    joined string with non-alphanumerics replaced by spaces and *count* is the
    number of non-alphanumeric characters in the joined string (the added
    spaces included).
    """
    joined = ''.join(' ' + row.strip('\n') for row in lines)
    cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in joined)
    non_alnum = sum(1 for ch in joined if not ch.isalnum())
    return [cleaned, non_alnum]
    
    
def fhash(w,M):
    """Return sum(ord(lower(w[i])) * 37**i) modulo int(M).

    Each character is lower-cased before hashing; *M* may be an int or a
    numeric string.
    """
    G = 37
    total = sum(ord(ch.lower()) * (G**pos) for pos, ch in enumerate(w))
    return total % int(M)
def countBoW(Data,WordStop,Opr):
    """Return the bag of words of *Data* as a sorted list of ``[key, count]`` pairs.

    Stop words (*WordStop*) are removed from *Data* in place, as before.  With
    ``Opr == 'y'`` the modulus M is read from the user and the keys are
    ``fhash(word, M)``; with ``Opr == 'n'`` the keys are the words themselves;
    any other value yields an empty list.

    Changes: the result is now sorted by key (it used to be in first-seen
    order, unlike the other bag-of-words implementations in this file), and
    counting uses a dict instead of repeated ``remove``/``count`` scans.
    """
    excluded = set(WordStop)
    Data[:] = [token for token in Data if token not in excluded]
    if Opr == 'y':
        print('M = ',end = '')
        M = input()
        keys = [fhash(w,M) for w in Data]
    elif Opr == 'n':
        keys = Data
    else:
        return []
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())
def main(numLine,Word,Opr,nonAlnum,WordStop):
    """Compute the bag of words via countBoW() and print all five result lines.

    *Word* is the cleaned text from OprLine(), which adds one space per line;
    that is why *numLine* is subtracted for the character count.  *nonAlnum*
    is the number of non-alphanumeric characters counted in that text.
    """
    Data0 = Word.split()
    Data = [token.lower() for token in Data0]
    BoW = countBoW(Data,WordStop,Opr)
    print('-------------------')
    print('char count =', len(Word)-numLine)
    print('alphanumeric count =', len(Word)-nonAlnum)
    print('line count =', numLine)
    print('word count =', len(Data0))
    print('BoW =', BoW)
#-------------------------------------------------------------------------------
# Entry point: show the prompt on the same line as the answer, read the file
# name and hand it to initial().
print('File name =',end = ' ')
n = input()
initial(n)
    
# 6330547421 (26.90) 365 (2021-03-22 03:46)

# --- main script: prompts and stop-word loading ---
file_name = input('File name = ')
ufh = input('Use feature hashing ? (y,Y,n,N) ')
while not ( ufh in ['n','N','y','Y'] ) :
    print('Try again.')
    ufh = input('Use feature hashing ? (y,Y,n,N) ')
if ufh in ['y','Y'] :
    M = int(input('M = '))
# Stop words: whitespace-separated tokens of stopwords.txt.  The file is now
# closed when the block ends (it used to stay open).
sw = []
with open('stopwords.txt',"r") as stop_word :
    for line in stop_word :
        sw += line.strip().split()
print('-------------------')

f = open(file_name ,"r")
def char_count(f):
    """Return the total length of the lines of *f* after stripping surrounding whitespace."""
    return sum(len(row.strip()) for row in f)
print('char count =',char_count(f))
f.close()   # release the exhausted handle before reopening (it was leaked)

f = open(file_name ,"r")
def alnum_count(f):
    """Return how many characters in the lines of *f* are ASCII letters or digits."""
    total = 0
    for row in f :
        for ch in row :
            if ('A' <= ch <= 'Z') or ('a' <= ch <= 'z') or ('0' <= ch <= '9') :
                total += 1
    return total
print('alphanumeric count =',alnum_count(f))
f.close()   # release the exhausted handle before reopening (it was leaked)

f = open(file_name ,"r")
def line_count(f):
    """Return the number of lines yielded by iterating *f*."""
    return sum(1 for _ in f)
print('line count =',line_count(f))
f.close()   # release the exhausted handle before reopening (it was leaked)

f = open(file_name ,"r")
def word_count(f):
    """Return the number of words in the lines of *f*.

    A word is a maximal run of ASCII letters/digits; every other character
    separates words, and words never span two lines.
    """
    total = 0
    for row in f :
        cleaned = ''.join(
            ch if ('A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
            for ch in row)
        total += len(cleaned.split())
    return total
print('word count =',word_count(f))
f.close()   # release the exhausted handle (it was leaked)
def fhash(w,M):
    """Return sum(ord(w[i]) * 37**i) % M."""
    G = 37
    total = 0
    for pos, ch in enumerate(w) :
        total += ord(ch) * (G**pos)
    return total % M

def bow(f):
    """Return the sorted bag of words of the lines of *f* as ``[key, count]`` pairs.

    Lines are lower-cased and every character that is not an ASCII letter or
    digit separates words, the same rule word_count() uses.  Punctuation used
    to be deleted here, so "hello-world" became the single word "helloworld".
    Words found in the module-level stop list ``sw`` are skipped.  The keys
    are the words themselves when ``ufh`` is n/N, or ``fhash(word, M)`` when
    it is y/Y.  The result is sorted by key; it used to be in first-seen order.
    """
    words = []
    for line in f :
        cleaned = ''.join(
            ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
            for ch in line.lower())
        words.extend(cleaned.split())
    kept = [token for token in words if not ( token in sw )]
    if ufh == 'n' or ufh == 'N' :
        keys = kept
    elif ufh == 'y' or ufh == 'Y' :
        keys = [fhash(token,M) for token in kept]
    else :
        return []
    counts = {}
    for key in keys :
        counts[key] = counts.get(key, 0) + 1
    return sorted([key, total] for key, total in counts.items())

# The handle is closed once the bag of words has been printed.
with open(file_name ,"r") as f :
    print('BoW =',bow(f))

# 6330548021 (18.00) 366 (2021-03-21 12:59)

# Prompts first, then the accumulators used by the script further down.
f = input('file_name = ')
uf = input('Use feature hashing ? (y,Y,n,N) ')
ch = al = wc = 0      # character, alphanumeric and word counters
ws, bow = [], []      # collected words / bag-of-words placeholder
    
def remove_an(s):
    """Return the ASCII letters and digits of *s*, lower-cased; everything else is dropped."""
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    return ''.join(ch.lower() for ch in s if ch.lower() in allowed)
def remove_an1(a):
    """Return *a* lower-cased with each non-alphanumeric character replaced by a space."""
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    pieces = []
    for ch in a:
        low = ch.lower()
        pieces.append(low if low in allowed else ' ')
    return ''.join(pieces)
def flhash(w,M):
    """Return sum(ord(w[i]) * 37**i) % M."""
    G = 37
    acc = 0
    # Horner's rule, starting from the highest power (last character).
    for ch in reversed(w):
        acc = acc * G + ord(ch)
    return acc % M
def remove_stop_word(h,sw):
    """Return a new list with the items of *h* that are not in *sw*, order preserved."""
    return [item for item in h if item not in sw]
def count_w(G):
    """Return ``[value, count]`` pairs for the items of *G*, sorted by value.

    An empty input now returns ``[]``; the previous version indexed ``G[-1]``
    unconditionally and raised IndexError.
    """
    GG = []
    # After sorting, equal items are adjacent: count each run.
    for item in sorted(G):
        if GG and GG[-1][0] == item:
            GG[-1][1] += 1
        else:
            GG.append([item, 1])
    return GG
            
# --- main script ---
# Both files are read inside `with` blocks.  The old `file1.close` and
# `file.close` lacked parentheses, so neither file was ever closed.
with open('stopwords.txt','r') as file1:
    lines0 = file1.readlines()
lines2 = [line.strip() for line in lines0]
th = []   # stop words
for g in lines2:
    th.extend(remove_an1(g).split())

with open(f,'r') as file:
    s = file.read()
lines = s.splitlines()
lines1 = [line.strip() for line in lines]
lc = len(lines)
for c in lines:
    ch += len(c)                 # characters per line, newline excluded
for e in lines1:
    al += len(remove_an(e))      # alphanumeric characters
for t in lines1:
    w = remove_an1(t).split()
    wc += len(w)
    ws.extend(w)
ws01 = remove_stop_word(ws,th)

# Ask again until the answer is y/Y/n/N.
while uf not in ['y','Y','n','N']:
    print('Try again.')
    uf = input('Use feature hashing ? (y,Y,n,N) ')
if uf in ['y','Y']:
    M = int(input('M = '))
    result = count_w([flhash(g,M) for g in ws01])
else:
    result = count_w(ws01)
print('-------------------')
print('char count =',ch)
print('alphanumeric count =',al)
print('line count =',lc)
print('word count =',wc)
print('BoW =',result)
# 6330549721 (30.00) 367 (2021-03-20 18:08)

def fhash(w, M):
    """Return the feature hash of ``w``: sum of ``ord(c) * 37**i`` modulo ``M``.

    ``i`` is the zero-based position of character ``c`` in ``w``.
    """
    return sum(ord(char) * 37 ** pos for pos, char in enumerate(w)) % M

file_name = input("File name = ")

# Ask until the (lower-cased) answer is "y" or "n".
isHashing = ""
while True:
    isHashing = input("Use feature hashing ? (y,Y,n,N) ").lower()
    if isHashing in ("n", "y"):
        break
    print("Try again.")

# Stop words are the whitespace-separated tokens of stopwords.txt.  A set
# gives the same membership answers as the former duplicate-free list.
stopword = set()
with open("stopwords.txt") as stopfile:
    for i in stopfile:
        stopword.update(i.split())

# Read the document; `with` closes the handle that was previously leaked.
with open(file_name) as infile:
    line = infile.readlines()

ancount=0
charcount=0
wdcount=0
counts = {}  # word -> number of occurrences (stop words excluded)
for i in line:
    l = i.strip().lower()
    charcount += len(l)  # every character of the stripped line counts
    cleaned = []
    for j in l:
        if j.isalpha() or j.isnumeric():
            ancount += 1
            cleaned.append(j)
        else:
            # anything that is not a letter/number separates words
            cleaned.append(" ")
    lm = "".join(cleaned).split()
    wdcount += len(lm)
    for k in lm:
        if k not in stopword:
            counts[k] = counts.get(k, 0) + 1

if isHashing == "y":
    M = int(input("M = "))
    # Merge the counts of all words that share a hash value.
    hashed = {}
    for word, times in counts.items():
        key = fhash(word, M)
        hashed[key] = hashed.get(key, 0) + times
    counts = hashed

# Keys are unique, so sorting the (key, count) pairs orders by key; this
# replaces the hand-written selection sort with the same result.
BoW = [[key, times] for key, times in sorted(counts.items())]

print("-------------------")
print("char count =",charcount)
print("alphanumeric count =",ancount)
print("line count =",len(line))
print("word count =",wdcount)
print("BoW =",BoW)
    


# 6330550221 (30.00) 368 (2021-03-22 00:51)
a = input('File name = ')
b = input('Use feature hashing ? (y,Y,n,N) ')
while b not in ['y','Y','n','N']:
    print('Try again.')
    b = input('Use feature hashing ? (y,Y,n,N) ')
if b not in ['n','N']:
    # Kept as the raw string; it is converted with int() where it is used.
    M = input('M = ')
print('-------------------')

# Stop words: all whitespace-separated tokens of stopwords.txt.  The file
# is now closed by the `with` block (it used to stay open).
stop_w = []
with open('stopwords.txt','r') as stop_words:
    for line in stop_words:
        stop_w += line.split()

# Read the input once instead of opening it three separate times.
with open(a,'r') as file_name:
    f_lines = file_name.readlines()
f_n = [c for text_line in f_lines for c in text_line]  # every character

# char count: every character except the newline characters
ch_test = sum(1 for c in f_n if c != '\n')
# line count
ln = len(f_lines)
# alphanumeric count: characters whose lower-case form is an ASCII digit or letter
ascii_alnum = set('0123456789abcdefghijklmnopqrstuvwxyz')
al = sum(1 for c in f_n if c.lower() in ascii_alnum)
# words: lower-case the text, turn every non-alphanumeric character
# (newlines included) into a space, then split on whitespace
ans = ''.join(e.lower() if e.isalnum() else ' ' for e in f_n)
f_NN = ans.split()
wc = len(f_NN)
# words that remain after removing the stop words
f_NN_not = [word for word in f_NN if word not in stop_w]
def fhash(words,M):
    """Return the feature hash of ``words`` modulo ``M``.

    ``words`` is flattened into its individual characters first; the
    character at flattened position ``i`` contributes ``ord(c) * 37**i``.
    """
    chars = [c for part in words for c in part]
    return sum(ord(c) * 37 ** i for i, c in enumerate(chars)) % M
# Choose what gets counted: the kept words themselves, or their hashes.
if b in ['n','N']:
    tokens = f_NN_not
else:
    tokens = [fhash(word,int(M)) for word in f_NN_not]

# Tally each distinct token, then order the [token, count] pairs.
tally = {}
for token in tokens:
    tally[token] = tally.get(token, 0) + 1
bag = sorted([token, count] for token, count in tally.items())

print('char count = '+str(ch_test))
print('alphanumeric count = '+str(al))
print('line count = '+str(ln))
print('word count = '+str(wc))
print('BoW =', bag)
# 6330551921 (19.60) 369 (2021-03-22 23:02)
def char_count(doc):
    """Return the total length of all entries (lines) in ``doc``."""
    return sum(len(entry) for entry in doc)
def alphanumeric_count(doc):
    """Return how many ASCII letters and digits the lines in ``doc`` contain.

    Parameters:
        doc: list of strings (the lines of the document).

    The digit range is ``'0'..'9'``; the previous range ``'1'..'9'``
    silently left every ``'0'`` uncounted.
    """
    text = ''.join(doc).lower()
    return sum(1 for ch in text if 'a' <= ch <= 'z' or '0' <= ch <= '9')
def line_count(doc) :
    """Return the number of entries (lines) held in ``doc``."""
    total = len(doc)
    return total
def word_find(doc):
    """Split the lines in ``doc`` into lower-case alphanumeric words.

    Parameters:
        doc: list of strings (the lines of the document).

    Returns:
        The list of words, in document order.  Lines are joined with a
        space, lower-cased, and every character that is not an ASCII letter
        or digit acts as a word separator.

    The digit range is ``'0'..'9'``; the previous range ``'1'..'9'`` treated
    ``'0'`` as a separator and split words such as ``"a0b"`` or ``"100"``.
    """
    text = ' '.join(doc).lower()
    spaced = ''.join(
        ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
        for ch in text
    )
    return spaced.split()
def flash(word,m):
    """Return the feature hash of ``word`` modulo ``m``.

    The hash is the sum of ``ord(char) * 37**position`` over the characters
    of ``word`` (positions start at 0).
    """
    total = 0
    for position, char in enumerate(word):
        total += ord(char) * 37 ** position
    return total % m
def bow (doc,m):
    """Build the bag of words for ``doc`` with the stop words removed.

    Parameters:
        doc: list of strings (the lines of the document).
        m:   the string ``' '`` to count the words themselves, otherwise
             the modulus passed to ``flash`` to count hashed words.

    Returns:
        A list of ``[key, count]`` pairs.  Without hashing the keys are the
        words in order of first appearance; with hashing the keys are the
        hash values in ascending order.  If no word survives the stop-word
        filter the result is ``[]`` (the hashing branch used to raise
        ``ValueError`` from ``min()`` on an empty list).

    Reads the module-level ``stop_doc`` (lines of the stop-word file).
    """
    # Tokenise the stop words once instead of once per document word.
    stop_words = set(word_find(stop_doc))
    kept = [w for w in word_find(doc) if w not in stop_words]

    counts = {}
    if m == ' ':
        # dict preserves insertion order, i.e. order of first appearance
        for w in kept:
            counts[w] = counts.get(w, 0) + 1
        return [[w, counts[w]] for w in counts]

    for w in kept:
        key = flash(w, m)
        counts[key] = counts.get(key, 0) + 1
    # Only hash values that actually occur are reported, smallest first.
    return [[key, counts[key]] for key in sorted(counts)]
file_name = input('File name = ')
# Document lines with their newline characters removed.
with open(file_name,'r') as file:
    doc = [text_line.replace('\n','') for text_line in file]
# Stop-word lines, cleaned the same way.  The old loop iterated over
# range(len(doc)), so it either raised IndexError (document longer than the
# stop-word file) or left the remaining stop-word lines untouched.
with open('stopwords.txt','r') as stop:
    stop_doc = [text_line.replace('\n','') for text_line in stop]

while True:
    hashing = input('Use feature hashing ? (y,Y,n,N)  ').lower()
    if hashing == 'y' :
        m = int(input('M = '))
    elif hashing == 'n':
        m=' '  # sentinel understood by bow(): count plain words
    else:
        print('Try again.')
        continue
    print('-------------------')
    print('char count =',char_count(doc))
    print('alphanumeric count =',alphanumeric_count(doc))
    print('line count =',line_count(doc))
    print('word count =',len(word_find(doc)))
    print('BoW =',bow(doc,m))
    break
# 6330552521 (0.00) 370 (2021-03-22 14:51)

def fhash( w , M ):
    """Return ``sum(ord(w[i]) * 37**i) % M`` over all positions ``i`` of ``w``."""
    total = 0
    # Horner's scheme from the last character backwards evaluates the same
    # polynomial in 37 without computing explicit powers.
    for char in reversed(w):
        total = total * 37 + ord(char)
    return total % M
def alnum_count(lines):
    """Return the number of alphanumeric characters in the strings of ``lines``.

    A character counts when ``str.isalnum()`` is true for it.
    """
    return sum(1 for char in ''.join(lines) if char.isalnum())
def bow(lss):
    """Return ``[item, count]`` pairs for the distinct items of ``lss``.

    Pairs appear in the order in which each item first occurs in ``lss``;
    ``count`` is the number of occurrences of the item in ``lss``.
    """
    pairs = []
    for item in lss:
        entry = [item, lss.count(item)]
        if entry in pairs:
            continue  # this item has already been recorded
        pairs.append(entry)
    return pairs
def main():
    """Prompt for a file and a hashing choice, then print text statistics.

    Prints, in order: a separator line, the character count, the
    alphanumeric count, the line count, the word count and the sorted bag
    of words (hashed with ``fhash`` when hashing was requested).

    Fixes over the previous version:
      * the hashing question is asked again after an invalid answer (it was
        read once, so an invalid answer looped forever);
      * blank lines no longer break the run (``x = x.remove(...)`` rebound
        the list to ``None``);
      * the file named by the user is opened instead of a hard-coded
        ``'sample.txt'``;
      * "word count" prints the number of words, not the word list;
      * splitting uses ``split()`` so runs of spaces do not create empty
        words, and words left empty after punctuation removal are dropped.
    """
    file_name = input( 'File name = ' )

    # Punctuation characters deleted from each word before it is counted.
    scw = '!\',."@#$%^&*()_-+=][{}/?|:;><'

    while True:
        f_h = input( 'Use feature hashing ? (y,Y,n,N)' ).lower()
        if f_h in ('y', 'n'):
            break
        print('Try again')

    # The modulus is only needed (and only asked for) when hashing.
    M = int(input('M = ')) if f_h == 'y' else None

    print( '-' * 19 )

    # Stop words: every whitespace-separated token of stopwords.txt.
    with open( 'stopwords.txt' , 'r' ) as stopwords:
        sw_list = stopwords.read().split()

    # Document lines, lower-cased and without their trailing newline.
    with open( file_name , 'r' ) as text_file:
        w_list1 = [line.rstrip('\n').lower() for line in text_file]

    print('char count =', len( ''.join(w_list1) ))
    print('alphanumeric count =', alnum_count(w_list1))
    print('line count =', len(w_list1))

    w_list = ' '.join(w_list1).split()
    print('word count =', len(w_list))

    # Drop stop words, strip punctuation from what remains, then drop stop
    # words again because stripping can turn a token into a stop word.
    kept = [x for x in w_list if x not in sw_list]
    stripped = [''.join(c for c in word if c not in scw) for word in kept]
    w_l = [x for x in stripped if x and x not in sw_list]

    tokens = [fhash(x, M) for x in w_l] if f_h == 'y' else w_l
    print('BoW =', sorted(bow(tokens)))

# Script entry point: run the interactive program defined by main().
main()
# 6330553121 (18.00) 371 (2021-03-22 22:55)

def cut(li) :
    """Split a list of strings into one flat list of lower-case words.

    Each string is lower-cased, every character for which ``isalnum()`` is
    false is replaced by a space, and the result is split on whitespace.
    """
    list_of_words = []
    for text in li:
        spaced = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower())
        list_of_words.extend(spaced.split())
    return list_of_words
def f_count(nt):
    """Count the occurrences of each distinct item in the list ``nt``.

    Parameters:
        nt: list of mutually comparable items.  It is sorted in place, as
            before, so the caller's list is ordered afterwards.

    Returns:
        A list of ``[item, count]`` pairs in ascending item order.  An empty
        list yields ``[]`` (the previous version raised ``IndexError`` on
        ``nt[0]``).
    """
    nt.sort()
    data = []
    for item in nt:
        # After sorting, equal items are adjacent: extend the current run
        # or start a new pair.
        if data and data[-1][0] == item:
            data[-1][1] += 1
        else:
            data.append([item, 1])
    return data
def fhash(w,M) :
    """Return the feature hash of ``w`` modulo ``M``.

    The hash is the sum over all characters of ``ord(char) * 37**index``.
    """
    terms = [ord(char) * 37 ** index for index, char in enumerate(w)]
    return sum(terms) % M
          
#----------------------------------------            
            
file_name = input('File name = ')

# Ask for the hashing choice until one of the accepted answers is given.
com = input('Use feature hashing ? (y,Y,n,N) ')
while com not in ['y','Y','n','N'] :
    print('Try again.')
    com = input('Use feature hashing ? (y,Y,n,N) ')

hashing = 1 if com in ['y','Y'] else 0
if hashing == 1 :
    M = int(input('M = '))

# Read the document: count lines and characters (without the trailing
# newline) and keep the stripped lines for word splitting.
sens = []
chc = 0
alpc = 0
linec = 0
with open(file_name, 'r') as lines :
    for line in lines :
        body = line[:-1] if line[-1] == '\n' else line
        chc += len(body)
        sens.append(body.strip())
        linec += 1

with open('stopwords.txt', 'r') as stopwords_file :
    stopwords_li = [e.strip() for e in stopwords_file]

textwords = cut(sens)
stopwords = cut(stopwords_li)

# Words consist only of alphanumeric characters, so their total length is
# the alphanumeric count.
alpc = sum(len(word) for word in textwords)
wordc = len(textwords)

norm_textwords = [words for words in textwords if words not in stopwords]

BoW = f_count(norm_textwords)
if hashing == 1 :
    z = [fhash(st,M) for st in norm_textwords]
    BoW_hash = f_count(z)

print('-------------------')
print('char count =',chc)
print('alphanumeric count =',alpc)
print('line count =',linec)
print('word count =',wordc)
print('BoW =',BoW_hash if hashing == 1 else BoW)
# 6330554821 (17.05) 372 (2021-03-21 20:09)
file_name = input('File name = ')
use = input('Use feature hashing ? (y,Y,n,N) ')
while use not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    use = input('Use feature hashing ? (y,Y,n,N) ')

hashing = use in ('y', 'Y')
if hashing:
    # Kept as text; converted with int() when a word is hashed.
    M = input('M = ')

# Punctuation characters that act as word separators.
ch = ['"',"'",',',';',':',')','(','-','_','.']

line_count = 0
char_count = 0
pieces = []
with open(file_name,'r') as file:
    for e in file:
        cleaned = e.strip().lower()
        char_count += len(cleaned)  # characters of the stripped line
        pieces.append(cleaned)
        line_count +=1

# Join the lines with a space for word splitting.  They used to be
# concatenated directly, which fused the last word of one line with the
# first word of the next.
s = ' '.join(pieces)
sentence = ''.join(' ' if c in ch else c for c in s)

print('-'*len('Use feature hashing'))
print('char count = '+str(char_count))
word = sentence.split()
alpha_count = len(''.join(word))
print('alphanumeric count = '+str(alpha_count))
word_count = len(word)
print('line count = '+str(line_count))
print('word count = '+str(word_count))

stop_word = []
with open('stopwords.txt','r') as stop:
    for e in stop:
        stop_word += e.split()
word1 = [w for w in word if w not in stop_word]

# Count each kept word, or its feature hash when hashing was requested.
counts = {}
for w in word1:
    if hashing:
        key = sum(ord(c) * 37 ** t for t, c in enumerate(w)) % int(M)
    else:
        key = w
    counts[key] = counts.get(key, 0) + 1

# One [key, count] pair per distinct key, in ascending key order.
BOW = [[key, times] for key, times in sorted(counts.items())]
print('BoW = '+str(BOW))
# 6330555421 (16.47) 373 (2021-03-22 16:35)
# Read all lines of the user-named document; the handle is now closed by
# the `with` block (it was previously left open).
with open(input('File name = ').strip(), 'r') as file_name:
    x = file_name.readlines()

# Stop words: every whitespace-separated token of the stop-word file.
# The file is named 'stopwords.txt', as in the other programs in this file;
# the former name 'stopword.txt' was missing the final "s".
stopword = []
with open('stopwords.txt', 'r') as stopword_file:
    for line in stopword_file :
        stopword.extend(line.split())
def fhash(w,M) :
    """Return the feature hash of ``w``: ``sum(ord(w[i]) * 37**i) % M``."""
    total = 0
    power = 1  # equals 37**i for the character being processed
    for char in w :
        total += ord(char) * power
        power *= 37
    return total % M

# Characters that belong to a word.  The previous constant contained
# "S"/"s" twice and no "X"/"x", so every x was treated as a separator and
# left out of the alphanumeric count.
alpha_num = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'

# Split each line into words: anything outside alpha_num is a separator.
# One pass replaces the two separate tokenisers, which lost the final word
# of a file without a trailing newline and appended partial words whenever
# a character equalled the last character of its line.
lst_words = []
for raw_line in x :
    spaced = ''.join(ch if ch in alpha_num else ' ' for ch in raw_line)
    lst_words.extend(spaced.split())

# Words kept for the bag of words: those whose lower-case form is not a
# stop word (the words themselves keep their original case).
txt = [w for w in lst_words if w.lower() not in stopword]
#----------------------------------------------------------------
ans = input('Use feature hashing ? (y,Y,n,N) ')
while ans not in ['y','Y','n','N']:
    print('Try again.')
    ans = input('Use feature hashing ? (y,Y,n,N) ')
hashing = ans in ['y','Y']
if hashing :
    M = int(input('M = '))

print('-------------------')
# Lines without their trailing newline.
t = [row[:-1] if row.endswith('\n') else row for row in x]
c = sum(len(row) for row in t)
print('char count =', c)

result = sum(1 for row in t for ch in row if ch in alpha_num)
print('alphanumeric count =', result)

print('line count =', len(x))
print('word count =', len(lst_words))

# Count the kept words, or their feature hashes when hashing is enabled.
keys = [fhash(e,M) for e in txt] if hashing else txt
counts = {}
for key in keys :
    counts[key] = counts.get(key, 0) + 1
BoW = [[key, counts[key]] for key in sorted(counts)]
print('BoW =', BoW)



# 6330556021 (24.90) 374 (2021-03-20 16:26)


#--------------function---------------------
def fhash(w, M):
    """Return the feature hash of ``w`` modulo ``M``.

    Character ``i`` of ``w`` contributes ``ord(w[i]) * 37**i`` to the sum
    that is reduced modulo ``M``.
    """
    G = 37
    return sum(ord(char) * G ** index for index, char in enumerate(w)) % M

file_name = input("File name = ")
while True:
    choice = input("Use feature hashing ? (y,Y,n,N) ")
    if choice in ["y", "Y", "n", "N"]:
        break
    print("Try again.")

# Both files are opened in the original order and closed as soon as they
# have been read, even if a later step fails.
with open("stopwords.txt", "r") as file2, open(file_name, "r") as file1:
    #---------list of stopwords-----------------
    stopwords = set()
    for i in file2:
        stopwords.update(i.lower().split())
    #-----------lines of the document-----------
    y = [i.lower().strip() for i in file1]

#-----------list of words-------------------
# Each whitespace-separated token is reduced to its alphanumeric
# characters; a token made only of punctuation becomes an empty word,
# exactly as before.
words = []
for i in y:
    for j in i.split():
        words.append("".join(k for k in j if k.isalnum()))
#-------------------------------------------
char_count = len("".join(y))
#-------------------------------------------
alphanumeric_count = len("".join(words))
#-------------------------------------------
line_count = len(y)
#-------------------------------------------
word_count = len(words)
#-------------Bag of words------------------
kept = [i for i in words if i not in stopwords]
if choice in ["N", "n"]:
    keys = kept
else:
    M = int(input("M = "))
    keys = [fhash(i, M) for i in kept]

# Tally with a dict instead of repeated list.index() lookups; the result
# is the same ascending list of [key, count] pairs.
counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
show = sorted([key, times] for key, times in counts.items())

print("-------------------")
print("char count =", char_count)
print("alphanumeric count =", alphanumeric_count)
print("line count =", line_count)
print("word count =", word_count)
print("BoW =", show)

# 6330557721 (23.55) 375 (2021-03-22 13:42)
file_name = input('File name = ')
findB = input('Use feature hashing ? (y,Y,n,N) ')
while findB not in ['Y','y','N','n']:
    print('Try again.')
    findB = input('Use feature hashing ? (y,Y,n,N) ')

# Stop words: every whitespace-separated token of stopwords.txt.
stopwod = []
with open('stopwords.txt', 'r') as stw:
    for line in stw:
        stopwod.extend(line.split())

# Read the document once (it used to be opened twice).
with open(file_name,'r') as rfile:
    raw_lines = rfile.readlines()

li = len(raw_lines)
# Character count excludes newline characters.  The old formula
# `(aka - li) + 1` assumed the last line never ends with a newline, so it
# was off by one for files that do and reported 1 for an empty file.
aka = sum(1 for line in raw_lines for c in line if c != '\n')
aln = sum(1 for line in raw_lines for c in line if c.isalnum())

# Words: lower-case whitespace-separated tokens with every
# non-alphanumeric character removed.
words1 = []
for line in raw_lines:
    for k in line.lower().strip().split():
        words1.append(''.join(u for u in k if u.isalnum()))
wo = len(words1)

kept = [w for w in words1 if w not in stopwod]
if findB in ['y','Y']:
    M = int(input('M = '))
    # Hash values are stored as strings, as in the previous version.
    keys = [str(sum(ord(c) * 37 ** k for k, c in enumerate(w)) % M) for w in kept]
else:
    keys = kept

# A dict keeps first-appearance order, matching the previous flat
# word/count list without rescanning it for every word.
counts = {}
for key in keys:
    counts[key] = counts.get(key, 0) + 1
BoW = [[key, times] for key, times in counts.items()]

print('char count =',aka)
print('alphanumeric count =',aln)
print('line count =',li)
print('word count =',wo)
print('BoW = '+str(BoW))
        
        
           
                
            
        




# 6330558321 (30.00) 376 (2021-03-22 00:43)
 
def read_stopwords(fd):
    """Return every whitespace-separated token of the text file at path ``fd``.

    Tokens are returned in file order; blank lines contribute nothing.
    """
    with open(fd, 'r') as f:
        return [token for row in f for token in row.split()]
 
def ask_hash():
    """Ask whether feature hashing should be used.

    Returns ``True`` for ``y``/``Y`` and ``False`` for ``n``/``N``; any
    other answer prints "Try again." and repeats the question.
    """
    while True:
        hs = input("Use feature hashing ? (y,Y,n,N) ")
        if hs in ['y', 'Y']:
            return True
        if hs in ['n', 'N']:
            return False
        print("Try again.")
 
def hash_word(word, m):
    """Return the feature hash of ``word``: ``sum(ord(c) * 37**i) % m``."""
    total = 0
    for position, char in enumerate(word):
        total += ord(char) * 37 ** position
    return total % m
 
def count_bow(bow):
    """Return ``[item, count]`` pairs for the items produced by ``bow``.

    ``bow`` may be any iterable.  Pairs are listed in the order in which
    each distinct item is first seen.
    """
    new_bow = []
    seen = []  # seen[i] is the item counted by new_bow[i]
    for word in bow:
        try:
            slot = seen.index(word)
        except ValueError:
            # first occurrence of this item
            seen.append(word)
            new_bow.append([word, 1])
        else:
            new_bow[slot][1] += 1
    return new_bow
 
stop = read_stopwords("stopwords.txt")

fs = input("File name = ")
nlines = nwords = nchars = nalphanum = 0
all_words = []

# One pass over the file gathers all counters and the list of words.
with open(fs, 'r') as f:
    for raw in f:
        line = raw.strip('\n')
        nlines += 1
        nchars += len(line)
        spaced = []
        for c in line:
            if c.isalnum():
                nalphanum += 1
                spaced.append(c.lower())
            else:
                # non-alphanumeric characters separate words
                spaced.append(" ")
        words = ''.join(spaced).split()
        all_words.extend(words)
        nwords += len(words)

# Words that are not stop words; hashed below when hashing is requested.
bow = [word for word in all_words if word not in stop]

hash_flag = ask_hash()
if hash_flag:
    m = int(input("M = "))
    bow = [hash_word(word, m) for word in bow]

ordered = sorted(count_bow(bow), key=lambda t: t[0])

# print results
print(f"char count = {nchars}")
print(f"alphanumeric count = {nalphanum}")
print(f"line count = {nlines}")
print(f"word count = {nwords}")
print(f"BoW = {ordered}")
# 6330559021 (27.00) 377 (2021-03-21 23:52)
def stopword(s):
    """Return all whitespace-separated tokens of the text file at path ``s``."""
    with open(s) as stop:
        return [token for row in stop for token in row.split()]
#----------------------------------------------------------
def charcount(s) :
    """Return the number of characters in file ``s``, not counting newlines."""
    with open(s) as x :
        return sum(len(row) - row.count("\n") for row in x)
#----------------------------------------------------------
def alphanumeric(s) :
    """Return how many ASCII letters and digits the file ``s`` contains."""
    allowed = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    with open(s) as x :
        return sum(1 for row in x for e in row if e in allowed)
#----------------------------------------------------------
def words(s):
    """Return the lower-case words of the text file at path ``s``.

    Every character that is not an ASCII letter or digit (newlines
    included) is treated as a separator.
    """
    allowed = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    with open(s) as x :
        spaced = "".join(
            e.lower() if e in allowed else " "
            for row in x
            for e in row
        )
    return spaced.split()
#----------------------------------------------------------
def line(s) :
    """Return the number of lines in the text file at path ``s``."""
    with open(s) as x :
        return sum(1 for _ in x)
#----------------------------------------------------------
file_name = input("File name = ")
feature = input("Use feature hashing ? (y,Y,n,N) ").lower()
# Compare against the two valid answers.  The former `not in "yn"` was a
# substring test, so an empty answer or "yn" passed validation and the
# program then printed nothing.
while feature not in ("y", "n") :
    print("Try again.")
    feature = input("Use feature hashing ? (y,Y,n,N) ").lower()
if feature == "y" :
    # Kept as text; converted with int() when a word is hashed.
    p = input("M = ")

print("-------------------")
print("char count =",charcount(file_name))
print("alphanumeric count =",alphanumeric(file_name))
print("line count =",line(file_name))
print("word count =",len(words(file_name)))

# Read the stop words once; they used to be re-read from disk for every
# single word of the document.
stop_words = words("stopwords.txt")
k = [e for e in words(file_name) if e not in stop_words]

if feature == "y" :
    keys = [sum(ord(e[i])*(37**i) for i in range(len(e))) % int(p) for e in k]
else :
    keys = k

# Count each distinct key and list the pairs in ascending key order.
counts = {}
for key in keys :
    counts[key] = counts.get(key, 0) + 1
m = [[key, counts[key]] for key in sorted(counts)]
print("BoW =",m)
    
    
# 6330560521 (30.00) 378 (2021-03-21 22:19)
# Lower-case ASCII letters followed by the digits 1-9 and 0, one
# single-character string per list entry.
alnum_list = list('abcdefghijklmnopqrstuvwxyz1234567890')
#--------------------------------------------------------------------
def run():
    """Interactive driver: ask for a file and a mode, then print the report.

    Keeps asking the hashing question until the answer is one of y/Y/n/N.
    With hashing, every kept word is replaced by fhash(word, M) before the
    occurrences are counted.
    """
    file_name = input('File name = ')
    while True:
        use_fh = input('Use feature hashing ? (y,Y,n,N) ')
        if use_fh not in ['y', 'Y', 'n', 'N']:
            print('Try again.')
            continue
        hashing = use_fh in ['y', 'Y']
        if hashing:
            M = int(input('M = '))
        print('-------------------')
        print('char count = ' + str(char_count(file_name)))
        print('alphanumeric count = ' + str(alnum_count(file_name)))
        print('line count = ' + str(line_count(file_name)))
        print('word count = ' + str(word_count(file_name)))
        items = real_txt(file_name)
        if hashing:
            items = [fhash(word, M) for word in items]
            items.sort()
        # get_unique sorts its argument in place and returns the distinct keys.
        Bow = [[key, items.count(key)] for key in get_unique(items)]
        print('BoW = ' + str(Bow))
        break
#------------------------------------------------------------
# Load every whitespace-separated token of stopwords.txt into list_stopwords.
# The context manager closes the file even if reading fails part-way.
list_stopwords = []
with open('stopwords.txt', 'r') as stopwords:
    for line in stopwords:
        list_stopwords.extend(line.split())
#------------------------------------------------------------
def char_count(file_name):
    """Return the number of characters in the file, newlines excluded.

    The file is opened with a context manager so the handle is released
    even when reading raises.
    """
    with open(file_name, 'r') as txt:
        return sum(len(line) - line.count('\n') for line in txt)
#-------------------------------------------------------------
def alnum_count(file_name):
    """Return how many characters of the file appear in alnum_list.

    Each line is lower-cased first, so upper-case letters are counted too.
    The original also computed line.strip() and immediately overwrote it;
    that dead assignment is dropped, and the file is now closed by `with`.
    """
    with open(file_name, 'r') as txt:
        return sum(1 for line in txt for ch in line.lower() if ch in alnum_list)
#--------------------------------------------------------------
def line_count(file_name):
    """Return the number of lines in the file.

    Uses a context manager so the handle is closed even on a read error.
    """
    with open(file_name, 'r') as txt:
        return sum(1 for _ in txt)
#--------------------------------------------------------------
def word_count(file_name):
    """Return the number of words in the file.

    A word is a maximal run of characters from alnum_list after the line
    has been lower-cased and stripped; every other character separates words.
    The per-character string rebuild (and its no-op else branch) is replaced
    by a single join, and the file is closed by the context manager.
    """
    word_c = 0
    with open(file_name, 'r') as txt:
        for line in txt:
            spaced = ''.join(ch if ch in alnum_list else ' '
                             for ch in line.lower().strip())
            word_c += len(spaced.split())
    return word_c
#--------------------------------------------------------------
def real_txt(file_name):  # all words of the text, with stop words removed
    """Return the sorted list of words in the file that are not stop words.

    Lines are lower-cased and stripped, every character outside alnum_list
    becomes a separator, and tokens found in list_stopwords are dropped.
    Duplicates are kept so callers can count occurrences.
    """
    list_words = []
    with open(file_name, 'r') as txt:
        for line in txt:
            spaced = ''.join(ch if ch in alnum_list else ' '
                             for ch in line.lower().strip())
            list_words.extend(word for word in spaced.split()
                              if word not in list_stopwords)
    list_words.sort()
    return list_words
#-----------------------------------------------------
def get_unique(words):
    """Sort *words* in place and return a new list of its distinct values."""
    words.sort()
    unique_words = words[:1]
    # Equal values are adjacent after sorting: keep each value that differs
    # from its predecessor.
    for previous, current in zip(words, words[1:]):
        if previous != current:
            unique_words.append(current)
    return unique_words
#-------------------------------------------------------------
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M; an empty word hashes to 0."""
    G = 37
    if not w:
        # No modulo is applied for an empty word.
        return 0
    return sum(ord(ch) * G ** pos for pos, ch in enumerate(w)) % M
#-----------------------------------------------------
# Start the interactive program.
run()

# 6330561121 (16.70) 379 (2021-03-18 14:40)

# Loop flag for the hashing prompt: stays 0 until a valid answer is given.
tieagain = 0
# Stop words read from stopwords.txt.
stpw = []
# Running totals printed at the end of the script.
charcount = 0
alphancount = 0
lincount = 0
wordc = 0
# Words kept for the bag of words, and their hashed values when hashing is on.
tempword = []
temphash = []
# Resulting [key, count] pairs.
BoW = []
def fhash(w, M):
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo M."""
    total = 0
    for position, char in enumerate(w):
        total += ord(char) * 37 ** position
    return total % M

# Driver: read the file name and hashing choice, gather the statistics and
# print the report.
file_name = input("File name = ")
while tieagain == 0:
    choice = input("Use feature hashing ? (y,Y,n,N) ")
    if choice == 'y' or choice == 'Y':
        M = int(input("M = "))
        FH = 1
        tieagain = 1
    elif choice == 'n' or choice == 'N':
        FH = 0
        tieagain = 1
    else:
        print("Try again.")
print("-------------------")

with open("stopwords.txt", "r") as stop:
    for line in stop:
        stpw.extend(line.split())

with open(file_name, "r") as file:
    for line in file:
        text = line.strip("\n")
        charcount += len(text)
        alphancount += sum(1 for ch in text if ch.isalnum())
        lincount += 1
        # Every non-alphanumeric character separates words. Tokenising this
        # way also copes with blank lines, which used to raise IndexError on
        # line.strip("\n")[-1].
        tokens = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()
        wordc += len(tokens)
        # Stop words are filtered after punctuation is removed, so "the,"
        # is recognised as the stop word "the".
        tempword.extend(token for token in tokens if token not in stpw)

if FH == 1:
    temphash.extend(fhash(word, M) for word in tempword)
    tempword = sorted(temphash)

for key in sorted(set(tempword)):
    BoW.append([key, tempword.count(key)])

print("char count =", charcount)
print("alphanumeric count =", alphancount)
print("line count =", lincount)
print("word count =", wordc)
print("BoW =", sorted(BoW))

# 6330562821 (30.00) 380 (2021-03-21 15:59)
def line_to_words(s):
    """Split s into alphanumeric words; non-alphanumerics act as separators.

    Returns the list of words, or the empty string '' when there are none.
    """
    spaced = ''.join(ch if ch.isalnum() else ' ' for ch in s)
    tokens = spaced.split()
    return tokens if tokens else ''
    
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(w)) % M
def Bag_of_words(w, M):
    """Merge [word, count] pairs into [hash, total] pairs ordered by hash.

    w -- iterable of (word, count) pairs
    M -- modulus passed to fhash

    Totals are kept in a dict keyed by hash value instead of a list with one
    slot per possible bucket, so memory no longer grows with M (a large M
    used to allocate M two-element lists up front).
    """
    totals = {}
    for word, freq in w:
        bucket = fhash(word, M)
        totals[bucket] = totals.get(bucket, 0) + freq
    # Buckets whose total is zero are left out, as before.
    return [[bucket, totals[bucket]] for bucket in sorted(totals)
            if totals[bucket] != 0]

#-------------------------
# read stopwords file => turn to [stopw]
# Distinct lower-cased stop words from stopwords.txt; the context manager
# closes the file even if reading fails.
stopw = []
with open('stopwords.txt', 'r') as fn:
    for line in fn:
        for e in line_to_words(line.strip()):
            if e.lower() not in stopw:
                stopw.append(e.lower())

#-------------------------
# Driver: prompt, scan the file once, then print the statistics and the BoW.
file = input('File name = ').strip()
key = input('Use feature hashing ? (y,Y,n,N) ')
while key.upper() not in ('Y', 'N'):
    print('Try again.')
    key = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = key.upper() == 'Y'
if use_hash:
    M = int(input('M = '))

counts_ch = counts_alnum = counts_line = counts_word = 0
words = []         # [word, count] pairs, parallel to check_words
check_words = []   # words already present in `words`
fn = open(file, 'r')
for line in fn:
    counts_line += 1
    counts_ch += len(line) - line.count('\n')
    counts_alnum += sum(1 for ch in line if ch.isalnum())
    tokens = line_to_words(line)
    counts_word += len(tokens)
    for token in tokens:
        low = token.lower()
        if low in stopw:
            continue
        if low not in check_words:
            check_words.append(low)
            words.append([low, 0])
        words[check_words.index(low)][1] += 1
fn.close()
words.sort()

print('-' * 19)
print('char count =', counts_ch)
print('alphanumeric count =', counts_alnum)
print('line count =', counts_line)
print('word count =', counts_word)
if use_hash:
    print('BoW =', Bag_of_words(words, M))
else:
    print('BoW =', words)
# 6330563421 (30.00) 381 (2021-03-22 20:25)

def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    weighted = [ord(ch) * 37 ** i for i, ch in enumerate(w)]
    return sum(weighted) % M
def alphanume_count(x):
    """Return how many characters of x are ASCII letters or digits."""
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return sum(1 for k in x.lower() if k in allowed)
def word_count(x):
    """Return the number of maximal runs of ASCII letters/digits in x."""
    allowed = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    spaced = ''.join(k if k in allowed else ' ' for k in x)
    return len(spaced.split())
  
# Driver: prompt, read both files, print the statistics and the sorted BoW.
file_name = input('File name = ')
ask = input('Use feature hashing ? (y,Y,n,N) ')
while ask not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    ask = input('Use feature hashing ? (y,Y,n,N) ')
use_hashing = ask in ('y', 'Y')
if use_hashing:
    M = int(input('M = '))
print('-------------------')

# Lower-cased stop words.
stop_word = []
sw = open('stopwords.txt', 'r')
for line in sw:
    stop_word.extend(line.lower().split())
sw.close()

# Each line is stripped; the stripped lines are joined with a trailing space.
stripped_lines = []
fn = open(file_name, 'r')
for line in fn:
    stripped_lines.append(line.strip())
fn.close()
line_count = len(stripped_lines)
char_count = sum(len(text) for text in stripped_lines)
sample = ''.join(text + ' ' for text in stripped_lines)

print('char count = ' + str(char_count))
print('alphanumeric count = ' + str(alphanume_count(sample)))
print('line count = ' + str(line_count))
print('word count = ' + str(word_count(sample)))

# Tokenise the lower-cased text and drop the stop words.
bowsample = ''.join(
    k if k in 'abcdefghijklmnopqrstuvwxyz0123456789' else ' '
    for k in sample.lower()
).split()
bowedit = [k for k in bowsample if k not in stop_word]

# Keys are either the words themselves or their hash values.
if use_hashing:
    keys = [fhash(k, M) for k in bowedit]
else:
    keys = bowedit
bow = sorted([key, keys.count(key)] for key in dict.fromkeys(keys))
print('BoW = ' + str(bow))
    
# 6330564021 (30.00) 382 (2021-03-22 01:34)

#----------------------------------------------------------
def char_count(file_name):
    """Return the number of characters in the file, line terminators excluded.

    The file is opened with a context manager so it is closed on error too.
    """
    with open(file_name, 'r') as fn:
        return sum(len(line[:-1] if line.endswith('\n') else line)
                   for line in fn)
def alp_count(file_name):
    """Return how many characters of the file are ASCII letters or digits.

    Lines are lower-cased before the range test, so upper-case letters
    count as well. The file is closed by the context manager.
    """
    with open(file_name, 'r') as fn:
        return sum(1 for line in fn for c in line.lower()
                   if 'a' <= c <= 'z' or '0' <= c <= '9')
def line_count(file_name):
    """Return the number of lines in the file (closed via context manager)."""
    with open(file_name, 'r') as fn:
        return sum(1 for _ in fn)
def stop_words(stop_name):
    """Return the lower-cased whitespace-separated tokens of the file.

    The file is opened with a context manager so it is closed on error too.
    """
    k = []
    with open(stop_name, 'r') as fn:
        for line in fn:
            k.extend(line.lower().strip().split())
    return k
def words(file_name):
    """Return the lower-cased words of the file in reading order.

    A word is a maximal run of ASCII letters/digits; every other character
    is a separator. The file is closed by the context manager.
    """
    k = []
    with open(file_name, 'r') as fn:
        for line in fn:
            spaced = ''.join(
                c if 'a' <= c <= 'z' or '0' <= c <= '9' else ' '
                for c in line.lower()
            )
            k.extend(spaced.split())
    return k
def BoW(file_name, stop_name):
    """Return [word, count] pairs for the non-stop words of the file.

    Pairs appear in order of each word's first occurrence.
    """
    all_words = words(file_name)
    stops = stop_words(stop_name)
    tally = {}
    for word in all_words:
        if word in stops:
            continue
        tally[word] = tally.get(word, 0) + 1
    # dicts keep insertion order, i.e. first-occurrence order.
    return [[word, n] for word, n in tally.items()]
def f_hashing(file_name, stop_name, M):
    """Return [hash, count] pairs for the non-stop words, ordered by hash.

    Each kept word is hashed as sum(ord(c) * 37**i) modulo M.
    """
    all_words = words(file_name)
    stops = stop_words(stop_name)
    tally = {}
    for word in all_words:
        if word in stops:
            continue
        code = sum(ord(ch) * 37 ** i for i, ch in enumerate(word)) % M
        tally[code] = tally.get(code, 0) + 1
    return [[code, tally[code]] for code in sorted(tally)]

#----------------------------------------------------------
# Driver: prompt for the file and mode, then print the report.
stop_name = 'stopwords.txt'
file_name = input('File name = ')
t = input('Use feature hashing ? (y,Y,n,N) ').lower()
while t != 'y' and t != 'n':
    print('Try again.')
    t = input('Use feature hashing ? (y,Y,n,N) ').lower()
if t == 'y':
    M = int(input('M = '))
print('-------------------')
print('char count =', char_count(file_name))
print('alphanumeric count =', alp_count(file_name))
print('line count =', line_count(file_name))
print('word count =', len(words(file_name)))
if t == 'y':
    print('BoW =', f_hashing(file_name, stop_name, M))
else:
    print('BoW =', BoW(file_name, stop_name))


    
# 6330565721 (4.00) 383 (2021-03-22 21:11)
# Ask for the file name and the hashing choice.
file_name = input('File name = ')
x = input("Use feature hashing ? (y,Y,n,N) ")
while x not in ('n', 'N', 'y', 'Y'):
    print('Try again.')
    # The retry has to be stored back in x: the original assigned it to an
    # unrelated name, so an invalid first answer looped forever.
    x = input("Use feature hashing ? (y,Y,n,N) ")
if x == 'y' or x == 'Y':
    y = int(input('M = '))  # modulus for fhash
print('-------------------')
def fhash(o, p):
    """Return sum(ord(o[i]) * 37**i) modulo p."""
    total = sum(ord(ch) * 37 ** i for i, ch in enumerate(o))
    return total % p
def count_words(o):
    """Return how many entries of the module-level list h equal o."""
    return sum(1 for entry in h if entry == o)
# Report section. The original referenced several undefined names (ff, c,
# BoW2, BoW, k), passed a list as the hash modulus and compared q[0] with
# q[-1]; it is rewritten around a single tokenised word list.
with open('stopwords.txt', 'r') as s:
    ss = s.read().strip().split()
with open(file_name, 'r') as z:
    raw_lines = z.readlines()
z2 = ''.join(raw_lines).strip()

# Characters of the stripped text, newlines excluded.
ct1 = len(z2) - z2.count('\n')
print('char count =', ct1)

ct2 = sum(1 for ch in z2 if ch.isalnum())
print('alphanumeric count =', ct2)

ct3 = len(raw_lines)
print('line count =', ct3)

# Lower-cased words: maximal alphanumeric runs of the text. Kept in the
# module-level name h because count_words() reads it.
h = ''.join(ch if ch.isalnum() else ' ' for ch in z2).lower().split()
ct4 = len(h)
print('word count =', ct4)

kept = [word for word in h if word not in ss]
if x == 'y' or x == 'Y':
    keys = [fhash(word, y) for word in kept]  # y holds M from the prompt
else:
    keys = kept
BW = [[key, keys.count(key)] for key in sorted(set(keys))]
print('BoW =', BW)


# 6330566321 (30.00) 384 (2021-03-22 22:29)
def get_words(txt):
    """Return the lower-cased maximal runs of ASCII letters/digits in txt."""
    spaced = ''.join(
        c if ('a' <= c <= 'z') or ('0' <= c <= '9') else ' '
        for c in txt.lower()
    )
    return spaced.split()
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    total = 0
    for i, ch in enumerate(w):
        total += ord(ch) * 37 ** i
    return total % M
def BoW(words, M):
    """Return the sorted [key, count] pairs for words.

    When M is non-zero each word is first replaced by fhash(word, M);
    M == 0 means the words themselves are the keys.
    """
    keys = words if M == 0 else [fhash(word, M) for word in words]
    distinct = []
    for key in keys:
        if key not in distinct:
            distinct.append(key)
    bag = [[key, keys.count(key)] for key in distinct]
    bag.sort()
    return bag
def remove_stop(words, stopwords):
    """Return the words that are not in stopwords, order preserved."""
    return [word for word in words if word not in stopwords]

# Driver: prompt for the file and mode, then print the statistics and BoW.
M = 0  # 0 means "no feature hashing" for BoW()
file = input('File name = ')
useFhash = input('Use feature hashing ? (y,Y,n,N) ')
while useFhash not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    useFhash = input('Use feature hashing ? (y,Y,n,N) ')
if useFhash in ['y', 'Y']:
    M = input('M = ')
    # M must be a positive decimal integer. The original compared the whole
    # string against '0'..'9' instead of each character, so it rejected
    # values such as "95" and let "1a" or "" through to int(), which raised.
    while not (M.isdecimal() and int(M) > 0):
        print('Try again.')
        M = input('M = ')
    M = int(M)
with open(file) as f:
    txt = f.read().lower()
charCount = len(txt) - txt.count('\n')
print('-------------------')
print('char count =', charCount)
# One line per newline, plus a final line that lacks a terminator. Adding 1
# unconditionally over-counted files ending with a newline (and empty files).
lineCount = txt.count('\n') + (1 if txt and not txt.endswith('\n') else 0)
alphanumCount = sum(1 for c in txt if ('a' <= c <= 'z') or ('0' <= c <= '9'))
print('alphanumeric count =', alphanumCount)
print('line count =', lineCount)
words = get_words(txt)

wordCount = len(words)
print('word count =', wordCount)

with open('stopwords.txt') as f:
    stopwords = f.read().lower().split()
words = remove_stop(words, stopwords)
print('BoW =', BoW(words, M))

# 6330567021 (22.75) 385 (2021-03-20 21:59)
def no_sign(slice):
    """Return slice with each listed punctuation character replaced by a space."""
    signs = '():;\'\"\\/,.?'
    return ''.join(' ' if ch in signs else ch for ch in slice)
def list_2_string(l):
    """Concatenate str() of every element of l into one string."""
    return ''.join(map(str, l))
def fhash(list_word, M):
    """Return sum(ord(list_word[i]) * 37**i) modulo M."""
    return sum(ord(ch) * 37 ** i for i, ch in enumerate(list_word)) % M
def no_repeat(slice):
    """Return the distinct elements of slice in first-occurrence order."""
    once_word = []
    for item in slice:
        if item not in once_word:
            once_word.append(item)
    return once_word
def bow_y_1(slice, m):
    """Return [hash, count] pairs for the words in slice, ordered by hash."""
    hashes = [fhash(word, m) for word in slice]
    pairs = []
    for value in sorted(no_repeat(hashes)):
        pairs.append([value, hashes.count(value)])
    return pairs
def bow_n_coop(slice):
    """Return [word, count] pairs for slice in first-occurrence order."""
    pairs = []
    for word in no_repeat(slice):
        pairs.append([word, slice.count(word)])
    return pairs
def list_present_y(duo_list):
    """Format [key, count] pairs as '[[k, c], ...]', skipping zero counts."""
    parts = ['[' + str(pair[0]) + ', ' + str(pair[1]) + ']'
             for pair in duo_list if pair[1] != 0]
    return '[' + ', '.join(parts) + ']'
def list_present_n(bow_of_n):
    """Format [word, count] pairs as "[['w', c], ...]", skipping zero counts."""
    parts = ["['" + str(pair[0]) + "', " + str(pair[1]) + ']'
             for pair in bow_of_n if pair[1] != 0]
    return '[' + ', '.join(parts) + ']'
def string_from_list(slice):
    """Flatten a list of string lists into one string, a space after each item."""
    return ''.join(item + ' ' for group in slice for item in group)
def splt_list_to_list(slice):
    """Flatten one level of nesting: a list of lists becomes a single list."""
    return [item for group in slice for item in group]
def clear_list_but_repeat_nosign_nospace_small_letter(slice):
    """Return the lower-cased words of slice with punctuation removed.

    slice is a list of word lists; duplicates are kept.
    """
    lowered = string_from_list(slice).lower()
    return no_sign(lowered).split()
def stop_words_layer(all_word_list, word_2_stop):
    """Return the words of all_word_list that are not in word_2_stop."""
    return [word for word in all_word_list if word not in word_2_stop]
#file_sample = open('sample.txt','w')
##file_sample.write('It was the best of times,\nit was the worst of times,\nit was the age of wisdom.\n"555"')
##file_sample.write('one bad bitch and she do what  I say so\n0.2 big dollar and a big ass,\nfaygo')
#file_sample.close()
#file_stop = open('stopwords.txt','w')
#file_stop.write('it they\nthe a an\nof on in at\nis am say so are was were')
#file_stop.close()
# Load the stop words and the input file. The original opened three file
# handles and never closed any of them; each file is now read inside a
# `with` block, and the input file is read only once.
with open('stopwords.txt', 'r') as file_stopper:
    stop_word = [jline.split() for jline in file_stopper]
word_2_stop = splt_list_to_list(stop_word)  # flat list of stop words

file_name = input('File name = ').strip()
with open(file_name, 'r') as file:
    raw_lines = file.readlines()
word_list = [line.split() for line in raw_lines]   # words of each line
each_line = [line.strip() for line in raw_lines]   # stripped lines
def gogoy(word_list, m, each_line):
    """Print the statistics and the hashed bag of words.

    word_list -- list of per-line word lists
    m         -- modulus for fhash
    each_line -- list of stripped lines

    The last label is now 'BoW = ', matching gogon; it used to be 'Bow = '.
    """
    all_word_list = clear_list_but_repeat_nosign_nospace_small_letter(word_list)
    kept_words = stop_words_layer(all_word_list, word_2_stop)
    print('-------------------')
    print('char count = ' + str(len(list_2_string(each_line))))
    print('alphanumeric count = ' + str(len(list_2_string(all_word_list))))
    print('line count = ' + str(len(each_line)))
    print('word count = ' + str(len(all_word_list)))
    print('BoW = ' + list_present_y(bow_y_1(kept_words, m)))
def gogon(word_list, each_line):
    """Print the statistics and the un-hashed bag of words.

    word_list -- list of per-line word lists
    each_line -- list of stripped lines
    """
    all_word_list = clear_list_but_repeat_nosign_nospace_small_letter(word_list)
    kept_words = stop_words_layer(all_word_list, word_2_stop)
    bow_of_n = bow_n_coop(kept_words)
    report = [
        '-------------------',
        'char count = ' + str(len(list_2_string(each_line))),
        'alphanumeric count = ' + str(len(list_2_string(all_word_list))),
        'line count = ' + str(len(each_line)),
        'word count = ' + str(len(all_word_list)),
        'BoW = ' + list_present_n(bow_of_n),
    ]
    for text in report:
        print(text)
# Ask until the answer is exactly one of y/Y/n/N, then run the matching
# report. `hash in 'yYnN'` was a substring test: it accepted "" (treated as
# "no") and strings such as "yY".
hash = input('Use feature hashing ? (y,Y,n,N) ').strip()
while hash not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    hash = input('Use feature hashing ? (y,Y,n,N) ').strip()
if hash == 'y' or hash == 'Y':
    m_order = int(input('M = '))
    gogoy(word_list, m_order, each_line)
else:
    gogon(word_list, each_line)

                


# 6330568621 (21.40) 386 (2021-03-21 14:41)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(char) * 37 ** idx for idx, char in enumerate(w)) % M

def bowList(word_list, stopwords, isDo, M):
    """Build the bag of words for word_list, ignoring stopwords.

    word_list -- words in reading order
    stopwords -- words to leave out
    isDo      -- truthy to bucket the words with fhash(word, M)
    M         -- hash modulus (unused when isDo is falsy)

    Returns [word, count] pairs in first-occurrence order, or, when hashing,
    [hash, count] pairs ordered by hash. Counts are kept in dicts, so memory
    no longer grows with M and the repeated list searches are gone.
    """
    word_freq = {}
    for word in word_list:
        if word not in stopwords:
            word_freq[word] = word_freq.get(word, 0) + 1
    if not isDo:
        return [[word, n] for word, n in word_freq.items()]
    buckets = {}
    for word, n in word_freq.items():
        code = fhash(word, M)
        buckets[code] = buckets.get(code, 0) + n
    return [[code, buckets[code]] for code in sorted(buckets)]

def count(text):
    """Return (alphanumeric count, word count, word list) for text.

    The text is lower-cased; characters that are neither alphabetic nor
    digits act as word separators.
    """
    alp_count = 0
    pieces = []
    for char in text.lower():
        if char.isalpha() or char.isdigit():
            alp_count += 1
            pieces.append(char)
        else:
            pieces.append(" ")
    tokens = "".join(pieces).split()
    return alp_count, len(tokens), tokens

def read_file(fname):
    """Read fname and return its statistics.

    Returns (char_count, alp_count, line_count, word_count, word_list).
    char_count is the total length of the stripped lines.

    The stripped lines are joined with a space before tokenising. They used
    to be concatenated directly, which glued the last word of a line to the
    first word of the next ("wisdom" + "it" became "wisdomit").
    """
    stripped = []
    with open(fname, "r") as f:
        for line in f:
            stripped.append(line.strip())
    line_count = len(stripped)
    char_count = sum(len(text) for text in stripped)
    alp_count, word_count, word_list = count(" ".join(stripped))
    return char_count, alp_count, line_count, word_count, word_list


# Driver: prompt, read the stop words and the file, print the report.
file_name = input("File name = ")
BoW = input("Use feature hashing ? (y,Y,n,N) ")
while BoW not in ("N", "n", "Y", "y"):
    print("Try again.")
    BoW = input("Use feature hashing ? (y,Y,n,N) ")

# The stop-word file is now closed after reading (it used to stay open).
stopwords = list()
with open("stopwords.txt") as stop_file:
    for line in stop_file:
        stopwords += line.strip().split()
char_count, alp_count, line_count, word_count, word_list = read_file(file_name)
if BoW.upper() == "Y":
    M = int(input("M = "))
    BowList = bowList(word_list, stopwords, True, M)
else:
    BowList = bowList(word_list, stopwords, False, 0)

# print() already puts a space between its arguments, so the labels must not
# end with one; the separator line and the "BoW" spelling follow the report
# format used by the other programs in this file.
print("-------------------")
print("char count =", char_count)
print("alphanumeric count =", alp_count)
print("line count =", line_count)
print("word count =", word_count)
print("BoW =", BowList)

# 6330570821 (25.15) 387 (2021-03-21 18:24)

#===========================(DefFunc)====================================#
def flash(w, M):
    """Return the feature hash of w: sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(e) * 37 ** t for t, e in enumerate(w)) % M
def char_count(filename):
    """Return the number of characters in the file, line terminators excluded.

    The file is opened with a context manager so it is closed on error too.
    """
    with open(filename, 'r') as file:
        return sum(len(line[:-1] if line[-1] == '\n' else line)
                   for line in file)
def alphanumeric_count(filename):
    """Return how many characters of the file satisfy str.isalnum().

    The file is opened with a context manager so it is closed on error too.
    """
    with open(filename, 'r') as file:
        return sum(1 for line in file for ch in line if ch.isalnum())
def line_count(filename):
    """Return the number of lines in the file (closed via context manager)."""
    with open(filename, 'r') as file:
        return sum(1 for _ in file)
def word_count(filename):
    """Return the number of words in the file.

    A word is a maximal run of characters for which str.isalnum() is true.
    The file is read inside a `with` block so the handle is always closed;
    the old per-token length check was redundant because split() never
    yields empty strings.
    """
    with open(filename, 'r') as file:
        text = file.read()
    spaced = ''.join(ch if ch.isalnum() else ' ' for ch in text)
    return len(spaced.split())
def Bownorm(filename):
    """Return [word, count] pairs for the non-stop words of the file.

    Words are maximal alphanumeric runs, lower-cased before filtering and
    counting, so "Times" and "times" share one entry (they used to be
    counted separately). Pairs are in first-occurrence order. Stop words
    come from the module-level list stopword. The file is now closed.
    """
    with open(filename, 'r') as file:
        text = file.read()
    tokens = ''.join(ch if ch.isalnum() else ' ' for ch in text).lower().split()
    counts = {}
    for token in tokens:
        if token in stopword:
            continue
        counts[token] = counts.get(token, 0) + 1
    return [[word, n] for word, n in counts.items()]
def Bowflash(filename):
    """Return [hash, count] pairs for the non-stop words of the file.

    Words are maximal alphanumeric runs, lower-cased before hashing, so
    "Times" and "times" land in the same bucket (the original hashed the
    original spelling). Pairs are in first-occurrence order of each hash.
    Reads the module-level names stopword and M. The file is now closed.
    """
    with open(filename, 'r') as file:
        text = file.read()
    tokens = ''.join(ch if ch.isalnum() else ' ' for ch in text).lower().split()
    counts = {}
    for token in tokens:
        if token in stopword:
            continue
        code = flash(token, M)
        counts[code] = counts.get(code, 0) + 1
    return [[code, n] for code, n in counts.items()]

#===========================(DefFunc)====================================#

# Driver: prompt, load the stop words, print the counts and the BoW.
char_c = alphanumeric_c = line_c = word_c = 0

file_name = input('File name = ').strip()
hashingcmd = input('Use feature hashing ? (y,Y,n,N) ')
while hashingcmd not in ['y', 'Y', 'n', 'N']:
    print('Try again.')
    hashingcmd = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = hashingcmd.lower() == 'y'
if use_hash:
    M = int(input('M = '))

# Stop words, read by Bownorm/Bowflash through the module-level name.
stopword = []
stop = open('stopwords.txt', 'r')
for line in stop:
    stopword.extend(line.strip().split())
stop.close()

# Counts
char_c = char_count(file_name)
print('-------------------')
print('char count =', str(char_c))
alphanumeric_c = alphanumeric_count(file_name)
print('alphanumeric count =', str(alphanumeric_c))
line_c = line_count(file_name)
print('line count =', str(line_c))
word_c = word_count(file_name)
print('word count =', str(word_c))

# Bag of words
BoW = Bowflash(file_name) if use_hash else Bownorm(file_name)
print('BoW =', BoW)
# 6330571421 (17.78) 388 (2021-03-21 16:29)
#Prog-08: Bag-of-words
#6330571421 (17.78) Name Anik Romyanon
def feature_hashing(word, M):
    """Return sum(ord(word[i]) * 37**i) modulo int(M)."""
    value = sum(ord(ch) * 37 ** i for i, ch in enumerate(word))
    return value % int(M)
def cleaning(wordlist):
    """Split every entry of wordlist on non-alphanumeric characters.

    Characters whose lower-case form is not an ASCII letter or digit become
    separators; the remaining pieces keep their original case.
    """
    allowed = '1234567890abcdefghijklmnopqrstuvwxyz'
    cleaned = []
    for word in wordlist:
        spaced = ''.join(e if e.lower() in allowed else ' ' for e in word)
        cleaned.extend(spaced.split())
    return cleaned
# bool: whether the words should be returned with symbols stripped out
def reading(filename, bool):
    """Read filename and return its whitespace-separated words.

    When bool is True the words are passed through cleaning() and the
    result is the tuple (wordlist, linelist), where linelist holds the raw
    lines. Otherwise only the uncleaned word list is returned.
    The file is opened with a context manager so it is closed on error too.
    """
    wordlist = []
    linelist = []
    with open(filename, 'r') as file:
        for line in file:
            wordlist.extend(line.split())
            linelist.append(line)
    if bool == True:
        return cleaning(wordlist), linelist
    return wordlist
def counting(filename):
    """Return the number of characters in the file, newlines excluded.

    The original sliced each line as line[:find('\\n') - 1][1:], which
    dropped the first character and the last character before the newline,
    so every line was under-counted.
    """
    with open(filename, 'r') as file:
        return sum(len(line.rstrip('\n')) for line in file)
def alph_counting(filename):
    """Return how many ASCII letters and digits ``filename`` contains.

    A character counts when its lower-cased form is in ``0-9a-z``.  The
    earlier version cut each line at ``find('\\n') - 1`` and therefore
    ignored the last character before every newline.
    """
    allowed = '1234567890abcdefghijklmnopqrstuvwxyz'
    total = 0
    with open(filename,'r') as file:
        for line in file:
            total += sum(1 for ch in line if ch.lower() in allowed)
    return total
def bag_of_words(wordsinbag,bool):
    """Count how often each feature occurs.

    Args:
        wordsinbag: list of words to count.
        bool: when equal to True the user is asked for ``M`` and every word
            is replaced by ``feature_hashing(word, M)`` before counting.

    Returns:
        A list of ``[feature, count]`` pairs in order of first appearance.
    """
    data = wordsinbag
    if bool == True:
        M = input('M = ')
        data = [feature_hashing(e,M) for e in wordsinbag]
    # One pass with a dict instead of rescanning the list for every element;
    # dicts keep insertion order, so first-appearance order is preserved.
    counts = {}
    for e in data:
        counts[e] = counts.get(e, 0) + 1
    return [[item, times] for item, times in counts.items()]
#---------------------------------------------
# Driver: ask for the input file and the hashing choice, print the four
# counters, then print the sorted bag of words.
interestedwords =[]

file_name = input("File name = ")
# Keep prompting until one of the four accepted answers is given.
while True:
    hash = input('Use feature hashing ? (y,Y,n,N) ')
    if hash in ['y','Y','n','N']:
        break
    print('Try again.')

stopwordlist = reading('stopwords.txt',False)  # read the stop words
filewordlist,linelist = reading(file_name,True)  # read the input file

# The values reported in the header
charcount = counting(file_name)
alphcount = alph_counting(file_name)
line = len(linelist)
wordcount = len(filewordlist)
print('-------------------')
print('char count = '+str(charcount))
print('alphanumeric count = '+str(alphcount))
print('line count = '+str(line))
print('word count = '+str(wordcount))
# Second cleaning pass: drop stop words (compared case-insensitively)
interestedwords = [e for e in filewordlist if e.lower() not in stopwordlist]
# ``hash`` is one of y/Y/n/N here, so this is True exactly for y/Y.
bagofwords = bag_of_words(interestedwords,hash in 'yY')
print('BoW = '+str(sorted(bagofwords)))

# 6330572021 (30.00) 389 (2021-03-22 15:09)

def fhash(w,m):
    """Return the base-37 polynomial hash of ``w`` modulo ``m``.

    Character ``i`` of ``w`` contributes ``ord(char) * 37**i``.
    """
    return sum(ord(ch)*(37**pos) for pos, ch in enumerate(w)) % m
    
def bow(sen):
    """Return ``[item, count]`` pairs for ``sen`` in first-appearance order."""
    distinct = []
    for item in sen:
        if item not in distinct:
            distinct.append(item)
    # Count by equality against the whole sequence for each distinct item.
    return [[item, sum(1 for other in sen if other == item)]
            for item in distinct]

# Driver: read the choices, gather all statistics in one pass over the file,
# then print them followed by the sorted bag of words.
file_name=input('File name=')
while True:
    f=input('Use feature hashing ? (y,Y,n,N)')
    if f in ('y','Y','n','N'):
        break
    print('Try again.')
if f in ('y','Y'):
    m=input('M=')  # kept as text; converted with int() when hashing

file=open(file_name,'r')
stopw=open('stopwords.txt','r')
lines=stopw.readlines()
stopw.close()
lines=[line.strip() for line in lines]
# Stop words: every whitespace-separated token, lower-cased.
stop=' '.join(entry.lower() for entry in lines).split()
char=0  # every character read, newlines included
al=0    # alphanumeric characters
l=0     # lines
pieces=[]
for line in file:
    char+=len(line)
    for c in line:
        if c.isalnum():
            al+=1
            pieces.append(c)
        else:
            pieces.append(' ')
    l+=1
sen=''.join(pieces)
s=sen.lower().split()
sent=[p for p in s if p not in stop]

print('-------------------')
print('char count = ',char-l+1)
print('alphanumeric count = ',al)
print('line count = ',l)
print('word count = ',len(s))
if f in ('y','Y'):
    BoW=bow([fhash(word,int(m)) for word in sent])
else:
    BoW=bow(sent)
BoW.sort()
print('BoW = ',BoW)
file.close()

# 6330573721 (13.00) 390 (2021-03-21 17:07)

def char_count(w):
    """Return the number of items in ``w`` minus ``line_count(w)``."""
    total = sum(1 for _ in w)
    return total - line_count(w)
def alphanumeric_count(w):
    """Return how many items of ``w`` satisfy ``str.isalnum``."""
    return sum(1 for ch in w if ch.isalnum())
def line_count(w):
    """Return the number of newline characters found in ``w``."""
    return sum(1 for ch in w if ch == '\n')
def word_count(w):
    """Return the number of items produced by iterating over ``w``."""
    return sum(1 for _ in w)
def fhash_calc(w, m): # w = list of character
    """Return the base-37 polynomial hash of the alphanumeric items of ``w``.

    Non-alphanumeric characters are skipped before hashing.  A token with no
    alphanumeric character at all (for example ``"--"``) used to raise
    IndexError on ``w[0]``; it now hashes to ``0 % m``.
    """
    G = 37
    chars = [ch for ch in w if ch.isalnum()]
    return sum(ord(ch) * (G ** pos) for pos, ch in enumerate(chars)) % m
def fhash(w, m): # w = list of string
    """Hash every string of ``w`` and count how often each hash occurs.

    Returns ``[hash, count]`` pairs in order of first appearance; each hash
    comes from ``fhash_calc`` applied to the string's characters.
    """
    hashes = [fhash_calc(list(word), m) for word in w]
    seen = []
    res = []
    for value in hashes:
        if value not in seen:
            seen.append(value)
            res.append([value, hashes.count(value)])
    return res

# Driver: ask for the file and the hashing choice, print the four counters,
# then print either the hashed or the plain bag of words.
# The two answer branches used to repeat the whole reporting section; the
# shared part now runs once.  Prompts and printed text are unchanged.
file_name = input("File name = ")
while True:
    feature = input("Use feature hashing ? (y,Y,n,N) ").lower()
    if feature in ['y', 'n']:
        break
    print("Try again.")

# Close the stop-word file as soon as it has been read.
with open('stopwords.txt', 'r') as stop_file:
    stop_words = stop_file.read().split()
if feature == 'y':
    M = int(input("M = "))
print('-------------------')
with open(file_name, 'r') as f:
    text = f.read()
print("char count = " + str(char_count(text)))
print("alphanumeric count = " + str(alphanumeric_count(text)))
print("line count = " + str(line_count(text)))
print("word count = " + str(word_count(text.split())))
# Stop words are matched against the raw, lower-cased token.
tokens = [i for i in text.split() if i.lower() not in stop_words]
if feature == 'y':
    words = tokens
    print(fhash(words, M))
else:
    # Keep only the alphanumeric characters of every remaining token.
    words = [''.join(j for j in i if j.isalnum()) for i in tokens]
    res = []
    no_dups = []
    for i in words:
        if i not in no_dups:
            no_dups.append(i)
            res.append([i, words.count(i)])
    print(res)


# 6330574321 (0.00) 391 (2021-03-22 22:28)

# Setup: read the user's choices and load both text files, lower-cased.
file=input("File name = ")
# The path used to be hard-coded to "sample.txt", which ignored the answer
# typed above; the requested file is read instead.
file1=file
file2="stopwords.txt"
fea=input("Use feature hashing ? (y,Y,n,N) ")
a=["y","Y","n","N"]
b=["y","Y"]

while fea not in a:
    print("Try again.")
    fea=input("Use feature hashing ? (y,Y,n,N) ")
if fea in b:
    M=int(input("M = "))

# Context managers make sure both handles are closed after reading.
with open(file1,"r") as source:
    f1=source.read().lower()
with open(file2,"r") as source:
    f2=source.read().lower()
def fhash(j,p):
    """Return the base-37 polynomial hash of ``j`` modulo ``p`` as an int."""
    total = sum(ord(ch)*(37**pos) for pos, ch in enumerate(j))
    return int(total % p)

# Report: scan the lower-cased text ``f1`` once, print the four counters,
# drop the stop words found in ``f2`` and print the sorted bag of words.
k=0    # characters other than newlines
l=1    # lines (corrected after the scan)
n=0    # characters in 0-9 / a-z
w=[]   # words of the input text
s=[]   # stop words
o=[]   # resulting [feature, count] pairs
x=""
y=""
for i in f1:
    if i!="\n":
        k=k+1
    else:
        l=l+1
    if "0"<=i<="9" or "a"<=i<="z":
        n=n+1
        x=x+i
    elif x!="":
        w.append(x)
        x=""
# Flush the word still being built when the text ends.  This used to call
# ``word.append``, a name that does not exist, and crashed with NameError.
if x!="":
    w.append(x)
    x=""
# ``l`` started at 1 and grew per newline, which is one too many when the
# text is empty or ends with a newline.
if f1=="" or f1.endswith("\n"):
    l=l-1

print("-------------------")
K=str(k)
print("char count = "+K)
N=str(n)
print("alphanumeric count = "+N)
L=str(l)
print("line count = "+L)
W2=str(len(w))
print("word count = "+W2)

# Split the stop-word text with the same 0-9 / a-z rule.
for e in f2:
    if "0"<=e<="9" or "a"<=e<="z":
        y=y+e
    elif y!="":
        s.append(y)
        y=""
if y!="":
    s.append(y)

# Remove every occurrence of every stop word.
w=[e for e in w if e not in s]

w1=len(w)
if fea in b:
    w=[fhash(e,M) for e in w]

# Count each feature once, then sort the pairs.
r={}
for e in w:
    r[e]=r.get(e,0)+1
o=[[e,c] for e,c in r.items()]
o.sort()
O=str(o)
print("BoW = ",O)



# 6330575021 (30.00) 392 (2021-03-21 18:03)
def dosam(file):
    """Return the lower-cased alphanumeric words of the text file ``file``.

    Every non-alphanumeric character is treated as a separator.  Decoding
    errors are ignored.  The file is now closed after reading.
    """
    with open(file,'r',errors = 'ignore') as source:
        text = source.read().lower()
    return ''.join(ch if ch.isalnum() else ' ' for ch in text).split()
#-------------------------------------------------
def dostop():
    """Return the whitespace-separated entries of ``stopwords.txt`` as a list.

    Decoding errors are ignored.  The file is now closed after reading.
    """
    with open('stopwords.txt','r',errors = 'ignore') as source:
        return source.read().split()
#-------------------------------------------------
def alc():
    """Return the total number of characters in the words of global ``file``.

    The words come from ``dosam(file)``.  The file used to be re-read and
    re-tokenised once per word; it is now read a single time.
    """
    return sum(len(word) for word in dosam(file))
#-------------------------------------------------
def cc(file) :
    """Return the number of characters in ``file``, line terminators excluded.

    Decoding errors are ignored.  The file is now closed after reading.
    """
    total = 0
    with open(file,'r',errors = 'ignore') as source:
        for line in source:
            total += (len(line) - 1) if line.endswith('\n') else len(line)
    return total
#-------------------------------------------------
def lc(file) :
    """Return the number of lines in ``file``.

    Decoding errors are ignored.  The file is now closed after reading.
    """
    with open(file,'r',errors = 'ignore') as source:
        return sum(1 for _ in source)
#-------------------------------------------------
def wc() :
    """Return how many words ``dosam`` finds in the global ``file``."""
    return len(dosam(file))
#-------------------------------------------------
def bownofh(file):
    """Return the plain bag of words of ``file``.

    Words listed by ``dostop()`` are dropped.  The rest are sorted once
    (previously after every append) and grouped by ``arrange``, whose result
    is returned unchanged.
    """
    stop = dostop()
    kept = sorted(word for word in dosam(file) if word not in stop)
    return arrange(kept)
#-------------------------------------------------
def arrange(sen) :
    """Run-length encode the already sorted sequence ``sen``.

    Returns ``[text, count]`` pairs, where ``text`` is the ``str()`` form of
    each distinct neighbouring value, or the string ``'[]'`` when ``sen`` is
    empty.  The pairs are rebuilt from a space-separated text, so values are
    expected not to contain whitespace.
    """
    if len(sen) == 0 :
        return '[]'
    current = sen[0]
    run = 0
    pieces = []
    for item in sen:
        if item != current:
            pieces += [str(current), str(run)]
            current = item
            run = 1
        else:
            run += 1
    pieces += [str(current), str(run)]
    tokens = " ".join(pieces).split()
    return [[tokens[pos], int(tokens[pos + 1])]
            for pos in range(0, len(tokens) - 1, 2)]
#-------------------------------------------------
def rbof() :
    """Return the feature-hashed bag of words for the global ``file``.

    Words not listed by ``dostop()`` are hashed with ``fhash(word, M)``
    (global ``M``), sorted and grouped by ``arrange``.  The result is a list
    of ``[hash, count]`` pairs with integer hashes, or the string ``'[]'``
    when no word is left.
    """
    stop = dostop()
    kept = [word for word in dosam(file) if word not in stop]
    # Sorting once is enough; it used to happen after every append.
    hashes = sorted(fhash(word, M) for word in kept)
    grouped = arrange(hashes)
    # ``arrange`` signals "nothing to group" with the string '[]'.  The old
    # ``len(l) == 2`` test also matched a real result holding exactly two
    # hash buckets and threw it away.
    if grouped == '[]' :
        return '[]'
    return [[int(key), count] for key, count in grouped]
#-------------------------------------------------
def fhash(w,M) :
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    G = 37
    codes = [ord(ch) for ch in w]
    return sum(code*(G**pos) for pos, code in enumerate(codes)) % M
#-------------------------------------------------
# Driver: read the choices, then print the four counters and the bag of
# words.  ``file`` and ``M`` are read as globals by the helper functions.
file = input('File name = ')
x = input('Use feature hashing ? (y,Y,n,N) ')
while x not in ('n', 'N', 'y', 'Y') :
    print('Try again'+'.')
    x = input('Use feature hashing ? (y,Y,n,N) ')
if x in ('y', 'Y') :
    M = int(input("M = "))
print('-------------------')
print('char count =',cc(file))
print('alphanumeric count =',alc())
print('line count =',lc(file))
print('word count =',wc())
if x in ('y', 'Y') :
    print('BoW =',rbof())
else :
    print('BoW =',bownofh(file))
    
    

# 6330576621 (22.99) 393 (2021-03-21 23:33)
def remove_pun(s):
    """Split ``s`` into lower-cased ASCII alphanumeric words.

    Returns:
        ``(words, count)`` where ``words`` is the list of words and
        ``count`` is the number of alphanumeric characters seen.

    A word was only stored when a separator followed it, so the final word
    was lost whenever the text ended on a letter or digit.  It is now
    flushed after the loop.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    words = []
    current = ""
    count = 0
    for c in s:
        low = c.lower()
        if low in allowed:
            current += low
            count += 1
        elif current != "":
            words.append(current)
            current = ""
    if current != "":
        words.append(current)
    return words,count
def fhash_list(bow,M):
    """Expand ``[word, count]`` pairs into a flat list of hashes.

    Each word's base-37 polynomial hash modulo ``M`` is appended ``count``
    times, in the order of ``bow``.
    """
    hashes = []
    for entry in bow:
        for _ in range(entry[1]):
            value = sum(ord(c)*(37**j) for j,c in enumerate(entry[0]))
            hashes.append(value%M)
    return hashes
# Driver: read the choices, compute the counters and print the report.
file_name=input("File name = ")
while True:
    feature=input("Use feature hashing ? (y,Y,n,N) ")
    # Compare against the exact answers.  The previous substring test
    # (``feature in "y,Y"``) also accepted "", "," and "y,".
    if feature in ("y","Y"):
        M = int(input("M = "))
        a=1
        break
    elif feature in ("n","N"):
        a=0
        break
    else:
        print("Try again")
# Read each file once and close it right away.
with open(file_name,"r") as fn:
    line=fn.readlines()
char=("").join(line)
with open("stopwords.txt","r") as s:
    st=s.read().split()
x,y=remove_pun(char)
line_count=len(line)
new=[e for e in x if e not in st]
# Count every remaining word once; listing the keys in sorted order gives
# the same pairs as sorting and de-duplicating [word, count] entries.
tally={}
for e in new:
    tally[e]=tally.get(e,0)+1
bownew=[[e,tally[e]] for e in sorted(tally)]
print("-------------------")
print("char count = "+ str(len(char)-char.count("\n")))
print("alphanumeric count = " + str(y))
print("line count = " +str(line_count))
print("word count = " +str(len(x)))
if a==1:
    listfhash=fhash_list(bownew,M)
    buckets={}
    for e in listfhash:
        buckets[e]=buckets.get(e,0)+1
    f3=[[e,buckets[e]] for e in sorted(buckets)]
    print("BoW = " + str(f3))
else:
    print("BoW = " + str(bownew))







# 6330577221 (30.00) 394 (2021-03-22 02:52)
#Prog-08: Bag-of-Words
#6330577221 (30.00) Name Akrachai Kovittayanun
def fhash (w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch)*37**pos for pos, ch in enumerate(w)) % M

# Driver: read the choices, then print the counters and the bag of words.
filename=input('File name = ')
while True:
    bow=input('Use feature hashing ? (y,Y,n,N)) ')
    if bow in ('y','Y','n','N'):
        break
    print('Try again.')
if bow in ('y','Y'):
    m=int(input('M = '))
print('-------------------')

# Stop words: every whitespace-separated token of the file.
s_file=open('stopwords.txt','r')
stopwords=s_file.read().split()
s_file.close()

# Each stripped line is followed by a single space in ``text``.
file = open(filename,'r')
stripped=[line.strip() for line in file]
line_count=len(stripped)
text=''.join(part+' ' for part in stripped)
char=len(text)-line_count
print('char count =',char)

textalnum=''.join(e for e in text if e.isalnum())
alnum=len(textalnum)
print('alphanumeric count =',alnum)

print('line count =',line_count)

# Lower-case the alphanumeric characters; everything else separates words.
wordstext=''.join(e.lower() if e.isalnum() else ' ' for e in text)
wordslist=wordstext.strip().split()
wordcount=len(wordslist)
print('word count =',wordcount)

uniquelist=[e for e in wordslist if e not in stopwords]

fhashlist=[]
if bow in ('y','Y'):
    fhashlist=[fhash(e,m) for e in uniquelist]
    uniquelist=fhashlist

# One [feature, count] pair per feature, at its first occurrence.
output=[]
for i,item in enumerate(uniquelist):
    if item not in uniquelist[:i]:
        output.append([item,uniquelist.count(item)])
print('BoW =',output)
file.close()
# 6330578921 (25.10) 395 (2021-03-20 14:28)

def remove_punc(t):
    """Return ``t`` with the listed punctuation characters removed."""
    banned = [ '(', ')', '-', '_', '[', ']' ,'"' ,"'" ,';', ':', '>', '<','.',',','\\','/' ]
    return ''.join(e for e in t if e not in banned)
def remove_stopword(t,s):
    """Concatenate the items of ``t`` that are not contained in ``s``."""
    return ''.join(e for e in t if e not in s)
def countitem(l):
    """Return a mapping from each item of ``l`` to its number of occurrences.

    The accumulator used to be a list indexed by the item itself, which
    raised IndexError or TypeError on the first element.  A dict keyed by
    item is used instead; keys follow first-appearance order.
    """
    result = {}
    for i in l:
        result[i] = result.get(i, 0) + 1
    return result
def fhashword(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    total = sum(ord(ch)*37**pos for pos, ch in enumerate(w))
    return total % M

#--------------------------------------------------------------------------
# Driver: read the choices, compute the counters and print the report.
# The two answer branches used to be near-identical copies; the shared work
# now runs once and only the final counting differs.
file_name = input('File name = ',)
wantfh    = input('Use feature hashing ? (y,Y,n,N)  ')
while wantfh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    wantfh    = input('Use feature hashing ? (y,Y,n,N)  ')
use_hashing = wantfh in ('y', 'Y')
if use_hashing:
    # Ask for M before the separator so the prompt is not printed inside
    # the report.
    M = int(input('M = ',))
print('-------------------')

with open(file_name,'r') as file:
    lines = [line.strip() for line in file]
with open('stopwords.txt','r') as file:
    lines1 = [line.strip() for line in file]

# All stripped lines, each preceded by one space.
word_input = ''.join(' ' + i for i in lines)
stopwords  = ' '.join(lines1).split()
newword_input = [ch for ch in word_input if ch.isalnum()]

# Lower-case, drop the listed punctuation, split and filter stop words.
kept = [i for i in remove_punc(word_input.lower()).split() if i not in stopwords]
if use_hashing:
    # Sorted hashes, so the pairs come out ordered by hash value.
    features = sorted(fhashword(i,M) for i in kept)
else:
    # Plain words in order of first appearance.
    features = kept
tally = {}
for item in features:
    tally[item] = tally.get(item, 0) + 1
realbow = [[item, times] for item, times in tally.items()]

# Sum of the stripped line lengths.  The previous formula stripped the
# joined text and was off whenever the first or last line was blank.
print('char count =',sum(len(i) for i in lines))
print('alphanumeric count =',len(newword_input))
print('line count =',len(lines))
print('word count =',len(word_input.split()))
print('BoW =',realbow)
    
    
    
    
    
    
    


    
# 6330579521 (30.00) 396 (2021-03-22 12:12)
def fhash(w,m):
    """Return the base-37 polynomial hash of ``w`` modulo ``m``.

    The result used to be assigned inside the loop, so an empty ``w``
    raised UnboundLocalError.  It now hashes to ``0 % m``.
    """
    G=37
    bino=sum(ord(ch)*(G**pos) for pos, ch in enumerate(w))
    return bino%m
def alnum_only(a):
    """Return the maximal runs of alphanumeric characters found in ``a``."""
    runs = []
    start = None  # index where the current run began, if any
    for pos, ch in enumerate(a):
        if ch.isalnum():
            if start is None:
                start = pos
        elif start is not None:
            runs.append(a[start:pos])
            start = None
    if start is not None:
        runs.append(a[start:])
    return runs

# Driver: read the choices, compute the counters and print the report.
print('File name = ',end='')
file_name=input()
print('Use feature hashing ? (y,Y,n,N) ',end='')
while True:
    yesno=input()
    if yesno in ['y','Y','n','N']:
        option=0
        if yesno=='y' or yesno=='Y':
            option=1
            print('M = ',end='')
            M=int(input())
        break
    print('Try again.')
    print('Use feature hashing ? (y,Y,n,N) ',end='')

print('-------------------')
# Both files are read inside context managers so their handles are closed.
with open('stopwords.txt','r') as f:
    stopwords=[word for line in f for word in line.strip().lower().split()]

with open(file_name,'r') as f2:
    listf2=[line.strip().lower() for line in f2]  # lower-cased from the start
charcount=sum(len(line) for line in listf2)
alnumcount=sum(1 for line in listf2 for ch in line if ch.isalnum())

# Tokenise each line once (alnum_only used to be re-run for every word).
list_word=[word for line in listf2 for word in alnum_only(line)]

print('char count =',charcount)
print('alphanumeric count =',alnumcount)
print('line count =',len(listf2))
print('word count =',len(list_word))
aftercut=[y for y in list_word if y not in stopwords]
# [word, count] pairs in order of first appearance.
tally={}
for y in aftercut:
    tally[y]=tally.get(y,0)+1
bow=[[word,times] for word,times in tally.items()]
if option==0:
    print('BoW =',bow)
else:
    # Merge the counts of words that share a hash bucket and list the
    # buckets in ascending order.  Probing every value in range(M) did the
    # same but cost time proportional to M.
    buckets={}
    for word,times in bow:
        h=fhash(word,M)
        buckets[h]=buckets.get(h,0)+times
    bowhash=[[h,buckets[h]] for h in sorted(buckets)]
    print('BoW =',bowhash)

# 6330580021 (26.67) 397 (2021-03-22 03:48)
# Load the input text and the stop words.  Both handles are now closed once
# their content has been read.
with open(input('File name = '),'r') as file:
    fout=file.read()
# NOTE(review): this reads 'stopword.txt' (no trailing "s") - confirm that
# this is the intended file name.
with open('stopword.txt','r') as stop:
    stopword=stop.read().split()
# Lines without their terminators; the empty piece after a trailing newline
# is not a line.
line_list = fout.split('\n')
if line_list[-1]=='':
    line_list=line_list[:-1]
real_fout=''.join(line_list)
def list(fout):
    """Return the lower-cased ASCII alphanumeric words of ``fout``.

    Any character whose lower-cased form is not in ``0-9a-z`` separates
    words.  Note that this name shadows the built-in ``list``.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    spaced = ''.join(i.lower() if i.lower() in allowed else ' ' for i in fout)
    return spaced.split()
def deleate_stopword(fout,stopword):
    """Return the items of ``fout`` that do not appear in ``stopword``."""
    return [word for word in fout if not word in stopword]
def BoW(real_list):
    """Return the sorted list of ``[item, count]`` pairs for ``real_list``."""
    pairs = []
    for item in real_list:
        entry = [item, sum(1 for other in real_list if item == other)]
        if entry not in pairs:
            pairs.append(entry)
    return sorted(pairs)
def fhash(w,M):
    """Hash every word of ``w`` and return sorted ``[hash, count]`` pairs.

    Each word is hashed as the base-37 polynomial of its character codes
    modulo ``M``.
    """
    hashed = [sum(ord(ch)*(37**pos) for pos, ch in enumerate(word)) % M
              for word in w]
    pairs = []
    for value in hashed:
        entry = [value, hashed.count(value)]
        if entry not in pairs:
            pairs.append(entry)
    return sorted(pairs)
# Report: ask for the hashing choice, then print the counters and the bag
# of words built from the text loaded earlier.
listfile=list(fout)
while True:
    a=input('Use feature hashing ? (y,Y,n,N) ')
    if a in ['y','Y','n','N']:
        break
    print('Try again.')
if a in ['y','Y']:
    M=int(input('M = '))
print('-------------------')
print('char count = ',len(real_fout))
# Total length of all words, i.e. the number of alphanumeric characters.
print('alphanumeric count = ',sum(len(word) for word in listfile))
print('line count = ',len(line_list))
print('word count = ',len(listfile))
if a in ['y','Y']:
    print('BoW = ',fhash(deleate_stopword(listfile,stopword),M))
else:
    print('BoW = ',BoW(deleate_stopword(listfile,stopword)))


# 6330583021 (28.00) 398 (2021-03-21 21:07)

def cutstop(file_name):
    """Return the words of ``file_name`` that are not in ``stopwords.txt``.

    Words are maximal runs of ASCII letters and digits, lower-cased; every
    other character separates words.  Both files are opened through a
    context manager so they are closed even if reading fails.
    """
    with open('stopwords.txt', "r" ) as f, open(file_name, "r" ) as t:
        lf = f.read().split()
        text = t.read()
    lt = ''.join(
        i.lower() if ('a' <= i <= 'z' or 'A' <= i <= 'Z' or '0' <= i <= '9') else ' '
        for i in text
    )
    return [word for word in lt.split() if word not in lf]

def four(file):
    """Print the separator line and the four counters for ``file``.

    Prints the character count (newlines excluded), the number of ASCII
    letters and digits, the number of lines and the number of words (runs
    of ASCII letters and digits).  The file is read once instead of being
    reopened for every counter.
    """
    def is_ascii_alnum(ch):
        return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or '0' <= ch <= '9'

    with open(file,"r") as f:
        lines = f.readlines()
    text = ''.join(lines)
    s1 = sum(len(line) - 1 if '\n' in line else len(line) for line in lines)
    s2 = sum(1 for ch in text if is_ascii_alnum(ch))
    s3 = len(lines)
    # Turn every non-word character into a space and count the pieces.
    s5 = len(''.join(ch if is_ascii_alnum(ch) else ' ' for ch in text).split())

    print('-------------------')
    print('char count = '+ str(s1))
    print('alphanumeric count = '+ str(s2))
    print('line count = '+ str(s3))
    print('word count = '+ str(s5))
          

# Driver: read the choices, print the counters through ``four`` and then
# the bag of words.
print('File name =', end=' ' )
file_name= input()

while True :
    print('Use feature hashing ? (y,Y,n,N)', end=' ')
    met = input()
    # Exact match only.  The previous substring test against 'nNyY' let ""
    # and "nN" through, after which neither branch below ran.
    if met in ('n', 'N', 'y', 'Y'):
        met = met.lower()
        break
    print('Try again')
if met == 'n':
    four(file_name)
    # ``cutstop`` does the tokenising and stop-word filtering that used to
    # be repeated inline here.
    l3 = cutstop(file_name)
    tally = {}
    for l in l3:
        tally[l] = tally.get(l, 0) + 1
    # Pairs in order of first appearance.
    bom = [[word, times] for word, times in tally.items()]

    print('BoW = ', bom)
elif met == 'y':
    print('M = ',end="")
    m = int(input())
    four(file_name)
    cutlist = cutstop(file_name)
    tally = {}
    for l in cutlist:
        s = sum(ord(ch)*(37**i) for i, ch in enumerate(l)) % m
        tally[s] = tally.get(s, 0) + 1
    # Ascending hash order.  An empty word list gives an empty result
    # instead of the IndexError raised by ``bowlist[0]``.
    truelist = [[s, tally[s]] for s in sorted(tally)]

    print('BoW = ',end='')
    print(truelist)
   
    
# 6330585221 (30.00) 399 (2021-03-20 21:06)
def fhash(w,M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    return sum(ord(ch)*(37**pos) for pos, ch in enumerate(w)) % M
def edit_string(file):
    """Return the lower-cased alphanumeric words of the text file ``file``.

    Alphanumeric characters and spaces are kept; every other character is
    replaced by a space before splitting on whitespace.
    """
    fo = open(file)
    pieces = []
    for line in fo:
        for e in line.lower():
            keep = e.isalnum() or e == ' '
            pieces.append(e.lower() if keep else ' ')
    fo.close()
    return ''.join(pieces).split()
def sameword(sample):
    """Return the distinct items of ``sample`` in sorted order."""
    ordered = sorted(sample)
    if len(ordered) <= 1:
        return sorted(ordered)
    # Keep an item only where its right-hand neighbour differs; the last
    # item always closes the final group.
    distinct = [cur for cur, nxt in zip(ordered, ordered[1:]) if not cur == nxt]
    distinct.append(ordered[-1])
    return sorted(distinct)
def frequency(list_):
    """Return the group sizes of the sorted ``list_``.

    The result lists, for each distinct value in ascending order, how many
    times it occurs.  An empty input yields the empty string ``''``.
    """
    if len(list_) == 0:
        return ''
    ordered = sorted(list_)
    fre = [1]
    for prev, cur in zip(ordered, ordered[1:]):
        if prev == cur:
            fre[-1] += 1
        else:
            fre.append(1)
    return fre
def BoW(sample, stopwords, feature, M):
    """Build the bag of words for the file ``sample``.

    Args:
        sample: path of the text to analyse.
        stopwords: path of the stop-word file.
        feature: ``'n'`` or ``'N'`` for plain words; anything else hashes
            each word with ``fhash(word, M)``.
        M: modulus used for hashing.

    Returns:
        ``[feature, count]`` pairs in ascending feature order.
    """
    # Read the stop words once.  They used to be re-read from disk for
    # every word of the sample.
    stop = set(edit_string(stopwords))
    edited = [e.lower() for e in edit_string(sample) if e.lower() not in stop]
    if feature.lower() == 'n':
        keys = edited
    else:
        # Keep the hashes as integers so they sort numerically.  Sorting
        # their string form put 10 before 9.
        keys = [fhash(e, M) for e in edited]
    # ``sameword`` and ``frequency`` both follow the sorted order of
    # ``keys``, so their results line up; each is computed once.
    return [[key, times] for key, times in zip(sameword(keys), frequency(keys))]
def main():
    """Prompt for the options, then print the counters and the bag of words."""
    file_name = input('File name = ')
    M = 0
    while True:
        feature = input('Use feature hashing ? (y,Y,n,N) ')
        if feature.lower() in ('n', 'y'):
            break
        print('Try again.')
    if feature.lower() == 'y':
        M = int(input('M = '))
    c_alnum = 0
    c_char = 0
    newlines = 0
    c_line = 0
    fn = open(file_name)
    for line in fn:
        c_alnum += sum(1 for e in line if e.lower().isalnum())
        newlines += line.count('\n')
        c_char += len(line)
        c_line += 1
    print('-'*19)
    # Newlines are included in c_char, so they are subtracted here.
    print('char count =',c_char-newlines)
    print('alphanumeric count =', c_alnum)
    print('line count =', c_line)
    print('word count =', len(edit_string(file_name)))
    print('BoW =',BoW(file_name, 'stopwords.txt', feature, M))
    fn.close()
main()

    
    
    
    


# 6330586921 (24.67) 400 (2021-03-22 23:29)
def remove_punctuation(s):
    """Return ``str(s)`` lower-cased with non-alphanumerics turned to spaces.

    A character is kept when its lower-cased form is in ``0-9a-z``.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    return ''.join(c.lower() if c.lower() in allowed else ' ' for c in str(s))
def read_file(f) :
    """Return the items produced by iterating over ``f`` as a list."""
    return [line for line in f]
def char_count(f): #true
    """Return the number of non-newline characters in the lines of ``f``."""
    return sum(1 for i in f for e in i if e != "\n")
def alphabet_count(f):  #true
    """Return the number of alphanumeric characters in the lines of ``f``."""
    return sum(1 for i in f for e in i if e.isalnum())
def word_count(f): #true
    """Return the total number of words in the lines of ``f``.

    Each line is cleaned with ``remove_punctuation`` and split on
    whitespace.
    """
    return sum(len(remove_punctuation(e).split()) for e in f)
def read_stopword(stopword):
    """Return every whitespace-separated token of ``stopword``, lower-cased."""
    return [i.lower() for line in stopword for i in line.split()]
def count(a):
    """Return sorted ``[token, count]`` pairs for the items of ``a``.

    Every item is first passed through ``remove_punctuation`` and split on
    whitespace, so the tokens are always strings.
    """
    tokens = [q for i in a for q in remove_punctuation(i).split()]
    distinct = []
    for token in tokens:
        if token not in distinct:
            distinct.append(token)
    return sorted([token, tokens.count(token)] for token in distinct)
def remove_stopword(line,stopword):
    """Return the words of ``line`` not in ``stopword``, each followed by a space."""
    return ''.join(i + " " for i in line.split() if i not in stopword)
def BoW_not_have_fhash(File, stopword):
    """Return the plain bag of words for the lines of ``File``.

    Each line is cleaned with ``remove_punctuation``, filtered with
    ``remove_stopword`` and the surviving words are tallied by ``count``.
    """
    cleaned = ''.join(
        remove_stopword(remove_punctuation(i), stopword) + ' ' for i in File
    )
    return count(cleaned.split())
def fhash(w, M):
    """Return the base-37 polynomial hash of ``w`` modulo ``M``."""
    G = 37
    return sum(ord(ch)*(G**pos) for pos, ch in enumerate(w)) % M
def BoW_have_fhash(File, stopword,M):
  """Build the bag of words for the lines in File using feature hashing.

  Lines are normalised with remove_punctuation and filtered with
  remove_stopword; every remaining word is replaced by fhash(word, M) and
  the list of hashes is passed to count.
  """
  cleaned = ''.join(
      remove_stopword(remove_punctuation(line), stopword) + ' ' for line in File
  )
  hashed = [fhash(word, M) for word in cleaned.split()]
  return count(hashed)

# Script entry: read the input file and the stop-word list, ask whether
# feature hashing is wanted, then print the summary counts and the BoW.
file_name = input("File name = ")
# Context managers close both files as soon as their content has been read
# (the handles used to stay open for the rest of the run).
with open(file_name) as source:
    f = read_file(source)
with open('stopword.txt') as stop_source:
    stopword = read_stopword(stop_source)
# Keep asking until the answer is one of y/Y/n/N; M stays False when no
# hashing is requested.
while True:
    input_it = input("Use feature hashing ? (y,Y,n,N) ")
    if input_it == 'Y' or input_it == 'y':
        M = int(input("M = "))
        break
    elif input_it == 'N' or input_it == 'n':
        M = False
        break
    else:
      print("Try again")

print('-------------------')
print("char count = "+ str(char_count(f)) )
print("alphanumeric count = "+ str(alphabet_count(f)))
print("line count = "+ str(len(f)))
print("word count = "+ str(word_count(f)))

# NOTE: M == 0 also compares equal to False and therefore takes the
# non-hashing branch (which avoids a modulo-by-zero).
if M == False :
    print("BoW = " + str(BoW_not_have_fhash(f,stopword)))
else :
    print("BoW = " + str(BoW_have_fhash(f,stopword,M)))

        
        
        
        
        
      

# 6330587521 (26.00) 401 (2021-03-22 14:53)
#Prog-08: Bag-of-words
#6330587521 (26.00) Aunchisa Suwanchatree

#-----------------------------------------------------------------------------------------------------
def choose(ans):
    """Return True for a 'y' answer and False for an 'n' answer (case-insensitive).

    Any other answer prints 'Try again.' and prompts the user again until a
    valid answer is entered.
    """
    answer = ans.lower()
    while answer not in ('y', 'n'):
        print('Try again.')
        answer = input('Use feature hashing ? (y,Y,n,N) ').lower()
    return answer == 'y'
def remove_punc(text):
    """Return text with every non-alphanumeric character replaced by a space."""
    return ''.join(ch if ch.isalnum() else " " for ch in text)
def stopword():
    """Read 'stopwords.txt' and return its words as a list.

    Each line is lower-cased, stripped and passed through remove_punc before
    being split into words.
    """
    pieces = []
    with open('stopwords.txt','r') as sw:
        for raw in sw:
            pieces.append(remove_punc(raw.lower().strip()))
            pieces.append(' ')
    return ''.join(pieces).split()
def delete_stopword(listofword):
    """Return the words of listofword that are not stop words, in order.

    The stop-word list is loaded once per call; it used to be re-read from
    disk for every single word.
    """
    if not listofword:
        # Nothing to filter: do not touch the stop-word file at all.
        return []
    stops = set(stopword())
    return [word for word in listofword if word not in stops]
def fhash(w,m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    G=37
    total=0
    for position, ch in enumerate(w):
        total+=ord(ch)*G**position
    return total%m

#-----------------------------------------------------------------------------------------------------        
# Script entry: gather the answers, scan the file once for all counters,
# then print the summary and the bag of words.
file_name=input('File name = ')
choosing=choose(input('Use feature hashing ? (y,Y,n,N) '))
if choosing:
    m=int(input('M = '))

wordcount=0
linecount=0
chcount=0
aln=0
text=''
# `with` closes the input file even if counting raises.
with open(file_name,'r') as lines:
    for line in lines:
        linecount+=1
        for e in line:
            # count the characters (newlines excluded)
            if e!= '\n':
                chcount+=1

            # count only digits and letters
            if e.isalnum():
                aln+=1
        # word count
        wordcount+= len(remove_punc(line).split())
        # text for bow
        text += remove_punc(line.lower())

text = delete_stopword(text.split())

print('-'*19)
print('char count =',chcount)
print('alphanumeric count =',aln)
print('line count =',linecount)
print('word count =',wordcount)

# The BoW keys are the words themselves, or their feature hashes when
# hashing was requested.
if choosing:
    numfhash=[fhash(e,m) for e in text]
    keys=sorted(numfhash)
else:
    keys=sorted(text)

# Run-length count over the sorted keys.  This also handles an empty key
# list (prints []), which used to raise IndexError on text[-1] /
# numfhash[-1] when every word was a stop word.
bow=[]
for key in keys:
    if bow and bow[-1][0]==key:
        bow[-1][1]+=1
    else:
        bow.append([key,1])
print ('BoW =',bow)
# 6330588121 (30.00) 402 (2021-03-22 23:45)
# Ask for the input file name and the first feature-hashing answer.
f=input('File name = ')
h=input('Use feature hashing ? (y,Y,n,N) ')
# Open the input file and the stop-word list; these file objects are what
# bow1() and fhash() receive further down in the script.
file_name=open(f)
file_st=open('stopwords.txt','r')
def low_er(file_name, file_st):
    """Return the lower-cased words of file_name that are not listed in file_st.

    Both arguments are iterated line by line.  Characters other than 0-9,
    a-z and space are turned into spaces before splitting.  When no word
    survives, the empty string '' is returned instead of an empty list.
    """
    stops = [token for row in file_st for token in row.strip().split()]
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    chars = []
    for row in file_name:
        for ch in row:
            lowered = ch.lower()
            chars.append(lowered if (lowered in allowed or lowered in ' ') else ' ')
    kept = [word for word in ''.join(chars).split() if word not in stops]
    return kept if kept else ''
def char_count(f):
    """Open the file named f and count its characters, excluding newlines."""
    with open(f,'r') as handle:
        return sum(1 for row in handle for ch in row if ch != '\n')
def alphanumeric_count(f):
    """Open the file named f and count the characters for which isalnum() is true."""
    with open(f,'r') as handle:
        return sum(1 for row in handle for ch in row if ch.isalnum())
def line_count(f):
    """Open the file named f and return its number of lines.

    The file is closed before returning; the handle used to be left open.
    """
    with open(f,'r') as handle:
        return sum(1 for _ in handle)
def word_count(f):
    """Open the file named f and count its words.

    Every non-alphanumeric character is treated as a separator.
    """
    with open(f,'r') as handle:
        spaced = ''.join(ch if ch.isalnum() else ' ' for row in handle for ch in row)
    return len(spaced.split())
def bow1(file_name, file_st):
    """Return the bag of words as a sorted list of [word, count] pairs.

    The words come from low_er(file_name, file_st).  Counting is a
    run-length pass over the sorted words, so it works for any number of
    words; the previous tail handling indexed low[-2] and raised IndexError
    when exactly one word remained.
    """
    low=sorted(low_er(file_name, file_st))
    a=[]
    for word in low:
        if a and a[-1][0]==word:
            a[-1][1]+=1
        else:
            a.append([word,1])
    return a
def outcome(f):
    """Print the separator line followed by the four summary counts of the file named f."""
    print('-'*19)
    report = (
        ('char count =', char_count),
        ('alphanumeric count =', alphanumeric_count),
        ('line count =', line_count),
        ('word count =', word_count),
    )
    for label, measure in report:
        print(label, measure(f))
def fhash(M, f, file_st):
    """Return the feature-hashed bag of words as sorted [hash, count] pairs.

    M       -- modulus applied to each word hash
    f       -- iterable of text lines (the opened input file)
    file_st -- iterable of stop-word lines

    The words are taken from low_er(f, file_st).  The function used to
    ignore its f argument and read the module-level file_name instead,
    which only worked because the caller happened to pass that same object.
    """
    hashes=[]
    for w in low_er(f, file_st):
        s=sum(ord(ch)*(37**i) for i, ch in enumerate(w))
        hashes.append(s % M)
    fh=sorted(hashes)
    # Distinct hash values, in ascending order.
    Q=[]
    for value in fh:
        if value not in Q:
            Q.append(value)
    return sorted(count(fh,Q))
def count( data, element ):
    """For each item of element, pair it with its number of occurrences in data.

    Returns a list of [item, occurrences] in the order of element.
    """
    return [[target, sum(1 for item in data if item == target)] for target in element]
        
# Re-prompt until the answer is one of the four accepted letters.
valid_answers = ('y','Y','n','N')
while not (h in valid_answers):
    print('Try again.')
    h=input('Use feature hashing ? (y,Y,n,N) ')
# After validation h is either a yes or a no answer.
if h in ('y','Y'):
    M=int(input('M = '))
    outcome(f)
    print('BoW =',fhash(M, file_name, file_st))
else:
    outcome(f)
    print('BoW =',bow1(file_name, file_st))
file_name.close()
file_st.close()
# 6330589821 (27.00) 403 (2021-03-19 17:17)
#---------------------------------------------------
f_name = input('File name = ')
fh = input('Use feature hashing ? (y,Y,n,N) ')
# Validate the answer BEFORE asking for M.  The old order skipped the M
# prompt whenever the first answer was invalid, leaving `m` undefined.
# Exact membership is used because `fh not in 'YyNn'` is a substring test
# that also accepted '' and strings such as 'Yy'.
while fh not in ('y', 'Y', 'n', 'N'):
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = fh in ('y', 'Y')
if use_hash:
    m = input('M = ')
# The separator is printed for both answers (it was missing for 'n').
print('-------------------')

#---------------------------------------------------
#char count
#alphanumeric count
#line count
#word count
ALNUM = 'abcdefghijklmnopqrstuvwxyz0123456789'
char_count = 0
alpha_count = 0
line_count = 0
pieces = []  # file content with every non-alphanumeric char turned into ' '
with open(f_name) as file:
    for line in file:
        line_count += 1
        # The terminating newline is not counted as a character.
        if line.endswith('\n'):
            char_count += len(line) - 1
        else:
            char_count += len(line)
        for e in line:
            if e.lower() in ALNUM:
                alpha_count += 1
                pieces.append(e)
            else:
                pieces.append(' ')
word = ''.join(pieces)
word_count = len(word.split())
print('char count = '+str(char_count))
print('alphanumeric count = '+ str(alpha_count))
print('line count = ' + str(line_count))
print('word count = '+ str(word_count))

#---------------------------------------------------
#stop word
with open('stopwords.txt','r') as stop:
    stop_text = ''.join(
        e if e.lower() in ALNUM else ' ' for line in stop for e in line
    )
stop_word = stop_text.split()

#---------------------------------------------------
#remove stop word
# The normalised text gathered above is reused instead of reading the file
# a second time.
s_word = word.lower().split()
sig_word = [w for w in s_word if w not in stop_word]

#---------------------------------------------------
#BoW
#---------------------------------------------------
sig_word.sort()
if use_hash:
    modulus = int(m)
    keys = sorted(
        sum(ord(ch) * (37 ** i) for i, ch in enumerate(e)) % modulus
        for e in sig_word
    )
else:
    keys = sig_word

# Run-length count over the sorted keys.  An empty key list simply yields
# [] (the old loop raised NameError on `check` when every word was a stop
# word).
bow = []
for key in keys:
    if bow and bow[-1][0] == key:
        bow[-1][1] += 1
    else:
        bow.append([key, 1])
# Same 'BoW =' label in both modes (the no-hashing path printed 'Bow =').
print('BoW =',bow)
    
        
    
    

    
    
        
        

# 6330591021 (22.00) 404 (2021-03-22 16:21)

def fhash(w,m) :
  """Return sum(ord(w[i]) * 37**i) modulo int(m); m may be an int or a numeric string."""
  total = sum(ord(ch)*(37**power) for power, ch in enumerate(w))
  return total % int(m)

# Script entry: scan the file once for all counters, build the plain BoW,
# then ask whether feature hashing is wanted and print the report.
print("File name =" , end=" ")
file_name = input()
line_count=0
char_count=0
al_count=0
allwords=[]
word=""
word_count=0
bow=[]
# `with` closes the file on every path (it was only closed in the 'n' branch).
with open(file_name,'r') as file:
  for x in file:
    line_count=line_count+1
    # Count the line without its terminating newline.  The old formula
    # (total + 1 - line_count) assumed the last line never ends with a
    # newline and reported 1 for an empty file.
    if x.endswith("\n") :
      char_count=char_count+len(x)-1
    else :
      char_count=char_count+len(x)
    for i in x :
      if i.lower() in "1234567890abcdefghijklmnopqrstuvwxyz" :
        al_count = al_count+1
        word=word+i
      else :
        word=word+" "
a = ((word.lower()).split())
a.sort()
# "ours" and "ourselves" were fused into the single entry
# "ours    ourselves", so neither word was ever filtered out.
stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]
for i in a :
  word_count=word_count+1
  if i not in allwords :
    allwords.append(i)
    if i not in stopwords :
      bow.append([i.lower(),a.count(i)])

print("Use feature hashing ? (y,Y,n,N)" , end=" ")
n= input()
# Keep reading answers until one of y/Y/n/N is given.
while n not in ("y","Y","n","N") :
  print ("Try again.")
  n = input()
use_hash = n in ("y","Y")
if use_hash :
  print("M = " ,end= "")
  m=input()

print("-------------------")
print("char count =",char_count)
print("alphanumeric count =",al_count)
print("line count =",line_count)
print("word count =",word_count)

if use_hash :
  # Hash every non-stop word, then pair each distinct hash with its count.
  bownew=sorted(fhash(i,m) for i in a if i not in stopwords)
  allnewans=[]
  for i in bownew :
    if [i, bownew.count(i)] not in allnewans :
      allnewans.append([i, bownew.count(i)])
  print("BoW =" , allnewans)
else :
  print("BoW =",bow)
# 6330592621 (26.00) 405 (2021-03-21 19:49)

def remove_punc(word):
    """Lower-case word and replace every character other than space, 0-9 and a-z by a space."""
    allowed = ' 0123456789abcdefghijklmnopqrstuvwxyz'
    return ''.join(ch.lower() if ch.lower() in allowed else ' ' for ch in word)
def char_count(word):
    """Return the number of characters in word (newline characters included)."""
    return len(word)
def word_count(word):
    """Return the number of non-empty space-separated pieces of remove_punc(word)."""
    pieces = remove_punc(word).split(" ")
    return len([piece for piece in pieces if piece != ''])
    
def alphanum_count(words) :
    """Count the characters of words whose lower-case form is one of 0-9 / a-z."""
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    return sum(1 for ch in words if ch.lower() in allowed)
def cut_stop (words):
    """Return the words of `words` that are not in stopwords.txt.

    Each kept word is followed by one space.  The stop-word file is opened
    with a context manager so it is closed again (the handle used to leak
    on every call).
    """
    with open('stopwords.txt' , 'r') as stopwords:
        stop_w = set(stopwords.read().split())
    return ''.join(token + ' ' for token in words.split() if token not in stop_w)
    
def bow (file) :
    """Return the bag of words of the file named `file` as [word, count] pairs.

    Lines are normalised with remove_punc and filtered with cut_stop.  The
    pairs appear in order of first occurrence.  The input file is now closed
    after reading (it was left open before).
    """
    with open(file , 'r') as source:
        words = ''.join(remove_punc(line) for line in source)
    # dict keeps insertion order, i.e. the order of first occurrence.
    tally = {}
    for token in cut_stop(words).split():
        tally[token] = tally.get(token, 0) + 1
    return [[token, freq] for token, freq in tally.items()]
def fhash(w , m):
    """Return sum(ord(w[i]) * 37**i) modulo m."""
    return sum(ord(ch) * 37 ** power for power, ch in enumerate(w)) % m
def bow_2 (file , m) :
    """Return the feature-hashed bag of words of the file named `file`.

    Every word left after remove_punc and cut_stop is replaced by
    fhash(word, m); the result is a list of [hash, count] pairs in order of
    first occurrence.  The input file is now closed after reading, and the
    hashes are kept as ints instead of being converted to str and back.
    """
    with open(file , 'r') as source:
        words = ''.join(remove_punc(line) for line in source)
    # dict keeps insertion order, i.e. the order of first occurrence.
    tally = {}
    for token in cut_stop(words).split():
        key = fhash(token , m)
        tally[key] = tally.get(key, 0) + 1
    return [[key, freq] for key, freq in tally.items()]
def sum_func(file):
    """Scan the file named `file` line by line and return the tuple
    (char total, alphanumeric total, word total, line total), using
    char_count, alphanum_count and word_count on each line."""
    chars = alnums = words = rows = 0
    with open(file , 'r') as a:
        for lines in a:
            chars += char_count(lines)
            alnums += alphanum_count(lines)
            words += word_count(lines)
            rows += 1
    return chars , alnums , words , rows
    
    
file_name = input('File name = ')
# Read the file once up front.  As before, this fails immediately on a bad
# file name, but the handle is now closed, and the newline total lets the
# char count be exact whether or not the last line ends with a newline.
with open(file_name , 'r') as probe:
    newline_total = probe.read().count('\n')
fh = input('Use feature hashing ? (y,Y,n,N) ')
while fh.lower() not in ('y', 'n') :
    print('Try again.')
    fh = input('Use feature hashing ? (y,Y,n,N) ')
use_hash = fh.lower() == 'y'

a,b,c,d = sum_func(file_name)
if use_hash :
    m = int(input('M = '))
print('-------------------')
# sum_func counts newline characters too; subtract them.  The old
# `a - d + 1` was off by one for files ending in a newline and printed 1
# for an empty file.
print('char count =',a - newline_total)
print('alphanumeric count =',b)
print('line count =',d)
print('word count =',c)
if use_hash :
    print('BoW =',bow_2(file_name, m))
else :
    print('BoW =',bow(file_name))
    




            

# 6330593221 (26.67) 406 (2021-03-22 19:00)
def read_stopwords(stopwords):
    """Return all whitespace-separated tokens of the given lines, lower-cased."""
    collected = []
    for line in stopwords:
        collected.extend(token.lower() for token in line.split())
    return collected
def read_file(file):
    """Return the items produced by iterating `file` (its lines) as a list."""
    return list(file)
def remove_punctuation(s):
  """Lower-case s and replace every character outside 0-9 / a-z by a space."""
  allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
  return ''.join(ch.lower() if ch.lower() in allowed else ' ' for ch in s)
def remove_stopwords(line, stopwords):
    """Return the words of line that are not in stopwords, joined by single spaces."""
    return ' '.join(word for word in line.split() if word not in stopwords)
def char_count(File):
    """Count the characters in the lines of File, leaving out newline characters."""
    total = 0
    for line in File:
        for ch in line:
            if ch == '\n':
                continue
            total += 1
    return total
def alphanumeric_count(File):
    """Count the characters in the lines of File whose lower-case form is in 0-9 / a-z."""
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz'
    return sum(1 for line in File for ch in line if ch.lower() in allowed)
def word_count(File):
    """Sum, over the lines of File, the number of words left after remove_punctuation."""
    return sum(len(remove_punctuation(line).split()) for line in File)
def count( data, element):
    """Return [item, occurrences-in-data] for every item of element, in element's order."""
    pairs = []
    for target in element:
        occurrences = sum(1 for item in data if item == target)
        pairs.append([target, occurrences])
    return pairs
def BoW_with_fhash(File, stopword,m):
    """Return the feature-hashed bag of words as sorted [hash, count] pairs.

    Each line of File is normalised with remove_punctuation and filtered
    with remove_stopwords; every remaining word is hashed with fhash(word, m).
    """
    words = ''.join(
        remove_stopwords(remove_punctuation(line), stopword) + " " for line in File
    ).split()
    hashes = [fhash(word, m) for word in words]
    distinct = []
    for value in hashes:
        if value not in distinct:
            distinct.append(value)
    return sorted(count(hashes, distinct))
def BoW(File, stopword):
    """Return the bag of words as sorted [word, count] pairs.

    Each line of File is normalised with remove_punctuation and filtered
    with remove_stopwords before counting.
    """
    words = ''.join(
        remove_stopwords(remove_punctuation(line), stopword) + " " for line in File
    ).split()
    distinct = []
    for word in words:
        if word not in distinct:
            distinct.append(word)
    return sorted(count(words, distinct))
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    G = 37
    total = 0
    for position, ch in enumerate(w):
        total += ord(ch) * (G ** position)
    return total % M
def read_Use_feature_hashing_():
    """Ask whether feature hashing is wanted, re-prompting until y/Y/n/N is given.

    Returns int(M) read from a second prompt for a yes answer, and False
    for a no answer.
    """
    a = input("Use feature hashing ? (y,Y,n,N) ")
    while a not in ("Y", "y", "n", "N"):
        a = input("Try again.\nUse feature hashing ? (y,Y,n,N) ")
    if a in ("Y", "y"):
        return int(input("M = "))
    return False
#---------------------------------------------------------------------------
def main():
    """Run the program: read the inputs, print the summary counts and the BoW.

    Both files are read inside a `with` block so they are closed even when
    reading fails; previously they were closed only by explicit close()
    calls at the very end, which an earlier exception would skip.
    """
    file = input('File name = ').strip()
    # Same open order as before: the stop-word list first, then the input file.
    with open('stopword.txt') as stop_word, open(file) as source:
        fn = read_file(source)
        stopwords = read_stopwords(stop_word)
    m = read_Use_feature_hashing_()
    print('-------------------')
    print('char count =', char_count(fn))
    print('alphanumeric count =', alphanumeric_count(fn))
    print('line count =', len(fn))
    print('word count =', word_count(fn))
    # read_Use_feature_hashing_ returns False when hashing is not wanted.
    # NOTE: M == 0 also compares equal to False and takes this branch.
    if m == False:
        print("BoW =", BoW(fn, stopwords))
    else:
        print('BoW =', BoW_with_fhash(fn, stopwords, m))
#--------------------------------------------------------------------------

# Run the program as soon as this code is executed.
main()



# 6330594921 (0.00) 407 (2021-03-22 19:09)
def summary_count():
    """Print the separator and the four summary counts of 'sample.txt'.

    The file is read once inside a `with` block (it used to be opened three
    times).  Labels no longer end with a space, because print() already
    inserts one, and an empty file now reports 0 lines instead of 1.
    Words are the whitespace-separated pieces of the file content.
    """
    print('-'*19)
    with open('sample.txt') as open_file:
        lines = open_file.readlines()
    text = ''.join(lines)

    char_count = sum(1 for e in text if e != '\n')
    print('char count =',char_count)

    alphanumeric_count = sum(1 for e in text if e.isalnum())
    print('alphanumeric count =',alphanumeric_count)

    print('line count =', len(lines))

    word_count = len(text.split())
    print('word count =', word_count)
#---------------------------------------------------------------------------------------------------
def count( data, element ):
    """Return how many items of data compare equal to element."""
    return sum(1 for item in data if item == element)

#---------------------------------------------------------------------------------------------------
def no_hashing_BoW():
    """Print the bag of words of 'sample.txt' without feature hashing.

    The text is lower-cased, the characters , . ' " are deleted, and words
    listed in 'stopword.txt' are dropped.  The result is printed as a list
    of [word, count] pairs sorted by word.

    Changes from the previous version: both files are closed after reading;
    the text is split and sorted once instead of once per character; an
    empty file prints an empty list instead of raising NameError; and the
    label no longer carries a trailing space (print() already adds one).
    """
    with open('sample.txt') as file:
        raw = file.read().lower()
    stripped = ''.join(ch for ch in raw if ch not in ',.\'\"')
    words = sorted(stripped.split())

    with open('stopword.txt') as stop:
        stopwords = set(stop.read().lower().split())

    # words is sorted, so the dict's insertion order is the sorted word order.
    tally = {}
    for word in words:
        if word not in stopwords:
            tally[word] = tally.get(word, 0) + 1
    p = [[word, occurrences] for word, occurrences in tally.items()]
    print('BoW =',p)
#---------------------------------------------------------------------------------------------------
def hashing_BoW(M):
    """Print the feature-hashed bag of words of 'sample.txt'; returns None.

    The text is lower-cased, the characters , . ' " are deleted, words
    listed in 'stopword.txt' are dropped, and each remaining word is
    replaced by its hash modulo M.  The result is printed as a list of
    [hash, count] pairs sorted by hash.

    Changes from the previous version: both files are closed after reading;
    the text is split and sorted once instead of once per character; and
    the label no longer carries a trailing space (print() already adds one).
    """
    with open('sample.txt') as file:
        raw = file.read().lower()
    stripped = ''.join(ch for ch in raw if ch not in ',.\'\"')
    words = stripped.split()

    with open('stopword.txt') as stop:
        stopwords = set(stop.read().lower().split())

    # ----fhash--------------------------------------------------------
    def fhash(w, M):
        # sum of ord(w[i]) * 37**i, reduced modulo M
        return sum(ord(ch) * (37 ** i) for i, ch in enumerate(w)) % M

    #------------------------------------------------------------
    hashes = sorted(fhash(word, M) for word in words if word not in stopwords)
    # hashes is sorted, so the dict's insertion order is ascending hash order.
    tally = {}
    for value in hashes:
        tally[value] = tally.get(value, 0) + 1
    a = [[value, occurrences] for value, occurrences in tally.items()]
    print('BoW =',a)
#---------------------------------------------------------------------------------------------------
def show():
    """Interactive entry point: ask for the file name and the hashing choice,
    then print the summary counts and the bag of words.

    NOTE: nothing is printed unless the entered file name contains
    'sample.txt'; the functions called here read that file directly.
    """
    inp = input('File name = ')  # ask for the file name
    while 'sample.txt' in inp:
        inp1 = input('Use feature hashing ? (y,Y,n,N) ')
        # Only the last character of the answer is inspected.  Slicing
        # (instead of inp1[-1]) keeps an empty answer from raising
        # IndexError; it now falls through to 'Try again'.
        last = inp1[-1:]
        if last == 'y' or last == 'Y':  # feature hashing wanted
            M = int(input('M = '))  # modulus used by the hashing step
            summary_count();hashing_BoW(M) ; break
        elif last == 'n' or last == 'N':  # feature hashing not wanted
            summary_count(); no_hashing_BoW(); break
        else:  # invalid answer: ask again
            print('Try again')
#---------------------------------------------------------------------------------------------------
# Start the interactive program as soon as this code is executed.
show()
# 6330595521 (18.00) 408 (2021-03-22 22:00)
def format(file):
    """Analyse the text file named `file`.

    Returns a 5-tuple:
      result      -- lower-cased words that are not listed in 'stopword.txt'
      lines       -- number of lines in the file
      ascii_count -- number of characters whose lower-case form is 0-9 / a-z
      all_count   -- number of characters, line-ending newlines excluded
      len(ver)    -- number of words (maximal runs of 0-9 / a-z characters)

    Changes from the previous version: all_count no longer subtracts a
    newline for a last line that has none; the unreachable apostrophe
    handling, the duplicated if/else arms and the unused locals are gone.
    """
    with open(file) as f:
        raw_lines = f.readlines()
    with open('stopword.txt') as stopword:
        stopword = " ".join(stopword.read().splitlines()).split(" ")
    filt = "0123456789abcdefghijklmnopqrstuvwxyz"

    #Line count
    lines = len(raw_lines)

    # Character count: every character except the newline ending a line.
    all_count = sum(
        (len(line) - 1) if line.endswith("\n") else len(line)
        for line in raw_lines
    )

    # Walk the text once: count alphanumerics and cut it into words at
    # every other character.
    ascii_count = 0
    ver = []
    word = ""
    for ch in "".join(raw_lines):
        if ch.lower() in filt:
            ascii_count += 1
            word += ch
        elif word:
            ver.append(word)
            word = ""
    if word:
        ver.append(word)

    result = [w.lower() for w in ver if w.lower() not in stopword]
    return result, lines, ascii_count, all_count, len(ver)
def fhash(w, M):
    """Return sum(ord(w[i]) * 37**i) modulo M."""
    return sum(ord(ch) * (37 ** position) for position, ch in enumerate(w)) % M
def bowtext(text):
    """Return [item, occurrences] for each distinct item of text, in order of first occurrence."""
    distinct = []
    for item in text:
        if item not in distinct:
            distinct.append(item)
    return [[item, sum(1 for other in text if other == item)] for item in distinct]
def filterhars(bow, M):
    """Hash every word of bow with fhash(word, M) and return [hash, occurrences]
    for each distinct hash, in order of first occurrence."""
    hashes = [fhash(word, M) for word in bow]
    distinct = []
    for value in hashes:
        if value not in distinct:
            distinct.append(value)
    return [[value, sum(1 for other in hashes if other == value)] for value in distinct]
def main(file, mode):
    """Analyse `file` with format() and return its 5-tuple with the word list
    replaced by filterhars(words, mode), i.e. the hashed bag of words."""
    words, *stats = format(file)
    return (filterhars(words, mode), *stats)
def mainly(file):
    """Analyse `file` with format() and return its 5-tuple with the word list
    replaced by bowtext(words), i.e. the plain bag of words."""
    words, *stats = format(file)
    return (bowtext(words), *stats)
    
        # if len(i.split("'")) > 1:
        #     ans = i.split("'")
        #     tmp.append(ans[0])
        # else:
        #     tmp.append(i)
    

    #for i in f:
    #    word = ""
    #    for j in i:
    #        if j in filt:
    #            word += j
#def ord(c):
    #return

#def fhash(w, M):
    #return
if __name__ == "__main__":
    # Interactive entry point: ask for the file and the hashing choice,
    # then print the summary counts and the bag of words.
    textfile = str(input("File name = "))

    while True:
        hars = str(input("Use feature hashing ? (y,Y,n,N) "))
        if hars.lower() == 'y':
           m = int(input('M = '))
           bow, lines, textcount, paucount, word = main(textfile, m)
           break
        elif hars.lower() == 'n':
           bow, lines, textcount, paucount, word = mainly(textfile)
           break
        else:
            print('Try again.')
    # Sort by key in both modes; the plain (non-hashed) bag used to be
    # printed in order of first occurrence while the hashed one was sorted.
    bow = sorted(bow, key=lambda x: x[0])
    print("-------------------")
    print('char count = {:}'.format(paucount))
    # Label corrected: it read 'alphanumueric = '.
    print('alphanumeric count = {:}'.format(textcount))
    print('line count = {:}'.format(lines))
    print('word count = {:}'.format(word))
    print('BoW = {:}'.format(bow))