#Librerias a utilizar
import cv2
import pytesseract
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
# Funciones que leen imagenes y texto
def load_images_from_folder(folder):
    """Load every readable image found directly inside *folder*.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A list with the value returned by ``cv2.imread`` for each directory
        entry it could read, in sorted filename order. Entries for which
        ``cv2.imread`` returns ``None`` are skipped.
    """
    images = []
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(folder)):
        # os.path.join copes with a trailing separator in *folder*.
        img = cv2.imread(os.path.join(folder, filename))
        if img is not None:
            images.append(img)
    return images
def gettext(images):
    """Run ``pytesseract.image_to_string`` on each image and collect the text.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        A list with one extracted string per input image, in input order.
    """
    return [pytesseract.image_to_string(picture) for picture in images]
# Folder holding the images to process.
folder = "/work/mIX"
images = load_images_from_folder(folder)

# Preview 12 randomly chosen images (with replacement) on a 3x4 grid.
fig = plt.figure(figsize=(20, 20))
if images:  # nothing to sample from when the folder held no readable image
    for i in range(12):
        # randint's upper bound is exclusive, so starting at 0 makes every
        # image eligible and also works when only one image was loaded.
        r = np.random.randint(0, len(images))
        ax = fig.add_subplot(3, 4, i + 1)
        # cv2.imread yields BGR channel order; matplotlib expects RGB.
        image = plt.imshow(cv2.cvtColor(images[r], cv2.COLOR_BGR2RGB))

# Run OCR on every loaded image and print the raw text.
texto = gettext(images)
print(texto)
# LANGUAGE ANALYSIS
import re
# Tokenizer
# Verbose regular expression whose alternatives are tried in the order listed.
# The hyphen sits last in the final character class so that it is a literal
# "-"; written as ":-_" it would form a range that also matches < = > @ \ ^
# and would leave a standalone "-" unmatched.
pattern = r'''(?x)                 # set flag to allow verbose regexps
              (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
              | \w+(?:-\w+)*       # words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
              | \.\.\.             # ellipsis
              | [][.,;"'?():_`-]   # these are separate tokens; includes ], [
'''
def tokenizador(texto):
    """Tokenize each string of *texto* with the module-level ``pattern``.

    Args:
        texto: Iterable of strings.

    Returns:
        A list holding, for each input string and in the same order, the
        result of ``nltk.regexp_tokenize`` applied to that string.
    """
    return [nltk.regexp_tokenize(fragment, pattern) for fragment in texto]
textok = tokenizador(texto)
print(textok)
# Flatten the per-text token lists into one list of tokens in a single pass
# (repeated list concatenation copies the accumulated list every iteration).
sumtok = [token for tokens in textok for token in tokens]
print(sumtok)
listmono = list(sumtok)
# Occurrence count of every distinct token. One counting pass replaces a
# sumtok.count() call per distinct token.
dic = dict(nltk.FreqDist(sumtok))
# Filtering words longer than 6 characters
# Keep only the tokens longer than `threshold` characters and plot the five
# most frequent ones.
threshold = 6
filtered_monograms = [monogram for monogram in listmono if len(monogram) > threshold]
filtered_dist = nltk.FreqDist(filtered_monograms)
filtered_dist.plot(5)

# NOTE(review): stale notebook cells that referenced the undefined names
# `palabras_interesantes` and `df`, and a bigram filter that ran before
# `md_bigrams` existed, were dropped here because each raised NameError.

# Build the bigrams first so that they exist before being filtered.
md_bigrams = list(nltk.bigrams(sumtok))
fdist = nltk.FreqDist(md_bigrams)

# Keep the bigrams whose two tokens are both longer than `threshold`
# characters and plot the six most frequent ones.
fig = plt.figure(figsize=(15, 10))
threshold = 2
filtered_bigrams = [
    bigram
    for bigram in md_bigrams
    if len(bigram[0]) > threshold and len(bigram[1]) > threshold
]
filtered_dist = nltk.FreqDist(filtered_bigrams)
filtered_dist.plot(6)