#Librerias a utilizar
import cv2
import pytesseract
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
# Funciones que leen imagenes y texto
def load_images_from_folder(folder):
    """Load every readable image found directly inside ``folder``.

    Entries are visited in sorted filename order so the returned list is
    reproducible (``os.listdir`` yields entries in arbitrary order). Entries
    for which ``cv2.imread`` returns ``None`` are skipped.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        List of the images returned by ``cv2.imread``, in filename order.
    """
    images = []
    for filename in sorted(os.listdir(folder)):
        # os.path.join copes with trailing separators and platform differences.
        img = cv2.imread(os.path.join(folder, filename))
        if img is not None:
            images.append(img)
    return images
def gettext(images):
    """Run ``pytesseract.image_to_string`` on each image.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        List with one recognized-text string per input image, in input order.
    """
    return [pytesseract.image_to_string(image) for image in images]
folder = "/work/mIX"
images = load_images_from_folder(folder)

# Preview 12 randomly chosen images (drawn with replacement) on a 3x4 grid.
fig = plt.figure(figsize=(20, 20))
if images:  # nothing to sample from when the folder held no readable image
    for i in range(12):
        # randint's upper bound is exclusive, so starting at 0 covers every
        # index (starting at 1 skipped the first image and failed for a
        # single-image folder).
        r = np.random.randint(0, len(images))
        ax = fig.add_subplot(3, 4, i + 1)
        # cv2.imread yields BGR channel order; matplotlib expects RGB.
        image = ax.imshow(cv2.cvtColor(images[r], cv2.COLOR_BGR2RGB))

# OCR every loaded image and show the raw text.
texto = gettext(images)
print(texto)
# LANGUAGE ANALYSIS
import re
# Tokenizer pattern (verbose regex, alternatives are tried left to right).
# The hyphen sits at the END of the final character class so it is a literal:
# written as ":-_" it formed a range from ':' to '_' that matched characters
# such as < = > @ ^ \ and never matched '-' itself.
# NOTE(review): because the word alternative comes before the currency one,
# a bare "82%" is tokenized as "82" (the '%' is dropped); only inputs starting
# with '$' reach the currency alternative.
pattern = r'''(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?():_`-] # these are separate tokens; includes ], [
'''
def tokenizador(texto):
    """Tokenize each string in ``texto`` with the module-level ``pattern``.

    Args:
        texto: Iterable of strings to tokenize.

    Returns:
        List containing one token list per input string, in input order.
    """
    return [nltk.regexp_tokenize(fragmento, pattern) for fragmento in texto]
textok = tokenizador(texto)
print(textok)

# Flatten the per-text token lists into one list in a single pass
# (repeated `sumtok = sumtok + l` copies the whole list on every step).
sumtok = [token for tokens in textok for token in tokens]
print(sumtok)
listmono = list(sumtok)

# Occurrences of each distinct token, counted in one pass instead of one
# full `list.count` scan per distinct token. FreqDist is a Counter subclass;
# dict() keeps `dic` a plain dictionary.
dic = dict(nltk.FreqDist(sumtok))
# Filtering words with more than 6 characters
# Keep only tokens longer than `threshold` characters and plot the 5 most
# frequent of them.
threshold = 6
filtered_monograms = [monogram for monogram in listmono if len(monogram) > threshold]
filtered_dist = nltk.FreqDist(filtered_monograms)
filtered_dist.plot(5)
# NOTE(review): the original next rebuilt `filtered_dist` from
# `palabras_interesantes` and evaluated `df`; neither name is defined in the
# visible file, so both statements raised NameError and were removed. Restore
# them if those names are defined elsewhere.
filtered_dist  # notebook-style display; has no effect when run as a script
# Build the bigram list before anything reads it.
# NOTE(review): the original first ran a comprehension over `md_bigrams`
# before it was defined, testing an unbound name `bigram`, and stored the
# result in `filtered_monograms` (never read afterwards). That statement could
# only raise NameError and was removed; the filter below is the working form.
md_bigrams = list(nltk.bigrams(sumtok))
fdist = nltk.FreqDist(md_bigrams)
md_bigrams[:10]  # notebook-style preview; has no effect when run as a script

fig = plt.figure(figsize=(15, 10))
# Keep bigrams whose two tokens are both longer than `threshold` characters
# and plot the 6 most frequent.
threshold = 2
filtered_bigrams = [
    bigram
    for bigram in md_bigrams
    if len(bigram[0]) > threshold and len(bigram[1]) > threshold
]
filtered_dist = nltk.FreqDist(filtered_bigrams)
filtered_dist.plot(6)