# Libraries used
from collections import Counter
from itertools import chain
import os

import cv2
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pytesseract

# Functions that read images and extract their text
def load_images_from_folder(folder):
    """Load every readable image found directly inside *folder*.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A list with one decoded image per directory entry, in the order
        reported by ``os.listdir``. Entries for which ``cv2.imread``
        returns ``None`` are skipped.
    """
    images = []
    for filename in os.listdir(folder):
        # os.path.join handles a trailing separator and the platform's
        # separator, unlike manual "/" concatenation.
        img = cv2.imread(os.path.join(folder, filename))
        if img is not None:
            images.append(img)
    return images


def gettext(images):
    """Run Tesseract OCR on each image.

    Args:
        images: Iterable of images accepted by
            ``pytesseract.image_to_string``.

    Returns:
        A list of strings, one per input image, in the same order.
    """
    return [pytesseract.image_to_string(image) for image in images]

# Load every readable image from the dataset folder.
folder = "/work/mIX"
images = load_images_from_folder(folder)

# Preview a 3x4 grid of randomly chosen images (sampled with replacement).
fig = plt.figure(figsize=(20, 20))
if images:  # np.random.randint raises ValueError on an empty range
    for i in range(12):
        # low=0 so the first image can be drawn too; the upper bound is
        # exclusive, so len(images) is the correct limit.
        r = np.random.randint(0, len(images))
        ax = fig.add_subplot(3, 4, i + 1)
        # cv2.imread yields BGR channel order while matplotlib expects RGB;
        # convert so the preview shows the true colours.
        image = ax.imshow(cv2.cvtColor(images[r], cv2.COLOR_BGR2RGB))

# Run OCR on every loaded image: one text string per image.
texto = gettext(images)

print(texto)

# Language analysis

import re

# Tokenizer
# Verbose regex: each alternative must sit on its own line, because a `#`
# comment in verbose mode runs to the end of the physical line.
pattern = r'''(?x)          # set flag to allow verbose regexps
      (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*          # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                # ellipsis
    | [][.,;"'?():_`-]      # these are separate tokens; includes ], [
'''
# The hyphen is placed last in the character class so it is a literal "-".
# Written as ":-_" it formed a range that matched "<", "=", ">", "@", "^"
# and "\" while never matching "-" itself.


def tokenizador(texto):
    """Tokenize every string in *texto* with the module-level ``pattern``.

    Args:
        texto: Iterable of strings (here, one OCR result per image).

    Returns:
        A list of token lists, one per input string, in the same order.
    """
    return [nltk.regexp_tokenize(fragmento, pattern) for fragmento in texto]

# Tokenize the OCR output: one token list per image.
textok = tokenizador(texto)
print(textok)

# Build the combined token list.
# chain.from_iterable flattens the per-image lists in linear time; repeated
# `sumtok = sumtok + l` re-copies the growing list on every iteration.
sumtok = list(chain.from_iterable(textok))
print(sumtok)

# Independent copy of the flattened tokens, used for the monogram analysis.
listmono = list(sumtok)

# Token -> number of occurrences, counted in a single pass instead of one
# full `sumtok.count(...)` scan per distinct token.
dic = dict(Counter(sumtok))

# Filtering words longer than 6 characters (a rough proxy for long, multi-syllable words)

# Keep only the tokens strictly longer than `threshold` characters, then
# plot the 5 most frequent of them.
threshold = 6
filtered_monograms = list(
    filter(lambda token: len(token) > threshold, listmono)
)
filtered_dist = nltk.FreqDist(filtered_monograms)
filtered_dist.plot(5)

# Frequency distribution over a word list named `palabras_interesantes`.
# NOTE(review): `palabras_interesantes` is not defined anywhere in the visible
# source — presumably a hand-picked word list from another cell; confirm.
# NOTE(review): the trailing bare `filtered_dist` looks like a notebook
# display expression that ended up on the same line as the assignment.
filtered_dist = nltk.FreqDist(palabras_interesantes) filtered_dist

# NOTE(review): `df` is not defined in the visible source — confirm or remove.
df

# Build every adjacent token pair (bigram) from the flattened token list.
# This must run before any filtering below, which reads `md_bigrams`.
md_bigrams = list(nltk.bigrams(sumtok))
fdist = nltk.FreqDist(md_bigrams)
md_bigrams[:10]  # notebook-style peek at the first pairs; a no-op in a script

# Bigrams whose two words are both longer than `threshold`, which at this
# point still holds the value assigned earlier in the file.
# The loop variable and the condition now use the same name; the condition
# previously referenced `bigram` while iterating with `monogram`.
# The historical name `filtered_monograms` is kept so later code that reads
# it keeps working, even though the list holds bigrams.
filtered_monograms = [
    bigram
    for bigram in md_bigrams
    if len(bigram[0]) > threshold and len(bigram[1]) > threshold
]

# Plot the 6 most frequent bigrams whose two words both exceed 2 characters.
fig = plt.figure(figsize=(15, 10))
threshold = 2
filtered_bigrams = [
    bigram
    for bigram in md_bigrams
    if len(bigram[0]) > threshold and len(bigram[1]) > threshold
]
filtered_dist = nltk.FreqDist(filtered_bigrams)
filtered_dist.plot(6)

# Language analysis

# Filtering words longer than 6 characters (a rough proxy for long, multi-syllable words)

# Language analysis