import pandas as pd
df = pd.read_csv("names.txt")
#df["separado"] = df["Nombres"].str.split()
#df["nombre"] = df["separado"][0:]
#df["apellido"] = df["separado"][1:]
#df["nombre"]= df["Nombres"].str.split(" ", n = 1, expand = True)
#df["nombre"] = df["Nombres"].str.split(" ", n = 1, expand = True)
#df["nombre"]= df["Nombres"].str.split(" ", n = 1, expand = True)
df[['nombre', 'apellido']] = df['Nombres'].str.split(' ', n=1, expand=True)
del df['Nombres']
df.to_csv("archivosalida.csv", sep=';', encoding='utf-8')
with open("archivosalida2.csv","w") as fw:
for i in df:
fw.write(i[0]+";"+i[1])
df.head(5)
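# A hedged alternative sketch: if some "Nombres" values carry a compound first
# name (e.g. "Juan Carlos Pérez"), rsplit from the right keeps only the final
# token as the surname. Left commented out because "Nombres" was deleted above.
# df[['nombre', 'apellido']] = df['Nombres'].str.rsplit(' ', n=1, expand=True)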
import pandas as pd
words = []
with open('el_quijote.txt', 'r') as f:
    for line in f:
        for word in line.split():
            words.append(word.lower())
column_values = ['palabra']
df = pd.DataFrame(data=words, columns=column_values)
#df.head(5)
#print(words)
# part 1: frequency of every word
valores = df["palabra"].value_counts()
print("\nfrequency per word")
print(valores)
print("\ntop 10 frequencies")
print(valores[:10])
#print(df["palabra"].unique().size)
# part 2: which words make the top 10
print("\ntop 10 words")
print(valores[:10].index.tolist())
archivo = r'el_quijote.txt'
with open(archivo) as f:
    texto = f.readlines()
completo = []
for l in texto:
    linea = l.split()
    for p in linea:
        completo.append(p.lower())
# count occurrences of each word with a plain dict
conteo_palabras = {}
for p in completo:
    conteo_palabras[p] = conteo_palabras.get(p, 0) + 1
# the unique words are the dict keys (faster than a membership test on a list)
keys = list(conteo_palabras)
print(keys)
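# The same count in one step with collections.Counter, as a cross-check of
# the manual dict above (a minimal sketch; Counter ships with the stdlib,
# and conteo_check is a name introduced here for illustration).
from collections import Counter
conteo_check = Counter(completo)
print(conteo_check.most_common(10))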
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import word_tokenize, regexp_tokenize
import nltk
## one-time downloads for the stopword list and the tokenizer models
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')
## read the whole text once; the tokenizers below all work on this string
with open('el_quijote.txt') as f:
    content = f.read()
## default tokenizer, tuned for English
word_tokens = word_tokenize(content.lower())
len(word_tokens)
## does not work well with Spanish, since it leaves out accents and ñ
word_tokens = regexp_tokenize(content.lower(), pattern=r'\w+')
len(word_tokens)
## better for Spanish, but keeps characters such as periods, commas, etc.
toktok = ToktokTokenizer()
word_tokens = toktok.tokenize(content.lower())
len(word_tokens)
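# A quick way to eyeball how the three tokenizers differ is to run them on
# one short Spanish sample (muestra is a made-up sentence, not from the text).
muestra = "El señor Quijote cabalgó; luego, descansó."
print(word_tokenize(muestra.lower()))
print(regexp_tokenize(muestra.lower(), pattern=r'\w+'))
print(toktok.tokenize(muestra.lower()))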
stop_words = list(stopwords.words('spanish'))
stop_words += [',', '.', ':', ';'] ## add punctuation as extra stopwords
len(stop_words)
filtered_words = [w for w in word_tokens if w not in stop_words]
len(filtered_words)
from collections import Counter
counter = Counter(filtered_words)
counter.most_common(10)
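# For a tabular view, the (word, count) pairs drop straight into a DataFrame
# (the column names here are illustrative, not from the original).
top10 = pd.DataFrame(counter.most_common(10), columns=['palabra', 'frecuencia'])
print(top10)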
import csv
with open('data/data-text.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row[0], row[1])
        break
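# csv.DictReader is a handy alternative to csv.reader when the file has a
# header row: each row comes back as a dict keyed by column name (a sketch
# against the same data/data-text.csv file).
with open('data/data-text.csv', newline='') as f:
    for row in csv.DictReader(f):
        print(row)
        break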
import pandas as pd
df = pd.read_csv('data/data-text.csv')
df
df.to_csv('data/data-text-copy.csv')
import json
with open('data/data-text.json') as f:
    content = f.read()
data = json.loads(content)
for item in data:
    print(item)
    break
with open('file.json', 'w') as f:
    json.dump(data, f)
df = pd.read_json('data/data-text.json')
df
df.to_json('data/data-text-copy.json')
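# For human-readable output that keeps accented Spanish text intact, dump
# with indent and ensure_ascii=False (a sketch writing to a throwaway path).
with open('file-pretty.json', 'w') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)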
!pip install xlrd  ## Excel engine used by pandas.read_excel for legacy .xls files
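# With xlrd installed, reading a legacy spreadsheet is one call; a minimal
# sketch (the path data/data.xls is an assumption, not a file from the
# original, so this is left commented out).
# df_xls = pd.read_excel('data/data.xls')
# df_xls.head()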