# Start writing code here...
!pip install bs4
!pip install html5lib
!pip install lxml
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
# URL/path parsing helpers (NOTE(review): os.path and PurePath appear unused below — confirm they are needed)
from os import path
from pathlib import PurePath
# --- Scrape movie reviews from a list of pages and append them to a CSV ---
# Read the review-page URLs, one per line.
with open('urls.txt', 'r', encoding='utf-8') as fh:
    urls = [url.strip() for url in fh.readlines()]  # strip trailing `\n`

# Open the CSV in append mode so repeated runs add rows at the end.
# newline='' is required by the csv module to avoid blank rows on Windows.
with open('pulp_fiction_reviews.csv', 'a', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Write the header only when the file is empty: with mode 'a' the old
    # code appended a duplicate header row on every run.
    if csv_file.tell() == 0:
        writer.writerow(['ID', 'NICK_NAME', 'RATING', 'COUNTRY', 'LOCATION', 'DATE', 'TITLE', 'REVIEW'])
    id_review = 1
    for url in urls:
        # Fail fast on network errors / HTTP error pages instead of
        # silently parsing them; timeout prevents an indefinite hang.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        # Each review lives in its own wrapper div.
        for review in soup.find_all("div", {"class": "movie-review-wrapper"}):
            nick_name = review.find("div", {"class": "mr-user-nick"}).find("a").find("b").text
            rating = review.find("div", {"class": "user-reviews-movie-rating"}).text
            # The user line looks like "Location (Country)". partition() is
            # robust when the parentheses are missing: it yields '' instead
            # of raising IndexError like split("(")[1] would.
            user_country = review.find("div", {"class": "mr-user-country"}).text
            location = user_country.partition("(")[0]
            country = user_country.partition("(")[2].partition(")")[0]
            date = review.find("div", {"class": "review-date"}).text
            title = review.find("div", {"class": "review-title"}).find("a").text
            review_text = review.find("div", {"class": "review-text1"}).text
            # Persist one CSV row per review.
            writer.writerow([id_review, nick_name, rating, country, location, date, title, review_text])
            id_review += 1