# Start writing code here...
!pip install bs4
!pip install html5lib
!pip install lxml
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
#parsear urls
from os import path
from pathlib import PurePath
# Scrape the review pages listed in urls.txt and append one CSV row per review
# to pulp_fiction_reviews.csv.
URLS_FILE = 'urls.txt'
OUTPUT_FILE = 'pulp_fiction_reviews.csv'
CSV_HEADER = ['ID', 'NICK_NAME', 'RATING', 'COUNTRY', 'LOCATION', 'DATE', 'TITLE', 'REVIEW']
# Seconds to wait for a page before giving up; without a timeout requests.get
# can block forever on an unresponsive server.
REQUEST_TIMEOUT = 30


def read_urls(filename):
    """Return the URLs stored one per line in *filename*.

    Surrounding whitespace (including the trailing newline) is stripped and
    blank lines are skipped, so an empty line never reaches ``requests.get``.
    """
    with open(filename, 'r') as fh:
        stripped = (line.strip() for line in fh)
        return [url for url in stripped if url]


def split_location_country(origin):
    """Split ``"location (country)"`` text into a ``(location, country)`` tuple.

    The location is everything before the first ``"("`` (whitespace is kept
    untouched); the country is the text between that ``"("`` and the next
    ``")"``. When *origin* has no parentheses the country is ``""`` instead
    of raising ``IndexError``.
    """
    location, _, remainder = origin.partition("(")
    country = remainder.split(")")[0]
    return location, country


def extract_review(review):
    """Return the CSV fields (everything but the ID) of one review element.

    *review* is a ``movie-review-wrapper`` element as yielded by
    ``soup.find_all``. Any piece that cannot be found in the markup is
    reported as an empty string so that one malformed review does not abort
    the whole scrape.

    Returns:
        ``[nick_name, rating, country, location, date, title, review_text]``
    """
    def field(css_class, *inner_tags):
        # Locate the <div class=css_class>, then walk down through the given
        # nested tags; stop as soon as a lookup comes back empty.
        node = review.find("div", {"class": css_class})
        for tag in inner_tags:
            if node is None:
                break
            node = node.find(tag)
        return node.text if node is not None else ""

    # The same element carries both values, formatted as "location (country)".
    location, country = split_location_country(field("mr-user-country"))
    return [
        field("mr-user-nick", "a", "b"),
        field("user-reviews-movie-rating"),
        country,
        location,
        field("review-date"),
        field("review-title", "a"),
        field("review-text1"),
    ]


def main():
    """Fetch every URL from ``urls.txt`` and append its reviews to the CSV.

    The header row is written only when the output file is new or empty, so
    re-running the scraper does not inject header rows into the data.
    NOTE: review IDs still start at 1 on every run.
    """
    urls = read_urls(URLS_FILE)
    needs_header = not path.exists(OUTPUT_FILE) or path.getsize(OUTPUT_FILE) == 0
    # newline='' is required by the csv module (prevents blank rows on
    # Windows and keeps embedded newlines in review text intact); UTF-8 keeps
    # accented characters writable regardless of the platform default.
    with open(OUTPUT_FILE, 'a', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(CSV_HEADER)
        review_id = 1
        for url in urls:
            page = requests.get(url, timeout=REQUEST_TIMEOUT).text
            soup = BeautifulSoup(page, "lxml")
            for review in soup.find_all("div", {"class": "movie-review-wrapper"}):
                writer.writerow([review_id, *extract_review(review)])
                review_id += 1


if __name__ == "__main__":
    main()
Requirement already satisfied: bs4 in /root/venv/lib/python3.9/site-packages (0.0.1)
Requirement already satisfied: beautifulsoup4 in /root/venv/lib/python3.9/site-packages (from bs4) (4.11.1)
Requirement already satisfied: soupsieve>1.2 in /root/venv/lib/python3.9/site-packages (from beautifulsoup4->bs4) (2.3.2.post1)
WARNING: You are using pip version 22.0.4; however, version 22.2.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Requirement already satisfied: html5lib in /root/venv/lib/python3.9/site-packages (1.1)
Requirement already satisfied: webencodings in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from html5lib) (0.5.1)
Requirement already satisfied: six>=1.9 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from html5lib) (1.16.0)
WARNING: You are using pip version 22.0.4; however, version 22.2.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Requirement already satisfied: lxml in /shared-libs/python3.9/py/lib/python3.9/site-packages (4.9.0)
WARNING: You are using pip version 22.0.4; however, version 22.2.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.