Objective
The objective of this project is to web-scrape British Airways customer review data from the web, perform sentiment analysis using the Azure AI API for Python, and present insights.
Checking and setting up environment variables
To set the environment variable for your Language resource key, open a console window, and follow the instructions for your operating system and development environment.
os.environ['LANGUAGE_KEY']= 'key' os.environ['LANGUAGE_ENDPOINT']= 'endpoint'
!pip install beautifulsoup4
!pip install azure-ai-textanalytics==5.2.0
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
# Read the Language resource credentials configured earlier; None if unset.
language_key = os.environ.get('LANGUAGE_KEY')
language_endpoint = os.environ.get('LANGUAGE_ENDPOINT')
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Empty review table; each column is filled in after the scraping passes below.
review_columns = ['Date', 'Rating', 'Reviews_heading', 'Reviews_text',
                  'aircraft', 'Traveller_type']
df = pd.DataFrame(columns=review_columns)
Authenticate the client using your key and endpoint
def authenticate_client():
    """Return a TextAnalyticsClient authenticated with the configured key/endpoint."""
    credential = AzureKeyCredential(language_key)
    return TextAnalyticsClient(endpoint=language_endpoint, credential=credential)
client = authenticate_client()
Extracting data with BeautifulSoup
# Page-keyed caches of BeautifulSoup result sets, one dict per scraped field.
subsoup={}   # review headers (h2.text_header)
subsoup2={}  # rating values (span[itemprop=ratingValue])
subsoup3={}  # review bodies (div.text_content)
subsoup4={}  # review stats tables (div.review-stats)
subsoup5={}  # scratch dict reused while parsing the stats tables
subsoup6={}  # publish dates (time[itemprop=datePublished])
subsoup7={}  # reviewer names (span[itemprop=name])
## extract the review header
def soup_extract_header(x):
    """Scrape pages 1..x and cache each page's <h2 class="text_header"> tags
    (the review titles) in the global `subsoup` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout keeps a stalled connection from hanging the whole scrape;
        # raise_for_status stops us from silently parsing an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup[i] = bs.find_all('h2', class_='text_header')
    return subsoup
## extract user review rating
def soup_extract_rating(x):
    """Scrape pages 1..x and cache each page's <span itemprop="ratingValue">
    tags (the per-review scores) in the global `subsoup2` dict, keyed by page.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup2[i] = bs.find_all('span', itemprop="ratingValue")
    return subsoup2
### extract the review text
def soup_extract_content(x):
    """Scrape pages 1..x and cache each page's review-body
    <div class="text_content" itemprop="reviewBody"> tags in the global
    `subsoup3` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup3[i] = bs.find_all('div', class_='text_content', itemprop="reviewBody")
    return subsoup3
## extract the attributes of customer
def soup_extract_stats(x):
    """Scrape pages 1..x and cache each page's <div class="review-stats">
    blocks (seat type, route, date flown, recommended, ...) in the global
    `subsoup4` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup4[i] = bs.find_all('div', class_='review-stats')
    return subsoup4
## extract the review date
def soup_extract_date(x):
    """Scrape pages 1..x and cache each page's <time itemprop="datePublished">
    tags in the global `subsoup6` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup6[i] = bs.find_all('time', itemprop="datePublished")
    return subsoup6
## extract customer name
def soup_extract_name(x):
    """Scrape pages 1..x and cache each page's <span itemprop="name"> tags
    in the global `subsoup7` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup7[i] = bs.find_all('span', itemprop="name")
    return subsoup7
Run the scraping functions
# Number of review pages to scrape (100 reviews per page).
NUM_PAGES = 10  # was repeated as a magic number in every call below

subsoup_obj1 = soup_extract_header(NUM_PAGES)
subsoup_obj2 = soup_extract_rating(NUM_PAGES)
subsoup_obj3 = soup_extract_content(NUM_PAGES)
subsoup_obj4 = soup_extract_stats(NUM_PAGES)
subsoup_obj6 = soup_extract_date(NUM_PAGES)
subsoup_obj7 = soup_extract_name(NUM_PAGES)
import re

# Flatten each page's header tags into a list of cleaned title strings.
# re.findall(r'\w+') keeps word characters only, stripping the quotation
# marks and punctuation that wrap the scraped titles.
reviews_heading = []
for i in range(1, len(subsoup_obj1) + 1):
    for header_tag in subsoup_obj1[i]:
        raw = " ".join(header_tag.contents)
        reviews_heading.append(" ".join(re.findall(r'\w+', raw)))
# was a bare `reviews_heading[0:10]` expression -- a no-op outside a notebook
print(reviews_heading[0:10])
df['Reviews_heading'] = reviews_heading
# Pull the first integer out of each rating span ("9/10" -> "9").
ratings = []
for i in range(1, len(subsoup_obj2) + 1):
    # Cap at 100 (the requested pagesize): the page can expose extra
    # site-level ratingValue spans. Taking at most the first 100 matches
    # the original hard-coded range(0, 100) alignment, while a short final
    # page no longer raises IndexError.
    # NOTE(review): verify one ratingValue span per review -- an extra
    # header span would misalign this column against the reviews.
    count = min(100, len(subsoup_obj2[i]))
    for j in range(count):
        value = " ".join(subsoup_obj2[i][j].contents)
        ratings.append(re.findall(r'\d+', value)[0])
print(ratings[0:10])
df['Rating'] = ratings
# Flatten each review-body <div class="text_content"> into plain text.
# Use subsoup_obj3 (the value soup_extract_content returned) rather than
# reaching back into the global subsoup3 it happens to alias.
Reviews_text = []
for i in range(1, len(subsoup_obj3) + 1):
    for body_div in subsoup_obj3[i]:
        Reviews_text.append(" ".join(tag.text for tag in body_div.contents))
print(Reviews_text[0:10])
df['Reviews_text'] = Reviews_text
# Parse the per-review stats table (seat type, route, date flown, recommended).
# Cell counts vary per review (e.g. the aircraft row is often missing), so
# values are indexed from the END of the td.review-value list, where the
# positions are stable:
#   -1 recommended, -2 date flown, -3 route, -4 seat type
# NOTE(review): reviews with extra/missing trailing cells would silently
# shift these values -- spot-check the exported CSV against the site.
seat_type = []
route = []
date_flown = []
recommended = []
for i in range(1, len(subsoup_obj4) + 1):
    for stats_div in subsoup_obj4[i]:
        # contents[1] is the stats <table>; select its value cells directly
        # (the unused stats/aircraft/traveller_type lists and the global
        # subsoup5 scratch dict from the original were dead code)
        values = stats_div.contents[1].select('td.review-value')
        recommended.append(" ".join(values[-1].contents))
        route.append(" ".join(values[-3].contents))
        date_flown.append(" ".join(values[-2].contents))
        seat_type.append(" ".join(values[-4].contents))
print(seat_type[0:10])
print(recommended[0:10])
print(date_flown[0:10])
print(route[0:10])
df['Seat_type'] = seat_type
df['Route'] = route
df['Date_Flown'] = date_flown
df['Recommend'] = recommended
df.head()
# Collect the published-date string from each <time itemprop="datePublished">.
# Use subsoup_obj6 (the returned value) rather than the aliased global subsoup6.
date = []
for i in range(1, len(subsoup_obj6) + 1):
    for time_tag in subsoup_obj6[i]:
        date.append(time_tag.contents[0])
print(date[0:10])
print(len(date))
df['Date'] = date
# Collect reviewer names from <span itemprop="name">.
# Use subsoup_obj7 (the returned value) rather than the aliased global subsoup7.
# NOTE(review): this selector can also match non-reviewer name spans
# (e.g. the airline name) -- verify counts line up with the review rows.
name = []
for i in range(1, len(subsoup_obj7) + 1):
    for name_span in subsoup_obj7[i]:
        name.append(name_span.contents[0])
# the originals were placeholder-less f-strings with a stray second argument
print(f'name={name[0:10]}')
print(f'length of name: {len(name)}')
df['Name'] = name
df.info()
df.head()
### converting date to datetime object
# Parse the scraped date strings, derive Year/Month/Day features, drop the
# columns that were never populated, and index the table by date.
df['Date_new'] = pd.to_datetime(df['Date'])
df.drop('Date', axis=1, inplace=True)
df.drop(['aircraft', 'Traveller_type'], axis=1, inplace=True)
df.shape
parsed = df['Date_new'].dt
df['Year'] = parsed.year
df['Month'] = parsed.month
df['Day'] = parsed.day
df.set_index('Date_new', inplace=True)
df.head()
Detecting sentiments for each record
# Per-row confidence scores, keyed by integer row position in df.
positive_score = {}
negative_score = {}
neutral_score = {}


def sentiment_analysis_with_opinion_mining_example(client):
    """Score every review in the global `df` with Azure sentiment analysis.

    Sends each Reviews_text value to the Language service one document at a
    time and records the positive/neutral/negative confidence scores in the
    module-level positive_score / neutral_score / negative_score dicts,
    keyed by row position.
    """
    # name-based lookup: the original df.iloc[i, 2] silently broke whenever
    # the column order changed
    reviews = df['Reviews_text']
    for i in range(len(df)):
        documents = [reviews.iloc[i]]
        result = client.analyze_sentiment(documents, show_opinion_mining=False)
        # skip documents the service flagged as errors
        for document in (doc for doc in result if not doc.is_error):
            print("Document Sentiment: {}".format(document.sentiment))
            print("Overall scores: positive={0:.2f}; neutral={1:.2f}; negative={2:.2f} \n".format(
                document.confidence_scores.positive,
                document.confidence_scores.neutral,
                document.confidence_scores.negative))
            positive_score[i] = document.confidence_scores.positive
            neutral_score[i] = document.confidence_scores.neutral
            negative_score[i] = document.confidence_scores.negative
# Run the scoring pass, then join the three score columns onto the reviews.
sentiment_analysis_with_opinion_mining_example(client)

# Build one frame with all three sentiment columns; a Series from a dict
# uses the dict keys (row positions) as its index.
# NOTE(review): fixed the 'Postive_score' typo in the output column name --
# update any downstream consumers of sentiments.csv accordingly.
sentiment_df = pd.DataFrame({
    'Positive_score': pd.Series(positive_score),
    'Neutral_score': pd.Series(neutral_score),
    'Negative_score': pd.Series(negative_score),
})
df.reset_index(drop=True, inplace=True)
df_final = pd.concat([df, sentiment_df], axis=1)

# Exporting to csv file
df_final.to_csv('sentiments.csv')
df_final.head()