Objective
The objective of this project is to web-scrape British Airways customer review data from the web, perform sentiment analysis using the Azure AI API for Python, and present insights.
Checking and setting up environment variables
To set the environment variable for your Language resource key, open a console window, and follow the instructions for your operating system and development environment.
os.environ['LANGUAGE_KEY']= 'key' os.environ['LANGUAGE_ENDPOINT']= 'endpoint'
!pip install beautifulsoup4
!pip install azure-ai-textanalytics==5.2.0
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
# Read the Language resource credentials configured earlier; None if unset.
language_key = os.environ.get('LANGUAGE_KEY')
language_endpoint = os.environ.get('LANGUAGE_ENDPOINT')
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Empty review table; each column is filled in after the scraping passes below.
review_columns = ['Date', 'Rating', 'Reviews_heading', 'Reviews_text',
                  'aircraft', 'Traveller_type']
df = pd.DataFrame(columns=review_columns)
Authenticate the client using your key and endpoint
def authenticate_client():
    """Return a TextAnalyticsClient authenticated with the configured key/endpoint."""
    credential = AzureKeyCredential(language_key)
    return TextAnalyticsClient(endpoint=language_endpoint, credential=credential)
client = authenticate_client()
Extracting data with BeautifulSoup
# Page-keyed caches of BeautifulSoup result sets, one dict per scraped field.
subsoup={}   # review headers (h2.text_header)
subsoup2={}  # rating values (span[itemprop=ratingValue])
subsoup3={}  # review bodies (div.text_content)
subsoup4={}  # review stats tables (div.review-stats)
subsoup5={}  # scratch dict reused while parsing the stats tables
subsoup6={}  # publish dates (time[itemprop=datePublished])
subsoup7={}  # reviewer names (span[itemprop=name])
## extract the review header
def soup_extract_header(x):
    """Scrape pages 1..x and cache each page's <h2 class="text_header"> tags
    (the review titles) in the global `subsoup` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout keeps a stalled connection from hanging the whole scrape;
        # raise_for_status stops us from silently parsing an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup[i] = bs.find_all('h2', class_='text_header')
    return subsoup
## extract user review rating
def soup_extract_rating(x):
    """Scrape pages 1..x and cache each page's <span itemprop="ratingValue">
    tags (the per-review scores) in the global `subsoup2` dict, keyed by page.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup2[i] = bs.find_all('span', itemprop="ratingValue")
    return subsoup2
### extract the review text
def soup_extract_content(x):
    """Scrape pages 1..x and cache each page's review-body
    <div class="text_content" itemprop="reviewBody"> tags in the global
    `subsoup3` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup3[i] = bs.find_all('div', class_='text_content', itemprop="reviewBody")
    return subsoup3
## extract the attributes of customer
def soup_extract_stats(x):
    """Scrape pages 1..x and cache each page's <div class="review-stats">
    blocks (seat type, route, date flown, recommended, ...) in the global
    `subsoup4` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup4[i] = bs.find_all('div', class_='review-stats')
    return subsoup4
## extract the review date
def soup_extract_date(x):
    """Scrape pages 1..x and cache each page's <time itemprop="datePublished">
    tags in the global `subsoup6` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup6[i] = bs.find_all('time', itemprop="datePublished")
    return subsoup6
## extract customer name
def soup_extract_name(x):
    """Scrape pages 1..x and cache each page's <span itemprop="name"> tags
    in the global `subsoup7` dict, keyed by page number.

    Returns the populated dict.
    """
    for i in range(1, x + 1):
        url = (f'https://www.airlinequality.com/airline-reviews/british-airways/'
               f'page/{i}/?sortby=post_date%3ADesc&pagesize=100')
        # timeout prevents an indefinite hang; raise_for_status fails fast
        # instead of scraping an HTTP error page
        html = requests.get(url, timeout=30)
        html.raise_for_status()
        bs = BeautifulSoup(html.text, 'html.parser')
        subsoup7[i] = bs.find_all('span', itemprop="name")
    return subsoup7
Run the scraping functions
# Number of review pages to scrape (100 reviews per page).
NUM_PAGES = 10  # was repeated as a magic number in every call below

subsoup_obj1 = soup_extract_header(NUM_PAGES)
subsoup_obj2 = soup_extract_rating(NUM_PAGES)
subsoup_obj3 = soup_extract_content(NUM_PAGES)
subsoup_obj4 = soup_extract_stats(NUM_PAGES)
subsoup_obj6 = soup_extract_date(NUM_PAGES)
subsoup_obj7 = soup_extract_name(NUM_PAGES)
import re

# Flatten each page's header tags into a list of cleaned title strings.
# re.findall(r'\w+') keeps word characters only, stripping the quotation
# marks and punctuation that wrap the scraped titles.
reviews_heading = []
for i in range(1, len(subsoup_obj1) + 1):
    for header_tag in subsoup_obj1[i]:
        raw = " ".join(header_tag.contents)
        reviews_heading.append(" ".join(re.findall(r'\w+', raw)))
# was a bare `reviews_heading[0:10]` expression -- a no-op outside a notebook
print(reviews_heading[0:10])
df['Reviews_heading'] = reviews_heading
# Pull the first integer out of each rating span ("9/10" -> "9").
ratings = []
for i in range(1, len(subsoup_obj2) + 1):
    # Cap at 100 (the requested pagesize): the page can expose extra
    # site-level ratingValue spans. Taking at most the first 100 matches
    # the original hard-coded range(0, 100) alignment, while a short final
    # page no longer raises IndexError.
    # NOTE(review): verify one ratingValue span per review -- an extra
    # header span would misalign this column against the reviews.
    count = min(100, len(subsoup_obj2[i]))
    for j in range(count):
        value = " ".join(subsoup_obj2[i][j].contents)
        ratings.append(re.findall(r'\d+', value)[0])
print(ratings[0:10])
df['Rating'] = ratings
# Flatten each review-body <div class="text_content"> into plain text.
# Use subsoup_obj3 (the value soup_extract_content returned) rather than
# reaching back into the global subsoup3 it happens to alias.
Reviews_text = []
for i in range(1, len(subsoup_obj3) + 1):
    for body_div in subsoup_obj3[i]:
        Reviews_text.append(" ".join(tag.text for tag in body_div.contents))
print(Reviews_text[0:10])
df['Reviews_text'] = Reviews_text
# Parse the per-review stats table (seat type, route, date flown, recommended).
# Cell counts vary per review (e.g. the aircraft row is often missing), so
# values are indexed from the END of the td.review-value list, where the
# positions are stable:
#   -1 recommended, -2 date flown, -3 route, -4 seat type
# NOTE(review): reviews with extra/missing trailing cells would silently
# shift these values -- spot-check the exported CSV against the site.
seat_type = []
route = []
date_flown = []
recommended = []
for i in range(1, len(subsoup_obj4) + 1):
    for stats_div in subsoup_obj4[i]:
        # contents[1] is the stats <table>; select its value cells directly
        # (the unused stats/aircraft/traveller_type lists and the global
        # subsoup5 scratch dict from the original were dead code)
        values = stats_div.contents[1].select('td.review-value')
        recommended.append(" ".join(values[-1].contents))
        route.append(" ".join(values[-3].contents))
        date_flown.append(" ".join(values[-2].contents))
        seat_type.append(" ".join(values[-4].contents))
print(seat_type[0:10])
print(recommended[0:10])
print(date_flown[0:10])
print(route[0:10])
df['Seat_type'] = seat_type
df['Route'] = route
df['Date_Flown'] = date_flown
df['Recommend'] = recommended
df.head()
# Collect the published-date string from each <time itemprop="datePublished">.
# Use subsoup_obj6 (the returned value) rather than the aliased global subsoup6.
date = []
for i in range(1, len(subsoup_obj6) + 1):
    for time_tag in subsoup_obj6[i]:
        date.append(time_tag.contents[0])
print(date[0:10])
print(len(date))
df['Date'] = date
# Collect reviewer names from <span itemprop="name">.
# Use subsoup_obj7 (the returned value) rather than the aliased global subsoup7.
# NOTE(review): this selector can also match non-reviewer name spans
# (e.g. the airline name) -- verify counts line up with the review rows.
name = []
for i in range(1, len(subsoup_obj7) + 1):
    for name_span in subsoup_obj7[i]:
        name.append(name_span.contents[0])
# the originals were placeholder-less f-strings with a stray second argument
print(f'name={name[0:10]}')
print(f'length of name: {len(name)}')
df['Name'] = name
df.info()
df.head()
### converting date to datetime object
# Parse the scraped date strings, derive Year/Month/Day features, drop the
# columns that were never populated, and index the table by date.
df['Date_new'] = pd.to_datetime(df['Date'])
df.drop('Date', axis=1, inplace=True)
df.drop(['aircraft', 'Traveller_type'], axis=1, inplace=True)
df.shape
parsed = df['Date_new'].dt
df['Year'] = parsed.year
df['Month'] = parsed.month
df['Day'] = parsed.day
df.set_index('Date_new', inplace=True)
df.head()
Detecting sentiments for each record
# Per-row confidence scores, keyed by integer row position in df.
positive_score = {}
negative_score = {}
neutral_score = {}


def sentiment_analysis_with_opinion_mining_example(client):
    """Score every review in the global `df` with Azure sentiment analysis.

    Sends each Reviews_text value to the Language service one document at a
    time and records the positive/neutral/negative confidence scores in the
    module-level positive_score / neutral_score / negative_score dicts,
    keyed by row position.
    """
    # name-based lookup: the original df.iloc[i, 2] silently broke whenever
    # the column order changed
    reviews = df['Reviews_text']
    for i in range(len(df)):
        documents = [reviews.iloc[i]]
        result = client.analyze_sentiment(documents, show_opinion_mining=False)
        # skip documents the service flagged as errors
        for document in (doc for doc in result if not doc.is_error):
            print("Document Sentiment: {}".format(document.sentiment))
            print("Overall scores: positive={0:.2f}; neutral={1:.2f}; negative={2:.2f} \n".format(
                document.confidence_scores.positive,
                document.confidence_scores.neutral,
                document.confidence_scores.negative))
            positive_score[i] = document.confidence_scores.positive
            neutral_score[i] = document.confidence_scores.neutral
            negative_score[i] = document.confidence_scores.negative
# Run the scoring pass, then join the three score columns onto the reviews.
sentiment_analysis_with_opinion_mining_example(client)

# Build one frame with all three sentiment columns; a Series from a dict
# uses the dict keys (row positions) as its index.
# NOTE(review): fixed the 'Postive_score' typo in the output column name --
# update any downstream consumers of sentiments.csv accordingly.
sentiment_df = pd.DataFrame({
    'Positive_score': pd.Series(positive_score),
    'Neutral_score': pd.Series(neutral_score),
    'Negative_score': pd.Series(negative_score),
})
df.reset_index(drop=True, inplace=True)
df_final = pd.concat([df, sentiment_df], axis=1)

# Exporting to csv file
df_final.to_csv('sentiments.csv')
df_final.head()