def collectSportsNews():
    # Scrape the Sports section (landing page plus pages 2-99) and store each
    # headline in the NEWS table with the "SPORTS" label.
    URL = "https://indianexpress.com/section/sports/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "SPORTS"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "SPORTS"))
    connection.commit()
def collectEntertainmentData():
    # Scrape the Entertainment section (landing page plus pages 2-99).
    URL = "https://indianexpress.com/section/entertainment/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "ENTERTAINMENT"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "ENTERTAINMENT"))
    connection.commit()
def collectTechData():
    # Scrape the Technology section.
    URL = "https://indianexpress.com/section/technology/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    # First-page scrape disabled; only the paginated 'article-list' pages below are used.
    # selected = soup.find('ul', attrs={'class': 'article-list'})
    # newss = selected.findAll('li')
    # for news in newss:
    #     c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.findAll('a')[1].text, "TECH"))
    # Only pages 100-105 are scraped for the TECH category.
    for i in range(100, 106):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('ul', attrs={'class': 'article-list'})
        newss = selected.findAll('li')
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.findAll('a')[1].text, "TECH"))
    connection.commit()
def collectBusinessData():
    # Scrape the Business section (landing page plus pages 2-99).
    URL = "https://indianexpress.com/section/business/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "BUSINESS"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "BUSINESS"))
    connection.commit()
def collectEducationData():
    # Scrape the Education section (landing page plus pages 2-99).
    URL = "https://indianexpress.com/section/education/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "EDUCATION"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "EDUCATION"))
    connection.commit()
def getAllInfoData():
    # Print the number of stored articles per category.
    c.execute("SELECT category, COUNT(*) FROM NEWS GROUP BY category")
    print(c.fetchall())
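# The five collectors above repeat the same fetch/parse/insert loop, varying only
# the URL, the category label, and the page range. A minimal sketch of one
# parameterized helper they could share; the collectSection name, the selector
# arguments, and the None guards are illustrative assumptions, not part of the
# original notebook.
def collectSection(section_url, category, pages,
                   selector=('div', 'nation'), item_selector=('div', 'articles')):
    for i in pages:
        page_url = section_url if i == 1 else section_url + "page/" + str(i)
        r = requests.get(page_url)
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find(selector[0], attrs={'class': selector[1]})
        if selected is None:
            # Page layout differs (as on the tech section's landing page); skip it.
            continue
        for news in selected.findAll(item_selector[0], attrs={'class': item_selector[1]}):
            p = news.find('p')
            if p is not None:
                c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (p.text, category))
    connection.commit()
# Usage (hypothetical): collectSection("https://indianexpress.com/section/sports/", "SPORTS", range(1, 100))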
!pip install bs4
import sqlite3
import requests
from bs4 import BeautifulSoup
import os
# If the database does not exist yet, ask for permission and scrape; otherwise reuse it.
if not os.path.isfile("News.db"):
    print("#" * 50)
    choice = input("Can I scrape the data from the net? Y-n: ")
    if choice.lower() in ("y", "yes"):
        connection = sqlite3.connect("News.db")
        c = connection.cursor()
        c.execute("""CREATE TABLE NEWS(news_article text, category text)""")
        collectSportsNews()
        collectEntertainmentData()
        collectTechData()
        collectEducationData()
        collectBusinessData()
        getAllInfoData()
else:
    connection = sqlite3.connect("News.db")
    c = connection.cursor()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
sql_query = pd.read_sql_query('''SELECT * FROM NEWS''', connection)
df = pd.DataFrame(sql_query, columns=['news_article', 'category'])
# print (df)
# getAllInfoData()
df.head()
df.category.value_counts()
max(df["news_article"])
min(df["news_article"])
df.isnull().sum()
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
df["categoryId"] = lc.fit_transform(df["category"])
df.sample(10)
pickle.dump(lc, open('helpers/LabelEncoder.pickel', 'wb'))
categoryDF = df[["category", "categoryId"]].drop_duplicates().sort_values(by="categoryId")
categoryDF
df["category"].value_counts().plot(kind="pie", shadow=True, autopct='%1.1f%%')
df["category"].value_counts().plot(kind="bar")
!pip install wordcloud
import nltk
from wordcloud import WordCloud, STOPWORDS
# Build the stopword list ("br" and "href" are HTML leftovers in the scraped text):
stopwords = set(STOPWORDS)
stopwords.update(["br", "href"])
textt = " ".join(review for review in df["news_article"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/wordcloud11.png')
plt.show()
"' '".join(set(" ".join(review for review in df["news_article"])))
# REMOVE SPECIAL CHARACTERS
import re
def filterNonAscii(text):
    # Split on runs of non-word characters and rejoin, dropping punctuation and symbols.
    split = re.split(r"\W+", text)
    return " ".join(split)
df["filter1"] = df["news_article"].apply(lambda x: filterNonAscii(x.lower()))
"' '".join(set(" ".join(review for review in df["filter1"])))
nltk.download('stopwords')
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(["br", "href"])
    text = [word for word in text.split(" ") if word not in stopwords]
    return " ".join(text)
df["filter1"] = df["filter1"].apply(lambda x: remove_stopwords(x))
print(df["news_article"][4])
print("-"*170)
print(df["filter1"][4])
from nltk.stem import WordNetLemmatizer
def lemmitize(text):
    wordnet = WordNetLemmatizer()
    return " ".join([wordnet.lemmatize(word) for word in text.split(" ")])
nltk.download('wordnet')
df["filter2"] = df["filter1"].apply(lambda x: lemmitize(x))
print(df["news_article"][4])
print("-"*170)
print(df["filter2"][4])
sportsDF = df[df["category"]=="SPORTS"]
teachDF = df[df["category"]=="TECH"]
entertainmentDF = df[df["category"]=="ENTERTAINMENT"]
businessDF = df[df["category"]=="BUSINESS"]
educationDF = df[df["category"]=="EDUCATION"]
educationDF.sample(3)
textt = " ".join(review for review in sportsDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/sports.png')
plt.show()
textt = " ".join(review for review in educationDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/edu.png')
plt.show()
textt = " ".join(review for review in businessDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/business.png')
plt.show()
textt = " ".join(review for review in teachDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/tech.png')
plt.show()
textt = " ".join(review for review in entertainmentDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/entertainment.png')
plt.show()
from sklearn.feature_extraction.text import CountVectorizer
y = np.array(df.categoryId.values)
# Bag-of-words features capped at the 6000 most frequent tokens.
cv = CountVectorizer(max_features=6000)
x = cv.fit_transform(df.filter2).toarray()
print("X.shape = ", x.shape)
print("y.shape = ", y.shape)
pickle.dump(cv, open('helpers/countVectorizer.pickel', 'wb'))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 10, shuffle = True)
print(len(x_train))
print(len(x_test))
# Class distribution (in %) of the random training split vs. the full dataset;
# the sum of squared differences measures how far the split drifts from the true proportions.
y_train_sampling = np.unique(y_train, return_counts=True)[1] / len(y_train) * 100
y_sampling = np.unique(y, return_counts=True)[1] / len(y) * 100
y_train_sampling
y_sampling
np.sum(np.square(y_train_sampling - y_sampling))
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0, shuffle = True)
# print(len(x_train))
# print(len(x_test))
# A stratified split preserves the class proportions in both train and test sets.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, test_index in split.split(df, df["categoryId"]):
    x_train = x[train_index]
    y_train = y[train_index]
    x_test = x[test_index]
    y_test = y[test_index]
y_train_strat_sampling = np.unique(y_train, return_counts=True)[1] / len(y_train) * 100
y_train_strat_sampling
# This should be much smaller than the squared drift of the plain random split above.
np.sum(np.square(y_train_strat_sampling - y_sampling))
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
def trainAModel(mdl):
    # Map a model name to the corresponding sklearn estimator.
    if mdl == "logistic":
        mdl = LogisticRegression()
    elif mdl == "SVC":
        mdl = SVC()
    elif mdl == "decisionTree":
        mdl = DecisionTreeClassifier()
    elif mdl == "randomForest":
        mdl = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
    elif mdl == "knearestN":
        mdl = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=4)
    else:
        mdl = MultinomialNB(alpha=1.0, fit_prior=True)
    # Train one binary classifier per category (one-vs-rest) and evaluate on the test split.
    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(x_train, y_train)
    y_pred = oneVsRest.predict(x_test)
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    print("Accuracy: " + str(accuracy))
    print("Classification Report:\n" + classification_report(y_true=y_test, y_pred=y_pred))
    # print("Recall Score: " + str(recall_score(y_test, y_pred, average='micro')))
    # print("F1 Score: " + str(f1_score(y_test, y_pred, average='micro')))
    print("Confusion Matrix:")
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True)
    print()
    return oneVsRest
%%time
LogisticMdl = trainAModel("logistic")
pickle.dump(LogisticMdl, open('models/logisticMdl.sav', 'wb'))
%%time
dTreeMdl = trainAModel("decissionTree")
pickle.dump(dTreeMdl, open('models/decissionTree.sav', 'wb'))
%%time
naiveMdl = trainAModel("naive")
pickle.dump(naiveMdl, open('models/naiveMdl.sav', 'wb'))
%%time
svcMdl = trainAModel("SVC")
pickle.dump(svcMdl, open('models/svcMdl.sav', 'wb'))
# Use the naive Bayes model for inference.
mdl = naiveMdl
def PredictNewsType(news):
    # Accept a single headline or a list of headlines.
    if type(news) == str:
        news = [news]
    toBePredicted = pd.DataFrame(news)[0]
    # Apply the same preprocessing pipeline used on the training data.
    toBePredicted = toBePredicted.apply(lambda x: filterNonAscii(x.lower()))
    toBePredicted = toBePredicted.apply(lambda x: remove_stopwords(x))
    toBePredicted = toBePredicted.apply(lambda x: lemmitize(x))
    features = cv.transform(toBePredicted).toarray()
    outs = mdl.predict(features)
    # predict_proba needs a model that supports it (MultinomialNB does; SVC would need probability=True).
    outs_prob = mdl.predict_proba(features)
    ans = []
    for i, out in enumerate(outs):
        ans.append((lc.inverse_transform([out])[0], np.round(max(outs_prob[i]) * 100, 2)))
    return ans
PredictNewsType(["Before going to Spain, Loh had trained in Dubai with Tokyo Olympics champion Viktor Axelsen and Sen and said the young Indian is an extremely talented player.",
"The second Test saw quite a few heated exchanges between the players and with a fit-again Kohli back for the all-important game, Elgar feels more verbal blows will be traded at the Newlands.",
"China’s Chang’e 5 lunar probe finds first on-site evidence of water on moon’s surface",
"Candidates can check the detailed schedule on the official website of MCC- mcc.nic.in. The registration process for the counselling will begin on January 12.",
"In its pre-Budget memorandum, the Hotel Association of India (HAI) said policy interventions are imperative for the sector's survival and its early and quick rebound to normalcy.",
"Oh Young Soo was nominated alongside Billy Crudup and Mark Duplass in “The Morning Show,” Kieran Culkin in “Succession,” and Brett Goldstein in “Ted Lasso.”",
])
PredictNewsType("Classes will be online since june")
# Sanity check: which category does label id 2 map to?
lc.inverse_transform([2])[0]
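# Everything needed for inference (the label encoder, count vectorizer, and
# trained models) was pickled above, so a separate script could reload those
# artifacts instead of rerunning the notebook. A minimal sketch, assuming the
# preprocessing helpers (filterNonAscii, remove_stopwords, lemmitize) are
# importable by that script; predict_category is a hypothetical name.
import pickle
import numpy as np

lc = pickle.load(open('helpers/LabelEncoder.pickel', 'rb'))
cv = pickle.load(open('helpers/countVectorizer.pickel', 'rb'))
mdl = pickle.load(open('models/naiveMdl.sav', 'rb'))

def predict_category(text):
    # Same preprocessing chain as PredictNewsType above.
    cleaned = lemmitize(remove_stopwords(filterNonAscii(text.lower())))
    features = cv.transform([cleaned]).toarray()
    label = mdl.predict(features)[0]
    prob = np.round(max(mdl.predict_proba(features)[0]) * 100, 2)
    return lc.inverse_transform([label])[0], prob

# e.g. predict_category("Classes will be online since June")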