def collectSportsNews():
    # Scrape the Sports section (landing page plus pages 2-99) and store each
    # headline in the NEWS table with the "SPORTS" label.
    URL = "https://indianexpress.com/section/sports/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "SPORTS"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "SPORTS"))
    connection.commit()
def collectEntertainmentData():
    # Scrape the Entertainment section (landing page plus pages 2-99).
    URL = "https://indianexpress.com/section/entertainment/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "ENTERTAINMENT"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "ENTERTAINMENT"))
    connection.commit()
def collectTechData():
    # Scrape the Technology section.
    URL = "https://indianexpress.com/section/technology/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    # First-page scrape disabled; only the paginated 'article-list' pages below are used.
    # selected = soup.find('ul', attrs={'class': 'article-list'})
    # newss = selected.findAll('li')
    # for news in newss:
    #     c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.findAll('a')[1].text, "TECH"))
    # Only pages 100-105 are scraped for the TECH category.
    for i in range(100, 106):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('ul', attrs={'class': 'article-list'})
        newss = selected.findAll('li')
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.findAll('a')[1].text, "TECH"))
    connection.commit()
def collectBusinessData():
    # Scrape the Business section (landing page plus pages 2-99).
    URL = "https://indianexpress.com/section/business/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "BUSINESS"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "BUSINESS"))
    connection.commit()
def collectEducationData():
    # Scrape the Education section (landing page plus pages 2-99).
    URL = "https://indianexpress.com/section/education/"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    selected = soup.find('div', attrs={'class': 'nation'})
    newss = selected.findAll('div', attrs={'class': 'articles'})
    for news in newss:
        c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "EDUCATION"))
    for i in range(2, 100):
        r = requests.get(URL + "page/" + str(i))
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find('div', attrs={'class': 'nation'})
        newss = selected.findAll('div', attrs={'class': 'articles'})
        for news in newss:
            c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (news.find('p').text, "EDUCATION"))
    connection.commit()
def getAllInfoData():
    # Print the number of stored articles per category.
    c.execute("SELECT category, COUNT(*) FROM NEWS GROUP BY category")
    print(c.fetchall())
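# The five collectors above repeat the same fetch/parse/insert loop, varying only
# the URL, the category label, and the page range. A minimal sketch of one
# parameterized helper they could share; the collectSection name, the selector
# arguments, and the None guards are illustrative assumptions, not part of the
# original notebook.
def collectSection(section_url, category, pages,
                   selector=('div', 'nation'), item_selector=('div', 'articles')):
    for i in pages:
        page_url = section_url if i == 1 else section_url + "page/" + str(i)
        r = requests.get(page_url)
        soup = BeautifulSoup(r.content, 'html5lib')
        selected = soup.find(selector[0], attrs={'class': selector[1]})
        if selected is None:
            # Page layout differs (as on the tech section's landing page); skip it.
            continue
        for news in selected.findAll(item_selector[0], attrs={'class': item_selector[1]}):
            p = news.find('p')
            if p is not None:
                c.execute("INSERT INTO NEWS(news_article, category) VALUES (?,?)", (p.text, category))
    connection.commit()
# Usage (hypothetical): collectSection("https://indianexpress.com/section/sports/", "SPORTS", range(1, 100))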
!pip install bs4
import sqlite3
import requests
from bs4 import BeautifulSoup
import os
# If the database does not exist yet, ask for permission and scrape; otherwise reuse it.
if not os.path.isfile("News.db"):
    print("#" * 50)
    choice = input("Can I scrape the data from the net? Y-n: ")
    if choice.lower() in ("y", "yes"):
        connection = sqlite3.connect("News.db")
        c = connection.cursor()
        c.execute("""CREATE TABLE NEWS(news_article text, category text)""")
        collectSportsNews()
        collectEntertainmentData()
        collectTechData()
        collectEducationData()
        collectBusinessData()
        getAllInfoData()
else:
    connection = sqlite3.connect("News.db")
    c = connection.cursor()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
sql_query = pd.read_sql_query('''SELECT * FROM NEWS''', connection)
df = pd.DataFrame(sql_query, columns=['news_article', 'category'])
# print (df)
# getAllInfoData()
df.head()
df.category.value_counts()
max(df["news_article"])
min(df["news_article"])
df.isnull().sum()
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
df["categoryId"] = lc.fit_transform(df["category"])
df.sample(10)
pickle.dump(lc, open('helpers/LabelEncoder.pickel', 'wb'))
categoryDF = df[["category", "categoryId"]].drop_duplicates().sort_values(by="categoryId")
categoryDF
df["category"].value_counts().plot(kind="pie", shadow=True, autopct='%1.1f%%')
df["category"].value_counts().plot(kind="bar")
!pip install wordcloud
import nltk
from wordcloud import WordCloud, STOPWORDS
# Build the stopword list ("br" and "href" are HTML leftovers in the scraped text):
stopwords = set(STOPWORDS)
stopwords.update(["br", "href"])
textt = " ".join(review for review in df["news_article"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/wordcloud11.png')
plt.show()
"' '".join(set(" ".join(review for review in df["news_article"])))
# REMOVE SPECIAL CHARACTERS
import re
def filterNonAscii(text):
    # Split on runs of non-word characters and rejoin, dropping punctuation and symbols.
    split = re.split(r"\W+", text)
    return " ".join(split)
df["filter1"] = df["news_article"].apply(lambda x: filterNonAscii(x.lower()))
"' '".join(set(" ".join(review for review in df["filter1"])))
nltk.download('stopwords')
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(["br", "href"])
    text = [word for word in text.split(" ") if word not in stopwords]
    return " ".join(text)
df["filter1"] = df["filter1"].apply(lambda x: remove_stopwords(x))
print(df["news_article"][4])
print("-"*170)
print(df["filter1"][4])
from nltk.stem import WordNetLemmatizer
def lemmitize(text):
    wordnet = WordNetLemmatizer()
    return " ".join([wordnet.lemmatize(word) for word in text.split(" ")])
nltk.download('wordnet')
df["filter2"] = df["filter1"].apply(lambda x: lemmitize(x))
print(df["news_article"][4])
print("-"*170)
print(df["filter2"][4])
sportsDF = df[df["category"]=="SPORTS"]
teachDF = df[df["category"]=="TECH"]
entertainmentDF = df[df["category"]=="ENTERTAINMENT"]
businessDF = df[df["category"]=="BUSINESS"]
educationDF = df[df["category"]=="EDUCATION"]
educationDF.sample(3)
textt = " ".join(review for review in sportsDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/sports.png')
plt.show()
textt = " ".join(review for review in educationDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/edu.png')
plt.show()
textt = " ".join(review for review in businessDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/business.png')
plt.show()
textt = " ".join(review for review in teachDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/tech.png')
plt.show()
textt = " ".join(review for review in entertainmentDF["filter2"])
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('images/entertainment.png')
plt.show()
from sklearn.feature_extraction.text import CountVectorizer
y = np.array(df.categoryId.values)
# Bag-of-words features capped at the 6000 most frequent tokens.
cv = CountVectorizer(max_features=6000)
x = cv.fit_transform(df.filter2).toarray()
print("X.shape = ", x.shape)
print("y.shape = ", y.shape)
pickle.dump(cv, open('helpers/countVectorizer.pickel', 'wb'))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 10, shuffle = True)
print(len(x_train))
print(len(x_test))
# Class distribution (in %) of the random training split vs. the full dataset;
# the sum of squared differences measures how far the split drifts from the true proportions.
y_train_sampling = np.unique(y_train, return_counts=True)[1] / len(y_train) * 100
y_sampling = np.unique(y, return_counts=True)[1] / len(y) * 100
y_train_sampling
y_sampling
np.sum(np.square(y_train_sampling - y_sampling))
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0, shuffle = True)
# print(len(x_train))
# print(len(x_test))
# A stratified split preserves the class proportions in both train and test sets.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, test_index in split.split(df, df["categoryId"]):
    x_train = x[train_index]
    y_train = y[train_index]
    x_test = x[test_index]
    y_test = y[test_index]
y_train_strat_sampling = np.unique(y_train, return_counts=True)[1] / len(y_train) * 100
y_train_strat_sampling
# This should be much smaller than the squared drift of the plain random split above.
np.sum(np.square(y_train_strat_sampling - y_sampling))
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
def trainAModel(mdl):
    # Map a model name to the corresponding sklearn estimator.
    if mdl == "logistic":
        mdl = LogisticRegression()
    elif mdl == "SVC":
        mdl = SVC()
    elif mdl == "decisionTree":
        mdl = DecisionTreeClassifier()
    elif mdl == "randomForest":
        mdl = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
    elif mdl == "knearestN":
        mdl = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=4)
    else:
        mdl = MultinomialNB(alpha=1.0, fit_prior=True)
    # Train one binary classifier per category (one-vs-rest) and evaluate on the test split.
    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(x_train, y_train)
    y_pred = oneVsRest.predict(x_test)
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    print("Accuracy: " + str(accuracy))
    print("Classification Report:\n" + classification_report(y_true=y_test, y_pred=y_pred))
    # print("Recall Score: " + str(recall_score(y_test, y_pred, average='micro')))
    # print("F1 Score: " + str(f1_score(y_test, y_pred, average='micro')))
    print("Confusion Matrix:")
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True)
    print()
    return oneVsRest
%%time
LogisticMdl = trainAModel("logistic")
pickle.dump(LogisticMdl, open('models/logisticMdl.sav', 'wb'))
%%time
dTreeMdl = trainAModel("decissionTree")
pickle.dump(dTreeMdl, open('models/decissionTree.sav', 'wb'))
%%time
naiveMdl = trainAModel("naive")
pickle.dump(naiveMdl, open('models/naiveMdl.sav', 'wb'))
%%time
svcMdl = trainAModel("SVC")
pickle.dump(svcMdl, open('models/svcMdl.sav', 'wb'))
# Use the naive Bayes model for inference.
mdl = naiveMdl
def PredictNewsType(news):
    # Accept a single headline or a list of headlines.
    if type(news) == str:
        news = [news]
    toBePredicted = pd.DataFrame(news)[0]
    # Apply the same preprocessing pipeline used on the training data.
    toBePredicted = toBePredicted.apply(lambda x: filterNonAscii(x.lower()))
    toBePredicted = toBePredicted.apply(lambda x: remove_stopwords(x))
    toBePredicted = toBePredicted.apply(lambda x: lemmitize(x))
    features = cv.transform(toBePredicted).toarray()
    outs = mdl.predict(features)
    # predict_proba needs a model that supports it (MultinomialNB does; SVC would need probability=True).
    outs_prob = mdl.predict_proba(features)
    ans = []
    for i, out in enumerate(outs):
        ans.append((lc.inverse_transform([out])[0], np.round(max(outs_prob[i]) * 100, 2)))
    return ans
PredictNewsType(["Before going to Spain, Loh had trained in Dubai with Tokyo Olympics champion Viktor Axelsen and Sen and said the young Indian is an extremely talented player.",
"The second Test saw quite a few heated exchanges between the players and with a fit-again Kohli back for the all-important game, Elgar feels more verbal blows will be traded at the Newlands.",
"China’s Chang’e 5 lunar probe finds first on-site evidence of water on moon’s surface",
"Candidates can check the detailed schedule on the official website of MCC- mcc.nic.in. The registration process for the counselling will begin on January 12.",
"In its pre-Budget memorandum, the Hotel Association of India (HAI) said policy interventions are imperative for the sector's survival and its early and quick rebound to normalcy.",
"Oh Young Soo was nominated alongside Billy Crudup and Mark Duplass in “The Morning Show,” Kieran Culkin in “Succession,” and Brett Goldstein in “Ted Lasso.”",
])
PredictNewsType("Classes will be online since june")
# Sanity check: which category does label id 2 map to?
lc.inverse_transform([2])[0]
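# Everything needed for inference (the label encoder, count vectorizer, and
# trained models) was pickled above, so a separate script could reload those
# artifacts instead of rerunning the notebook. A minimal sketch, assuming the
# preprocessing helpers (filterNonAscii, remove_stopwords, lemmitize) are
# importable by that script; predict_category is a hypothetical name.
import pickle
import numpy as np

lc = pickle.load(open('helpers/LabelEncoder.pickel', 'rb'))
cv = pickle.load(open('helpers/countVectorizer.pickel', 'rb'))
mdl = pickle.load(open('models/naiveMdl.sav', 'rb'))

def predict_category(text):
    # Same preprocessing chain as PredictNewsType above.
    cleaned = lemmitize(remove_stopwords(filterNonAscii(text.lower())))
    features = cv.transform([cleaned]).toarray()
    label = mdl.predict(features)[0]
    prob = np.round(max(mdl.predict_proba(features)[0]) * 100, 2)
    return lc.inverse_transform([label])[0], prob

# e.g. predict_category("Classes will be online since June")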