# Greeting banner emitted when the notebook starts.
print("hello there this is anish jain doing some cold emailing")
# hello there this is anish jain doing some cold emailing
import pandas as pd
import numpy as np
# Small 2x3 integer matrix; the bare name echoes it when run in a notebook.
np_array = np.array(
    [
        [1, 2, 3],
        [3, 9, 4],
    ]
)
np_array
print("what is this")
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfTransformer,TfidfVectorizer
# what is this
from sklearn.metrics.pairwise import sigmoid_kernel
# Show the built-in documentation for sigmoid_kernel.
help(sigmoid_kernel)
# Help on function sigmoid_kernel in module sklearn.metrics.pairwise:
# sigmoid_kernel(X, Y=None, gamma=None, coef0=1)
# Compute the sigmoid kernel between X and Y::
# K(X, Y) = tanh(gamma <X, Y> + coef0)
# Read more in the :ref:`User Guide <sigmoid_kernel>`.
# Parameters
# ----------
# X : ndarray of shape (n_samples_X, n_features)
# Y : ndarray of shape (n_samples_Y, n_features), default=None
# gamma : float, default=None
# If None, defaults to 1.0 / n_features.
# coef0 : float, default=1
# Returns
# -------
# Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)
# Load the training set and preview its last rows.
train = pd.read_csv('Train.csv')
train.tail()

# Bag-of-words token counts over the raw text column.
text = train['Text']
cv = CountVectorizer()
cvm = cv.fit_transform(text)

# Inspect the container type and the (documents, vocabulary) dimensions.
# help(CountVectorizer) lists the available options.
type(cvm)
cvm.shape
import matplotlib.pyplot as plt
import seaborn as sns
# Distinct topic labels present in the training data.
train['Label'].unique()

# Word count per document, splitting on single spaces only.
train['sent_len'] = train['Text'].apply(lambda doc: len(doc.split(' ')))
train.head()

# Average word count of the POLITICS documents: total words / row count.
(
    sum(train[train['Label'] == 'POLITICS']['sent_len'])
    / train[train['Label'] == 'POLITICS'].shape[0]
)
# Collect the mean word count per topic, in the order labels first appear.
# bar_x holds the topic names and bar_y the matching averages.
bar_x = []
bar_y = []
for c in train.Label.unique():
    print(c)
    # Filter once per label instead of rebuilding the same mask twice.
    label_rows = train[train['Label'] == c]
    # NOTE(review): assumes Label has no missing values; a NaN label would
    # match zero rows and make this division fail.
    var = sum(label_rows['sent_len']) / label_rows.shape[0]
    bar_x.append(c)
    bar_y.append(var)
# POLITICS
# HEALTH
# LAW/ORDER
# RELIGION
# FARMING
# WILDLIFE/ENVIRONMENT
# SOCIAL ISSUES
# SOCIAL
# OPINION/ESSAY
# LOCALCHIEFS
# WITCHCRAFT
# ECONOMY
# SPORTS
# RELATIONSHIPS
# TRANSPORT
# CULTURE
# EDUCATION
# MUSIC
# ARTS AND CRAFTS
# FLOODING
# Summary tuple: number of distinct labels, frame dimensions, rows per label.
(
    train['Label'].nunique(),
    train.shape,
    train['Label'].value_counts(),
)
# Bar chart of the average word count per topic.
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=bar_x, y=bar_y)

# Slant the topic names so the long labels do not overlap.
plt.xticks(rotation=45, horizontalalignment='right',
           fontweight='light', fontsize='x-large')

# Title and axis captions, set on the axes the bars were drawn on.
ax.set_title('Word-Count', weight='bold')
ax.set_xlabel('Topics')
ax.set_ylabel('word-count')
# Peek at entry 1 of the Text column.
train['Text'][1]
# NameError: name 'train' is not defined
# Learn the TF-IDF vocabulary from the training documents. The vectorizer
# must be fitted on the text column: fitting on the label column would build
# the features from the prediction targets themselves.
tfv = TfidfVectorizer()
tfvm = tfv.fit_transform(train.Text)
tfvm.shape

# Transform the test documents with the already-fitted vectorizer so their
# columns line up with the training matrix.
test = pd.read_csv('Test.csv')
test_tfvm = tfv.transform(test.Text)
test_tfvm.shape
from sklearn.linear_model import LogisticRegression
# Train a default logistic-regression classifier on the TF-IDF matrix
# (fit returns the estimator itself), then classify the first test row.
lr = LogisticRegression().fit(tfvm, train['Label'])
lr.predict(test_tfvm[0])
# ValueError: X has 30206 features per sample; expecting 50677