import pandas as pd
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import nltk
import pymorphy2
from string import punctuation as PUNCTUATION
# Make sure the NLTK 'stopwords' corpus is available locally; it is read
# right below to build STOPWORDS.
nltk.download('stopwords')
# Output:
# [nltk_data] Downloading package stopwords to /root/nltk_data...
# [nltk_data] Package stopwords is already up-to-date!
# Russian stop-word list taken from the NLTK corpus.
STOPWORDS = nltk.corpus.stopwords.words('russian')

# Tick locators for date axes: one tick per month and one per day.
MONTHS, DAYS = mdates.MonthLocator(), mdates.DayLocator()

# Show floats with three decimal places in pandas output.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Global seaborn theme plus matplotlib rc overrides.
sns.set(
    style='ticks',
    palette='colorblind',
    rc={
        'figure.figsize': (16, 9),
        'figure.dpi': 80,
        'axes.grid': True,
        'axes.grid.axis': 'x',
        'axes.grid.which': 'both',
        'grid.alpha': .4,
        'xtick.minor.visible': True,
    },
)
# Directory holding the exported channel CSV files and team.csv.
PATH = '/work/final_project/data/slack'

# team.csv is collapsed to a Series; rows whose real_name appears in it are
# dropped below. `read_csv(squeeze=True)` was removed in pandas 2.0, so the
# frame is squeezed explicitly instead.
TEAM = pd.read_csv(f'{PATH}/team.csv').squeeze('columns')
FILES = os.listdir(PATH)

# Collect the per-file frames and concatenate once: `DataFrame.append` was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
frames = []
for file in FILES:
    # Only files whose name contains 'da_20' are channel exports.
    if 'da_20' not in file:
        continue
    temp = pd.read_csv(f'{PATH}/{file}')
    # Treat 'ё' and 'е' as the same letter in every text column.
    for col in temp.select_dtypes('object'):
        temp[col] = temp[col].str.replace('ё', 'е', regex=False)
    # Channel name = file name without the 'da_20_' prefix and without the
    # trailing digits/underscores + '.csv' (raw string, literal dot).
    temp['channel'] = re.sub(r'(?:da_20_)|(?:[_\d]*\.csv$)', '', file)
    temp = temp.query("real_name not in @TEAM")
    frames.append(temp)

# `pd.concat` refuses an empty list, so keep the original empty-frame result
# when no file matched.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.info(memory_usage='deep')
# Output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 2154 entries, 0 to 2153
# Data columns (total 7 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 thread_ts 1998 non-null float64
# 1 ts 2154 non-null float64
# 2 user 2154 non-null object
# 3 text 2144 non-null object
# 4 reactions 946 non-null object
# 5 real_name 2154 non-null object
# 6 channel 2154 non-null object
# dtypes: float64(2), object(5)
# memory usage: 1.9 MB
# Lower-case every text column.
for col in df.select_dtypes('object'):
    df[col] = df[col].str.lower()

# Give every distinct timestamp (message or thread root) a compact integer id.
# `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
index = pd.concat([df['ts'], df['thread_ts'].dropna()]).unique()
temp = pd.Series(index=index, data=range(len(index)))

for col in ('thread_ts', 'ts'):
    # 'thread_ts' -> 'thread_id', 'ts' -> 'id'. `map` is a direct lookup in
    # `temp`; a missing thread_ts stays NaN and is filled further down.
    df[re.sub('ts', 'id', col)] = df[col].map(temp)
    df[col] = pd.to_datetime(df[col], unit='s')

df = df.sort_values('ts', ascending=False)

# Notebook inspection: share of missing values and the rows without text.
df.isna().mean()
df[df['text'].isna()]

# Shift every datetime column forward by three hours
# (presumably UTC -> local time of the course; confirm).
# The 'H' unit alias is deprecated in recent pandas, hence the keyword form.
for col in df.select_dtypes('datetime64'):
    df[col] = df[col] + pd.Timedelta(hours=3)

# Messages without text get a placeholder token.
df['text'] = df['text'].fillna('attached_file')

# A message with no thread reference is treated as its own thread:
# thread_ts <- ts, thread_id <- id.
# NOTE(review): 'thread_ts' holds datetimes at this point, so the int cast
# below replaces them with integer epoch values — confirm this is intended.
for col in ('thread_ts', 'thread_id'):
    df[col] = df[col].fillna(df[re.sub('thread_', '', col)]).astype('int')
# Names that contain Latin letters (everything was lower-cased earlier).
temp = df.loc[df['real_name'].str.contains('[a-z]'), 'real_name'].unique()
temp

# Drop a Latin-script part that is separated from the rest of the name by a
# slash, either trailing ("<name> / latin") or leading ("latin / <name>").
# Raw string: '\s' in a plain literal is an invalid escape sequence.
# Compiled once instead of on every iteration.
latin_alias = re.compile(r'(?:\s*/[a-z\s]+$)|(?:^[a-z\s]+/\s*)')
for name in temp:
    df.loc[df['real_name'] == name, 'real_name'] = latin_alias.sub('', name)

# Names that still contain Latin letters after the cleanup.
temp = df.loc[df['real_name'].str.contains('[a-z]'), 'real_name'].unique()
temp

# Exclude three specific accounts and renumber the rows.
df = df.query("real_name not in ['nikita bergman', 'ulad-khan', 'глеб михайлов']").reset_index(drop=True)
def _busiest_channel(channels):
    """Return '<channel> - <count>' for the most frequent value in *channels*."""
    counts = channels.value_counts()
    return f'{counts.index[0]} - {counts.iloc[0]}'


# Ten authors with the most messages, together with their busiest channel.
temp = (
    df
    .groupby('real_name')
    .agg({'text': 'count', 'channel': _busiest_channel})
    .sort_values('text', ascending=False)
    .head(10)
)
temp

sns.barplot(data=temp, x='text', y=temp.index)
plt.title('Количество сообщений по студентам')
plt.xlabel('')
plt.ylabel('')
plt.show()
# A message opens a thread when its own id equals its thread id.
is_thread_root = df['id'] == df['thread_id']

# Ten authors who opened the most threads.
temp = df.loc[is_thread_root, 'real_name'].value_counts().head(10)
temp

sns.barplot(x=temp, y=temp.index)
plt.title('Количество созданных тредов по студентам')
plt.xlabel('')
plt.ylabel('')
plt.show()

# How many of them are also among the ten most active authors overall.
top_by_messages = df['real_name'].value_counts().head(10)
len(temp.to_frame().join(top_by_messages, rsuffix='_all', how='inner'))
# True for the message that opens a thread.
df['create_thread'] = df['id'] == df['thread_id']

# Per-day number of messages and of opened threads.
# `astype('datetime64[D]')` is rejected by pandas >= 2.0, so the timestamps
# are truncated to the day with `dt.floor` instead.
temp = (
    df
    .assign(ts=df['ts'].dt.floor('D'))
    .groupby('ts')
    .agg({'id': 'nunique', 'create_thread': 'sum'})
)

ax = sns.lineplot(
    data=temp,
    y='id',
    x=temp.index,
    label='Количество сообщений',
)
sns.lineplot(
    data=temp,
    y='create_thread',
    x=temp.index,
    label='Количество тредов',
    ax=ax,
)

# Week buckets. The truncation is done in numpy because pandas >= 2.0 no
# longer accepts `astype('datetime64[W]')`. numpy weeks are counted from the
# epoch (a Thursday), so adding four days lands on a Monday.
WEEKS = pd.Series(
    df['ts'].to_numpy().astype('datetime64[W]').astype('datetime64[ns]'),
    index=df.index,
) + pd.Timedelta(4, 'D')

# Mark every second week; only the first marker carries a legend label.
# A plain loop replaces a list comprehension that was used for side effects.
for position, week in enumerate(WEEKS.unique()):
    if position % 2 == 0:
        continue
    ax.axvline(
        week,
        alpha=.5,
        ls='--',
        label='Начало спринта' if position == 1 else '',
    )

# Shade two one-week periods; only the first span carries a legend label.
for position, start in enumerate(['2021-03-08', '2021-05-17']):
    start = pd.to_datetime(start)
    ax.axvspan(
        start,
        start + pd.Timedelta(7, 'D'),
        alpha=.3,
        label='Каникулы' if position == 0 else '',
    )

ax.axhline(0, c='black', alpha=.5)
ax.xaxis.set_major_locator(MONTHS)
ax.xaxis.set_minor_locator(DAYS)
plt.title('Распределение сообщений и тредов по времени')
plt.xlabel('')
plt.ylabel('')
plt.legend()
plt.show()
# Number of distinct threads in each channel, largest first.
threads_per_channel = df.groupby('channel')['thread_id'].nunique(dropna=False)
threads_per_channel.sort_values(ascending=False)

# Number of distinct messages in every (channel, thread) pair.
messages_per_thread = df.groupby(['channel', 'thread_id'])['id'].nunique()
temp = messages_per_thread.reset_index()

sns.histplot(data=temp, x='id', hue='channel', multiple='stack')
plt.xlim(0, 25)
plt.show()

# The largest threads, then a look at one specific thread id.
temp.sort_values('id', ascending=False).head()
df[df['thread_id'] == 2191].head()
# Hour of day of every message.
df['hour'] = df['ts'].dt.hour

# Messages that are not thread openers, counted per hour.
replies = df[~df['create_thread']]
temp = replies.groupby('hour')['thread_id'].count()
temp

sns.lineplot(x=temp.index, y=temp)
plt.title('Распределение количества сообщений в тредах во времени')
plt.xlabel('Час')
plt.ylabel('')
plt.show()

# Ten thread ids with the most messages.
df['thread_id'].value_counts().head(10)
# 'attached_file' is the placeholder written into empty messages earlier in
# this script; treat it as a stop word so it is removed by clean_text.
STOPWORDS.append('attached_file')
# Angle-bracketed markup such as "<@u123>" or "<https://…|label>"
# (presumably Slack mentions and links). Non-greedy by construction, so text
# between two tags is kept.
_MARKUP_RE = re.compile(r'<[^<>]+>')
# Token delimiters: whitespace and : . ( ) , " ; / - ? [ ] !
_TOKEN_SPLIT_RE = re.compile(r'[\s:.(),";/\-?\[\]!]+')


def clean_text(text, stopwords=None):
    """Return *text* lower-cased, without markup, punctuation and stop words.

    Processing order:
      1. lower-case the text;
      2. remove every ``<...>`` fragment, one at a time;
      3. split on whitespace and common punctuation;
      4. drop tokens made only of punctuation characters and tokens found
         in *stopwords*.

    Args:
        text: message text (``str``).
        stopwords: container of words to drop; defaults to the module-level
            ``STOPWORDS``.

    Returns:
        The remaining tokens joined by single spaces; ``''`` if none remain.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # Markup is removed before tokenizing: splitting first would break a tag
    # into several tokens.
    text = _MARKUP_RE.sub(' ', text.lower())
    words = [
        word
        for word in _TOKEN_SPLIT_RE.split(text)
        # `strip` leaves '' for empty and punctuation-only tokens.
        if word.strip(PUNCTUATION) and word not in stopwords
    ]
    return ' '.join(words)
# Store the cleaned version of every message in a new column.
df['clean_text'] = df['text'].apply(clean_text)
df['cnt_words'] = df