# Start writing code here
# Install the dependencies listed in requirements.txt, then the YouTube Data API client.
# `-r` is needed for pip to read a requirements file (without it the file name is
# treated as a package name); %pip installs into the environment of the running kernel.
%pip install -r requirements.txt
%pip install youtube-data-api
WARNING: The directory '/home/jovyan/.cache/pip' or its parent directory is not owned or is not writable by the current user. The cache has been disabled. Check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.
Collecting youtube-data-api
Downloading youtube_data_api-0.0.20-py3-none-any.whl (12 kB)
Requirement already satisfied: pandas in /opt/venv/lib/python3.7/site-packages (from youtube-data-api) (1.0.5)
Requirement already satisfied: requests in /opt/venv/lib/python3.7/site-packages (from youtube-data-api) (2.25.0)
Requirement already satisfied: numpy>=1.13.3 in /opt/venv/lib/python3.7/site-packages (from pandas->youtube-data-api) (1.19.4)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas->youtube-data-api) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /opt/venv/lib/python3.7/site-packages (from pandas->youtube-data-api) (2020.4)
Requirement already satisfied: chardet<4,>=3.0.2 in /opt/venv/lib/python3.7/site-packages (from requests->youtube-data-api) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /opt/venv/lib/python3.7/site-packages (from requests->youtube-data-api) (2020.11.8)
Requirement already satisfied: idna<3,>=2.5 in /opt/venv/lib/python3.7/site-packages (from requests->youtube-data-api) (2.10)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/venv/lib/python3.7/site-packages (from requests->youtube-data-api) (1.26.2)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas->youtube-data-api) (1.15.0)
Installing collected packages: youtube-data-api
Successfully installed youtube-data-api-0.0.20
WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
from youtube_api import YouTubeDataAPI
from youtube_api import parsers
# The YouTube Data API key is read from the environment rather than being
# committed to the notebook. A key that has appeared in source should be
# treated as leaked and rotated.
import os

api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError(
        'Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key.'
    )
yt = YouTubeDataAPI(api_key)

# Exploratory lookups: search by query string, then list one channel's playlists.
yt.search(q='что было дальше', max_results=5, order_by='relevance')
yt.get_playlists(channel_id='UCNqktdxgAADBj36dC7VGOgg')

# Collect metadata for the playlist: one get_video_metadata call per playlist item.
all_videos = yt.get_videos_from_playlist_id('PLmkbS48df313zBeQtxckns8Nq4IyWhV1P')
video_data = [yt.get_video_metadata(item['video_id']) for item in all_videos]
video_data

import pandas as pd

# Build the working DataFrame from the collected metadata records; later cells
# read its 'video_title' and 'video_description' columns.
df = pd.DataFrame(video_data)
df
from natasha import (
Segmenter,
MorphVocab,
NewsEmbedding,
NewsMorphTagger,
NewsSyntaxParser,
NewsNERTagger,
PER,
NamesExtractor,
Doc
)
# Natasha components, created once at module level and reused by the
# extraction helpers defined later in this file.
segmenter = Segmenter()  # passed to doc.segment(...) in return_names / return_orgs
morph_vocab = MorphVocab()
emb = NewsEmbedding()  # a single embedding instance is shared by the three News* components below
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)  # passed to doc.tag_ner(...) in return_names / return_orgs
# NOTE(review): morph_tagger, syntax_parser and names_extractor are not
# referenced anywhere in the visible part of this file.
names_extractor = NamesExtractor(morph_vocab)
def return_names(data):
    """Return the person names found in *data* as a stringified list.

    The text is segmented and NER-tagged with the module-level ``segmenter``
    and ``ner_tagger``. Every span whose type is ``"PER"`` is printed and its
    text collected; the result is ``str()`` of the collected list.
    """
    parsed = Doc(data)
    parsed.segment(segmenter)
    parsed.tag_ner(ner_tagger)

    people = []
    for span in parsed.spans:
        # Skip everything that is not tagged as a person.
        if span.type != "PER":
            continue
        print(span)
        people.append(span.text)
    return str(people)
# Add a PERSON column: for each video title, the stringified list of person
# names returned by return_names (which also prints every matched span).
df['PERSON']=df['video_title'].apply(return_names)
DocSpan(stop=17, type='PER', text='Дмитрий Губерниев', tokens=[...])
DocSpan(start=20, stop=33, type='PER', text='Настя Ивлеева', tokens=[...])
DocSpan(start=36, stop=47, type='PER', text='Олег Майами', tokens=[...])
DocSpan(start=50, stop=73, type='PER', text='Гарик Харламов х Джиган', tokens=[...])
DocSpan(stop=15, type='PER', text='Павел Деревянко', tokens=[...])
DocSpan(start=18, stop=31, type='PER', text='Денис Дорохов', tokens=[...])
DocSpan(start=21, stop=37, type='PER', text='Отар Кушанашвили', tokens=[...])
DocSpan(stop=15, type='PER', text='Никита Джигурда', tokens=[...])
DocSpan(start=18, stop=31, type='PER', text='Павел Дедищев', tokens=[...])
DocSpan(stop=14, type='PER', text='Богдан Титомир', tokens=[...])
DocSpan(start=17, stop=30, type='PER', text='Василий Уткин', tokens=[...])
DocSpan(start=18, stop=36, type='PER', text='Братья Пономаренко', tokens=[...])
DocSpan(stop=15, type='PER', text='Алексей Смирнов', tokens=[...])
DocSpan(start=18, stop=36, type='PER', text='Михаил Шуфутинский', tokens=[...])
DocSpan(stop=14, type='PER', text='Доминик Джокер', tokens=[...])
DocSpan(start=17, stop=32, type='PER', text='Виталий Милонов', tokens=[...])
DocSpan(stop=12, type='PER', text='Антон Шастун', tokens=[...])
DocSpan(stop=6, type='PER', text='Джиган', tokens=[...])
DocSpan(start=9, stop=20, type='PER', text='Ваня Усович', tokens=[...])
DocSpan(stop=22, type='PER', text='Настя Ивлеева x Тимати', tokens=[...])
DocSpan(start=10, stop=20, type='PER', text='Митя Фомин', tokens=[...])
DocSpan(stop=9, type='PER', text='Юрий Дудь', tokens=[...])
DocSpan(start=12, stop=18, type='PER', text='Лолита', tokens=[...])
DocSpan(stop=6, type='PER', text='Мигель', tokens=[...])
DocSpan(start=9, stop=24, type='PER', text='Сарик Андреасян', tokens=[...])
DocSpan(start=27, stop=40, type='PER', text='Артур Чапарян', tokens=[...])
DocSpan(start=7, stop=13, type='PER', text='Азамат', tokens=[...])
DocSpan(start=16, stop=33, type='PER', text='Дмитрий Губерниев', tokens=[...])
DocSpan(start=54, stop=61, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=63, stop=71, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=73, stop=82, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=91, stop=97, type='PER', text='Детков', tokens=[...])
DocSpan(start=14, stop=29, type='PER', text='Артемий Лебедев', tokens=[...])
DocSpan(start=50, stop=57, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=59, stop=67, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=69, stop=78, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=87, stop=93, type='PER', text='Детков', tokens=[...])
DocSpan(stop=13, type='PER', text='Вадим Галыгин', tokens=[...])
DocSpan(start=16, stop=22, type='PER', text='Джиган', tokens=[...])
DocSpan(start=43, stop=50, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=52, stop=60, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=62, stop=71, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=80, stop=86, type='PER', text='Детков', tokens=[...])
DocSpan(stop=12, type='PER', text='Анатолий Цой', tokens=[...])
DocSpan(start=43, stop=50, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=52, stop=60, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=62, stop=71, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=80, stop=86, type='PER', text='Детков', tokens=[...])
DocSpan(stop=14, type='PER', text='Александр Шпак', tokens=[...])
DocSpan(start=17, stop=30, type='PER', text='Сергей Минаев', tokens=[...])
DocSpan(start=51, stop=58, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=60, stop=68, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=70, stop=79, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=88, stop=94, type='PER', text='Детков', tokens=[...])
DocSpan(stop=12, type='PER', text='Илья Соболев', tokens=[...])
DocSpan(start=44, stop=51, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=53, stop=61, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=63, stop=72, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=81, stop=87, type='PER', text='Детков', tokens=[...])
DocSpan(stop=14, type='PER', text='Стас Костюшкин', tokens=[...])
DocSpan(start=17, stop=31, type='PER', text='Семён Слепаков', tokens=[...])
DocSpan(start=52, stop=59, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=61, stop=69, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=71, stop=80, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=89, stop=95, type='PER', text='Детков', tokens=[...])
DocSpan(stop=16, type='PER', text='Гарик Мартиросян', tokens=[...])
DocSpan(start=19, stop=30, type='PER', text='Олег Майами', tokens=[...])
DocSpan(start=51, stop=58, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=60, stop=68, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=70, stop=79, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=88, stop=94, type='PER', text='Детков', tokens=[...])
DocSpan(start=7, stop=13, type='PER', text='Азамат', tokens=[...])
DocSpan(start=52, stop=59, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=61, stop=69, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=71, stop=78, type='PER', text='Чапарян', tokens=[...])
DocSpan(start=87, stop=96, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=13, stop=23, type='PER', text='Михаил Шац', tokens=[...])
DocSpan(start=44, stop=51, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=53, stop=61, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=63, stop=70, type='PER', text='Чапарян', tokens=[...])
DocSpan(start=79, stop=88, type='PER', text='Рептилоид', tokens=[...])
DocSpan(stop=17, type='PER', text='Тимур Батрутдинов', tokens=[...])
DocSpan(start=20, stop=34, type='PER', text='Тимур Каргинов', tokens=[...])
DocSpan(start=55, stop=62, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=64, stop=72, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=74, stop=81, type='PER', text='Чапарян', tokens=[...])
DocSpan(start=90, stop=99, type='PER', text='Рептилоид', tokens=[...])
DocSpan(stop=24, type='PER', text='Николай Соболев x Тарзан', tokens=[...])
DocSpan(start=45, stop=52, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=54, stop=62, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=64, stop=71, type='PER', text='Чапарян', tokens=[...])
DocSpan(start=80, stop=89, type='PER', text='Рептилоид', tokens=[...])
DocSpan(start=13, stop=27, type='PER', text='Иосиф Пригожин', tokens=[...])
DocSpan(start=48, stop=55, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=57, stop=65, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=67, stop=74, type='PER', text='Чапарян', tokens=[...])
DocSpan(start=83, stop=92, type='PER', text='Рептилоид', tokens=[...])
DocSpan(stop=5, type='PER', text='Баста', tokens=[...])
DocSpan(start=8, stop=22, type='PER', text='Прохор Шаляпин', tokens=[...])
DocSpan(start=43, stop=50, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=52, stop=60, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=62, stop=69, type='PER', text='Чапарян', tokens=[...])
DocSpan(start=78, stop=87, type='PER', text='Рептилоид', tokens=[...])
DocSpan(stop=23, type='PER', text='Гарик Харламов x Мигель', tokens=[...])
DocSpan(start=44, stop=51, type='PER', text='Сабуров', tokens=[...])
DocSpan(start=53, stop=61, type='PER', text='Щербаков', tokens=[...])
DocSpan(start=63, stop=70, type='PER', text='Чапарян', tokens=[...])
DocSpan(start=79, stop=88, type='PER', text='Рептилоид', tokens=[...])
df
def return_orgs(data):
    """Return the organisation names found in *data* as a stringified list.

    The text is segmented and NER-tagged with the module-level ``segmenter``
    and ``ner_tagger``. Every span whose type is ``"ORG"`` is printed and its
    text collected; the result is ``str()`` of the collected list.
    """
    parsed = Doc(data)
    parsed.segment(segmenter)
    parsed.tag_ner(ner_tagger)

    organisations = []
    for span in parsed.spans:
        # Skip everything that is not tagged as an organisation.
        if span.type != "ORG":
            continue
        print(span)
        organisations.append(span.text)
    return str(organisations)
# Add an ORGANISATION column: for each video description, the stringified list
# of organisation names returned by return_orgs (which also prints every matched span).
df['ORGANISATION']=df['video_description'].apply(return_orgs)
DocSpan(start=318, stop=321, type='ORG', text='МТС', tokens=[...])
DocSpan(start=324, stop=332, type='ORG', text='НЕтарифу', tokens=[...])
DocSpan(start=147, stop=154, type='ORG', text='КАССЕТА', tokens=[...])
DocSpan(start=155, stop=157, type='ORG', text='YT', tokens=[...])
DocSpan(start=301, stop=305, type='ORG', text='EDIT', tokens=[...])
DocSpan(start=33, stop=41, type='ORG', text='Skillbox', tokens=[...])
DocSpan(start=27, stop=30, type='ORG', text='МТС', tokens=[...])
DocSpan(start=67, stop=82, type='ORG', text='SPOTIFY Premium', tokens=[...])
DocSpan(start=16, stop=25, type='ORG', text='Old Spice', tokens=[...])
DocSpan(start=114, stop=124, type='ORG', text='Mastercard', tokens=[...])
DocSpan(start=55, stop=69, type='ORG', text='Head&Shoulders', tokens=[...])
DocSpan(start=111, stop=119, type='ORG', text='МегаФона', tokens=[...])
DocSpan(start=467, stop=474, type='ORG', text='HammAli', tokens=[...])
DocSpan(start=139, stop=148, type='ORG', text='КАЗУСКОМА', tokens=[...])
DocSpan(start=149, stop=151, type='ORG', text='YT', tokens=[...])
DocSpan(start=90, stop=97, type='ORG', text='Абсолют', tokens=[...])
DocSpan(start=109, stop=124, type='ORG', text='Третье сентября', tokens=[...])
DocSpan(start=16, stop=25, type='ORG', text='Old Spice', tokens=[...])
DocSpan(start=43, stop=50, type='ORG', text='YouTube', tokens=[...])
DocSpan(stop=7, type='ORG', text='Связной', tokens=[...])
DocSpan(start=22, stop=29, type='ORG', text='Samsung', tokens=[...])
DocSpan(start=16, stop=25, type='ORG', text='Old Spice', tokens=[...])
DocSpan(start=110, stop=117, type='ORG', text='Связном', tokens=[...])
DocSpan(start=145, stop=152, type='ORG', text='Samsung', tokens=[...])
DocSpan(start=181, stop=193, type='ORG', text='Эльдорадо.ру', tokens=[...])
DocSpan(start=31, stop=38, type='ORG', text='YouTube', tokens=[...])
DocSpan(start=958, stop=974, type='ORG', text='Sinners\nПродакшн', tokens=[...])
DocSpan(start=976, stop=989, type='ORG', text='PBC Worldwide', tokens=[...])
DocSpan(start=59, stop=66, type='ORG', text='Samsung', tokens=[...])
DocSpan(start=116, stop=119, type='ORG', text='FAR', tokens=[...])
DocSpan(stop=8, type='ORG', text='LABELCOM', tokens=[...])
DocSpan(start=47, stop=57, type='ORG', text='Рокетбанка', tokens=[...])
DocSpan(start=97, stop=105, type='ORG', text='МегаФона', tokens=[...])
DocSpan(start=109, stop=117, type='ORG', text='МегаФона', tokens=[...])
DocSpan(start=50, stop=59, type='ORG', text='Aviasales', tokens=[...])
DocSpan(start=682, stop=689, type='ORG', text='КраСава', tokens=[...])
DocSpan(start=43, stop=52, type='ORG', text='Aviasales', tokens=[...])
DocSpan(start=295, stop=298, type='ORG', text='МТС', tokens=[...])
DocSpan(start=299, stop=304, type='ORG', text='Банка', tokens=[...])
DocSpan(start=915, stop=919, type='ORG', text='Ашан', tokens=[...])
DocSpan(start=68, stop=71, type='ORG', text='ВСЁ', tokens=[...])
DocSpan(start=182, stop=188, type='ORG', text='Яндекс', tokens=[...])
DocSpan(start=799, stop=805, type='ORG', text='KIXBOX', tokens=[...])
DocSpan(start=16, stop=24, type='ORG', text='Тинькофф', tokens=[...])
DocSpan(start=734, stop=740, type='ORG', text='KIXBOX', tokens=[...])
DocSpan(stop=15, type='ORG', text='Присоединяйтесь', tokens=[...])
DocSpan(start=56, stop=67, type='ORG', text='Chupa Chups', tokens=[...])
DocSpan(start=70, stop=79, type='ORG', text='Instagram', tokens=[...])
DocSpan(start=1121, stop=1127, type='ORG', text='KIXBOX', tokens=[...])
DocSpan(start=43, stop=51, type='ORG', text='LABELCOM', tokens=[...])
DocSpan(stop=8, type='ORG', text='LABELCOM', tokens=[...])
DocSpan(stop=8, type='ORG', text='LABELCOM', tokens=[...])
DocSpan(stop=8, type='ORG', text='LABELCOM', tokens=[...])
DocSpan(stop=8, type='ORG', text='LABELCOM', tokens=[...])
df
import re
# Matches the scheme and host part of an http(s) URL only; the match stops at
# the first character that is not a word character, '-', '.' or a %XX escape,
# so paths and query strings are not captured.
_LINK_PATTERN = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')

# Bare Instagram hosts are excluded from the extracted links.
_IGNORED_LINKS = frozenset((
    'https://instagram.com',
    'https://www.instagram.com',
))


def get_links(data):
    """Return the http(s) link hosts found in *data*, in order of appearance.

    Args:
        data: Text to scan. Anything that is not a ``str`` (for example a
            missing value in a DataFrame column) yields an empty list.

    Returns:
        A list of ``scheme://host`` strings, duplicates included, with the
        bare ``https://instagram.com`` and ``https://www.instagram.com``
        entries removed.
    """
    if not isinstance(data, str):
        return []
    return [
        link
        for link in _LINK_PATTERN.findall(data)
        if link not in _IGNORED_LINKS
    ]
# Add a LINKS column: for each video description, the list returned by get_links.
df['LINKS']=df['video_description'].apply(get_links)
# Bare expression: displays the DataFrame when run as the last line of a notebook cell.
df