!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install emoji
!pip install git+https://github.com/JustAnotherArchivist/snscrape.git
import os
import re
import random
import time
import datetime
import torch
import argparse
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter
from nltk.tokenize import TweetTokenizer
from emoji import demojize
from sklearn.metrics import classification_report
from scipy.special import softmax
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
Trainer, TrainingArguments)
from torch.utils.data import (TensorDataset, DataLoader,
RandomSampler, SequentialSampler, Dataset)
###############################################################################################################
########################################## Language model fine-tune ###########################################
###############################################################################################################
# --- PARAMS ---
LR = 1e-5
EPOCHS = 3
BATCH_SIZE = 128
MODEL = 'vinai/bertweet-base'
MAX_TRAINING_EXAMPLES = -1
# --- DATA ---
files = """test_labels.txt
test_text.txt
train_labels.txt
train_text.txt
val_labels.txt
val_text.txt""".splitlines()
for f in files:
    p = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/{f}"
    !wget $p
dataset_dict = {}
for i in ['train', 'val', 'test']:
    dataset_dict[i] = {}
    for j in ['text', 'labels']:
        dataset_dict[i][j] = open(f"{i}_{j}.txt").read().splitlines()
        if j == 'labels':
            dataset_dict[i][j] = [int(x) for x in dataset_dict[i][j]]
if MAX_TRAINING_EXAMPLES > 0:
    dataset_dict['train']['text'] = dataset_dict['train']['text'][:MAX_TRAINING_EXAMPLES]
    dataset_dict['train']['labels'] = dataset_dict['train']['labels'][:MAX_TRAINING_EXAMPLES]
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, normalization=True)
train_encodings = tokenizer(dataset_dict['train']['text'], truncation=True, padding=True)
val_encodings = tokenizer(dataset_dict['val']['text'], truncation=True, padding=True)
test_encodings = tokenizer(dataset_dict['test']['text'], truncation=True, padding=True)
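# Optional sanity check (not part of the original pipeline): decode the first encoded training
# example to see how the BERTweet tokenizer (with normalization=True) rewrites a raw tweet.
# Assumes the TweetEval files downloaded above are present and non-empty.
print(dataset_dict['train']['text'][0])
print(tokenizer.decode(train_encodings['input_ids'][0]))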
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = MyDataset(train_encodings, dataset_dict['train']['labels'])
val_dataset = MyDataset(val_encodings, dataset_dict['val']['labels'])
test_dataset = MyDataset(test_encodings, dataset_dict['test']['labels'])
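# Optional sanity check: one item from the wrapped dataset should be a dict of tensors with
# 'input_ids', 'attention_mask' (plus 'token_type_ids' for some models) and 'labels', which is
# the format Trainer expects to collate into batches.
sample_item = train_dataset[0]
print({k: tuple(v.shape) for k, v in sample_item.items()})
print("train/val/test sizes:", len(train_dataset), len(val_dataset), len(test_dataset))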
# --- FINE-TUNE ---
training_args = TrainingArguments(
    output_dir='./results',                  # output directory
    num_train_epochs=EPOCHS,                 # total number of training epochs
    learning_rate=LR,                        # learning rate defined in PARAMS above
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
    warmup_steps=100,                        # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # strength of weight decay
    logging_dir='./logs',                    # directory for storing logs
    logging_steps=10,                        # when to print log
    evaluation_strategy='epoch',             # evaluate every epoch so the best model can be tracked
    save_strategy='epoch',                   # save every epoch; must match evaluation_strategy
    load_best_model_at_end=True,             # reload the best checkpoint when training finishes
)
num_labels = len(set(dataset_dict["train"]["labels"]))
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset      # evaluation dataset
)
trainer.train()
trainer.save_model("./results/best_model") # save best model
# --- EVALUATE ---
test_preds_raw, test_labels, _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_raw, axis=-1)
print(classification_report(test_labels, test_preds, digits=3))
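# Minimal inference sketch (illustrative, not part of the original notebook): reload the
# checkpoint saved above and turn logits into probabilities with scipy's softmax, which is why
# it is imported. The TweetEval sentiment label order is assumed to be 0=negative, 1=neutral,
# 2=positive; predict_sentiment is a hypothetical helper name.
def predict_sentiment(texts, model_dir="./results/best_model"):
    tok = AutoTokenizer.from_pretrained(MODEL, use_fast=True, normalization=True)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir)
    enc = tok(texts, truncation=True, padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = mdl(**enc).logits
    return softmax(logits.numpy(), axis=-1)  # shape: (n_texts, num_labels)

print(predict_sentiment(["loving this project", "worst token launch ever"]))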
###############################################################################################################
############################################## Tweet preprocess ###############################################
###############################################################################################################
def preprocess(corpus):
    outcorpus = []
    for text in corpus:
        new_text = []
        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = '#hashtag' if t.startswith('#') and len(t) > 1 else t
            t = '$cashtag' if t.startswith('$') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            t = t.replace("’", "'").replace("…", "...")
            # Contractions and informal spellings
            t = t.replace("cannot ", "can not ").replace("n't ", " n't ").replace("n 't ", " n't ").replace("ca n't", "can't").replace("ai n't", "ain't")
            t = t.replace("'m ", " 'm ").replace("'re ", " 're ").replace("'s ", " 's ").replace("'ll ", " 'll ").replace("'d ", " 'd ").replace("'ve ", " 've ")
            t = t.replace(" p . m .", " p.m.").replace(" p . m ", " p.m ").replace(" a . m .", " a.m.").replace(" a . m ", " a.m ")
            # Special characters
            t = re.sub(r"\x89Û_", "", t)
            t = re.sub(r"\x89ÛÒ", "", t)
            t = re.sub(r"\x89ÛÓ", "", t)
            t = re.sub(r"\x89ÛÏWhen", "When", t)
            t = re.sub(r"\x89ÛÏ", "", t)
            t = re.sub(r"let\x89Ûªs", "let's", t)
            t = re.sub(r"\x89Û÷", "", t)
            t = re.sub(r"\x89Ûª", "", t)
            t = re.sub(r"\x89Û\x9d", "", t)
            t = re.sub(r"å_", "", t)
            t = re.sub(r"\x89Û¢", "", t)
            t = re.sub(r"\x89Û¢åÊ", "", t)
            t = re.sub(r"åÊ", "", t)
            t = re.sub(r"åÈ", "", t)
            t = re.sub(r"Ì©", "e", t)
            t = re.sub(r"å¨", "", t)
            t = re.sub(r"åÇ", "", t)
            t = re.sub(r"åÀ", "", t)
            # Other contractions
            t = re.sub(r"won't", "will not", t)
            t = re.sub(r"I'M", "I am", t)
            t = re.sub(r"i'm", "I am", t)
            t = re.sub(r"I\x89Ûªm", "I am", t)
            t = re.sub(r"I'm", "I am", t)
            t = re.sub(r"y'all", "you all", t)
            t = re.sub(r"Let's", "Let us", t)
            t = re.sub(r"Ain't", "am not", t)
            # Character entity references
            t = re.sub(r"&gt;", ">", t)
            t = re.sub(r"&lt;", "<", t)
            t = re.sub(r"&amp;", "&", t)
            # Typos, slang and informal abbreviations
            t = re.sub(r"w/e", "whatever", t)
            t = re.sub(r"w/", "with", t)
            t = re.sub(r"USAgov", "USA government", t)
            t = re.sub(r"recentlu", "recently", t)
            t = re.sub(r"amirite", "am I right", t)
            t = re.sub(r"exp0sed", "exposed", t)
            t = re.sub(r"<3", "love", t)
            t = re.sub(r"lmao", "laughing my ass off", t)
            t = re.sub(r"airdrop", "free distribution", t)
t = re.sub(r"lmao", "laughing my ass off", t)
t = re.sub(r"ATH", "all time high", t)
t = re.sub(r"ath", "all time high", t)
t = re.sub(r"arbitrage", "profiting of price difference on different markets", t)
t = re.sub(r"bag holder", "long time loyal investor", t)
t = re.sub(r"bull", "continued price increase", t)
t = re.sub(r"bear", "continued price decrease", t)
t = re.sub(r"DAO", "decentralised autonomous organisation", t)
t = re.sub(r"dao", "decentralised autonomous organisation", t)
t = re.sub(r"bear", "continued price decrease", t)
t = re.sub(r"FUD", "fear, uncertainty, doubt", t)
t = re.sub(r"fud", "fear, uncertainty, doubt", t)
t = re.sub(r"fudding", "spreading fear, uncertainty, doubt", t)
t = re.sub(r"FOMO", "fear of missing out", t)
t = re.sub(r"fomo", "fear of missing out", t)
t = re.sub(r"gas", "commission for transaction verification", t)
t = re.sub(r"fomo", "fear of missing out", t)
t = re.sub(r"HODL", "to be a loyal investor", t)
t = re.sub(r"hodl", "to be a loyal investor", t)
t = re.sub(r"hodling", "being a loyal investor", t)
t = re.sub(r"IDO", "initial distributed exchange offering", t)
t = re.sub(r"DEX", "distributed exchange", t)
t = re.sub(r"dex", "distributed exchange", t)
t = re.sub(r"moon", "limitless price rise", t)
t = re.sub(r"rekt", "completely destroyed", t)
t = re.sub(r"shitcoin", "garbage", t)
t = re.sub(r"rug pull", "scam", t)
# Words with punctuations and special characters
punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
for p in punctuations:
t = t.replace(p, f' {p} ')
t = re.sub(r"([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2", t)
t = re.sub(r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2", t)
t = re.sub(r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2", t)
new_text.append(t)
new_text = " ".join(new_text)
outcorpus.append(new_text)
return outcorpus
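# Quick illustration (made-up tweet) of what preprocess() does before tokenization: handles,
# hashtags, cashtags and URLs are masked, crypto slang is expanded, and punctuation is spaced out.
print(preprocess(["@whale just bought more $FEI, ATH incoming #DeFi https://t.co/xyz"]))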
###############################################################################################################
############################################## Twitter feed prep ##############################################
###############################################################################################################
# --- FEI ---
# List to append tweet data to
tweets_list_fei = []
# Scrape data and append tweets to list
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('$FEI OR @feiprotocol since:2021-01-01 lang:en -from:defisniper -from:DYORCryptoBot -from:FuturesTracker -from:Crypto3OT -from:BoxerXrp').get_items()):
    tweets_list_fei.append([tweet.date, tweet.content])
# A dataframe from the list of tweets
tweets_df_fei = pd.DataFrame(tweets_list_fei, columns=['Datetime', 'Text'])
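# Sketch of tying the pieces together (illustrative only): clean the scraped FEI tweets with
# preprocess() and score a small sample with the predict_sentiment helper sketched in the
# evaluation section, using the same assumed label order (negative / neutral / positive).
if not tweets_df_fei.empty:
    sample_df = tweets_df_fei.head(16).copy()
    sample_df['Clean'] = preprocess(sample_df['Text'].tolist())
    probs = predict_sentiment(sample_df['Clean'].tolist())
    sample_df['Negative'] = probs[:, 0]
    sample_df['Neutral'] = probs[:, 1]
    sample_df['Positive'] = probs[:, 2]
    print(sample_df[['Datetime', 'Negative', 'Neutral', 'Positive']])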