!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install emoji
!pip install git+https://github.com/JustAnotherArchivist/snscrape.git
import os
import re
import random
import time
import datetime
import torch
import argparse
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter
from nltk.tokenize import TweetTokenizer
from emoji import demojize
from sklearn.metrics import classification_report
from scipy.special import softmax
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
Trainer, TrainingArguments)
from torch.utils.data import (TensorDataset, DataLoader,
RandomSampler, SequentialSampler, Dataset)
###############################################################################################################
########################################## Language model fine-tune ###########################################
###############################################################################################################
# --- PARAMS ---
LR = 1e-5
EPOCHS = 3
BATCH_SIZE = 128
MODEL = 'vinai/bertweet-base'
MAX_TRAINING_EXAMPLES = -1
# --- DATA ---
files = """test_labels.txt
test_text.txt
train_labels.txt
train_text.txt
val_labels.txt
val_text.txt""".splitlines()
for f in files:
    p = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/{f}"
    !wget $p
dataset_dict = {}
for i in ['train', 'val', 'test']:
    dataset_dict[i] = {}
    for j in ['text', 'labels']:
        dataset_dict[i][j] = open(f"{i}_{j}.txt").read().splitlines()
        if j == 'labels':
            dataset_dict[i][j] = [int(x) for x in dataset_dict[i][j]]
if MAX_TRAINING_EXAMPLES > 0:
    dataset_dict['train']['text'] = dataset_dict['train']['text'][:MAX_TRAINING_EXAMPLES]
    dataset_dict['train']['labels'] = dataset_dict['train']['labels'][:MAX_TRAINING_EXAMPLES]
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, normalization=True)
train_encodings = tokenizer(dataset_dict['train']['text'], truncation=True, padding=True)
val_encodings = tokenizer(dataset_dict['val']['text'], truncation=True, padding=True)
test_encodings = tokenizer(dataset_dict['test']['text'], truncation=True, padding=True)
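# Optional sanity check (not part of the original pipeline): decode the first encoded training
# example to see how the BERTweet tokenizer (with normalization=True) rewrites a raw tweet.
# Assumes the TweetEval files downloaded above are present and non-empty.
print(dataset_dict['train']['text'][0])
print(tokenizer.decode(train_encodings['input_ids'][0]))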
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = MyDataset(train_encodings, dataset_dict['train']['labels'])
val_dataset = MyDataset(val_encodings, dataset_dict['val']['labels'])
test_dataset = MyDataset(test_encodings, dataset_dict['test']['labels'])
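# Optional sanity check: one item from the wrapped dataset should be a dict of tensors with
# 'input_ids', 'attention_mask' (plus 'token_type_ids' for some models) and 'labels', which is
# the format Trainer expects to collate into batches.
sample_item = train_dataset[0]
print({k: tuple(v.shape) for k, v in sample_item.items()})
print("train/val/test sizes:", len(train_dataset), len(val_dataset), len(test_dataset))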
# --- FINE-TUNE ---
training_args = TrainingArguments(
    output_dir='./results',                  # output directory
    num_train_epochs=EPOCHS,                 # total number of training epochs
    learning_rate=LR,                        # learning rate defined in PARAMS above
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
    warmup_steps=100,                        # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # strength of weight decay
    logging_dir='./logs',                    # directory for storing logs
    logging_steps=10,                        # when to print log
    evaluation_strategy='epoch',             # evaluate every epoch so the best model can be tracked
    save_strategy='epoch',                   # save every epoch; must match evaluation_strategy
    load_best_model_at_end=True,             # reload the best checkpoint when training finishes
)
num_labels = len(set(dataset_dict["train"]["labels"]))
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset      # evaluation dataset
)
trainer.train()
trainer.save_model("./results/best_model") # save best model
# --- EVALUATE ---
test_preds_raw, test_labels, _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_raw, axis=-1)
print(classification_report(test_labels, test_preds, digits=3))
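# Minimal inference sketch (illustrative, not part of the original notebook): reload the
# checkpoint saved above and turn logits into probabilities with scipy's softmax, which is why
# it is imported. The TweetEval sentiment label order is assumed to be 0=negative, 1=neutral,
# 2=positive; predict_sentiment is a hypothetical helper name.
def predict_sentiment(texts, model_dir="./results/best_model"):
    tok = AutoTokenizer.from_pretrained(MODEL, use_fast=True, normalization=True)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir)
    enc = tok(texts, truncation=True, padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = mdl(**enc).logits
    return softmax(logits.numpy(), axis=-1)  # shape: (n_texts, num_labels)

print(predict_sentiment(["loving this project", "worst token launch ever"]))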
###############################################################################################################
############################################## Tweet preprocess ###############################################
###############################################################################################################
def preprocess(corpus):
    outcorpus = []
    for text in corpus:
        new_text = []
        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = '#hashtag' if t.startswith('#') and len(t) > 1 else t
            t = '$cashtag' if t.startswith('$') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            t = t.replace("’", "'").replace("…", "...")
            # Contractions and informal spellings
            t = t.replace("cannot ", "can not ").replace("n't ", " n't ").replace("n 't ", " n't ").replace("ca n't", "can't").replace("ai n't", "ain't")
            t = t.replace("'m ", " 'm ").replace("'re ", " 're ").replace("'s ", " 's ").replace("'ll ", " 'll ").replace("'d ", " 'd ").replace("'ve ", " 've ")
            t = t.replace(" p . m .", " p.m.").replace(" p . m ", " p.m ").replace(" a . m .", " a.m.").replace(" a . m ", " a.m ")
            # Special characters
            t = re.sub(r"\x89Û_", "", t)
            t = re.sub(r"\x89ÛÒ", "", t)
            t = re.sub(r"\x89ÛÓ", "", t)
            t = re.sub(r"\x89ÛÏWhen", "When", t)
            t = re.sub(r"\x89ÛÏ", "", t)
            t = re.sub(r"let\x89Ûªs", "let's", t)
            t = re.sub(r"\x89Û÷", "", t)
            t = re.sub(r"\x89Ûª", "", t)
            t = re.sub(r"\x89Û\x9d", "", t)
            t = re.sub(r"å_", "", t)
            t = re.sub(r"\x89Û¢", "", t)
            t = re.sub(r"\x89Û¢åÊ", "", t)
            t = re.sub(r"åÊ", "", t)
            t = re.sub(r"åÈ", "", t)
            t = re.sub(r"Ì©", "e", t)
            t = re.sub(r"å¨", "", t)
            t = re.sub(r"åÇ", "", t)
            t = re.sub(r"åÀ", "", t)
            # Other contractions
            t = re.sub(r"won't", "will not", t)
            t = re.sub(r"I'M", "I am", t)
            t = re.sub(r"i'm", "I am", t)
            t = re.sub(r"I\x89Ûªm", "I am", t)
            t = re.sub(r"I'm", "I am", t)
            t = re.sub(r"y'all", "you all", t)
            t = re.sub(r"Let's", "Let us", t)
            t = re.sub(r"Ain't", "am not", t)
            # Character entity references
            t = re.sub(r"&gt;", ">", t)
            t = re.sub(r"&lt;", "<", t)
            t = re.sub(r"&amp;", "&", t)
            # Typos, slang and informal abbreviations
            t = re.sub(r"w/e", "whatever", t)
            t = re.sub(r"w/", "with", t)
            t = re.sub(r"USAgov", "USA government", t)
            t = re.sub(r"recentlu", "recently", t)
            t = re.sub(r"amirite", "am I right", t)
            t = re.sub(r"exp0sed", "exposed", t)
            t = re.sub(r"<3", "love", t)
            t = re.sub(r"lmao", "laughing my ass off", t)
            t = re.sub(r"airdrop", "free distribution", t)
t = re.sub(r"lmao", "laughing my ass off", t)
t = re.sub(r"ATH", "all time high", t)
t = re.sub(r"ath", "all time high", t)
t = re.sub(r"arbitrage", "profiting of price difference on different markets", t)
t = re.sub(r"bag holder", "long time loyal investor", t)
t = re.sub(r"bull", "continued price increase", t)
t = re.sub(r"bear", "continued price decrease", t)
t = re.sub(r"DAO", "decentralised autonomous organisation", t)
t = re.sub(r"dao", "decentralised autonomous organisation", t)
t = re.sub(r"bear", "continued price decrease", t)
t = re.sub(r"FUD", "fear, uncertainty, doubt", t)
t = re.sub(r"fud", "fear, uncertainty, doubt", t)
t = re.sub(r"fudding", "spreading fear, uncertainty, doubt", t)
t = re.sub(r"FOMO", "fear of missing out", t)
t = re.sub(r"fomo", "fear of missing out", t)
t = re.sub(r"gas", "commission for transaction verification", t)
t = re.sub(r"fomo", "fear of missing out", t)
t = re.sub(r"HODL", "to be a loyal investor", t)
t = re.sub(r"hodl", "to be a loyal investor", t)
t = re.sub(r"hodling", "being a loyal investor", t)
t = re.sub(r"IDO", "initial distributed exchange offering", t)
t = re.sub(r"DEX", "distributed exchange", t)
t = re.sub(r"dex", "distributed exchange", t)
t = re.sub(r"moon", "limitless price rise", t)
t = re.sub(r"rekt", "completely destroyed", t)
t = re.sub(r"shitcoin", "garbage", t)
t = re.sub(r"rug pull", "scam", t)
# Words with punctuations and special characters
punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
for p in punctuations:
t = t.replace(p, f' {p} ')
t = re.sub(r"([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2", t)
t = re.sub(r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2", t)
t = re.sub(r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2", t)
new_text.append(t)
new_text = " ".join(new_text)
outcorpus.append(new_text)
return outcorpus
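# Quick illustration (made-up tweet) of what preprocess() does before tokenization: handles,
# hashtags, cashtags and URLs are masked, crypto slang is expanded, and punctuation is spaced out.
print(preprocess(["@whale just bought more $FEI, ATH incoming #DeFi https://t.co/xyz"]))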
###############################################################################################################
############################################## Twitter feed prep ##############################################
###############################################################################################################
# --- FEI ---
# List to append tweet data to
tweets_list_fei = []
# Scrape data and append tweets to list
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('$FEI OR @feiprotocol since:2021-01-01 lang:en -from:defisniper -from:DYORCryptoBot -from:FuturesTracker -from:Crypto3OT -from:BoxerXrp').get_items()):
    tweets_list_fei.append([tweet.date, tweet.content])
# A dataframe from the list of tweets
tweets_df_fei = pd.DataFrame(tweets_list_fei, columns=['Datetime', 'Text'])
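# Sketch of tying the pieces together (illustrative only): clean the scraped FEI tweets with
# preprocess() and score a small sample with the predict_sentiment helper sketched in the
# evaluation section, using the same assumed label order (negative / neutral / positive).
if not tweets_df_fei.empty:
    sample_df = tweets_df_fei.head(16).copy()
    sample_df['Clean'] = preprocess(sample_df['Text'].tolist())
    probs = predict_sentiment(sample_df['Clean'].tolist())
    sample_df['Negative'] = probs[:, 0]
    sample_df['Neutral'] = probs[:, 1]
    sample_df['Positive'] = probs[:, 2]
    print(sample_df[['Datetime', 'Negative', 'Neutral', 'Positive']])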