
# Importing necessary libraries
import requests
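import time  # used below for simple client-side throttling of API calls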
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import finnhub
import sqlite3
from datetime import datetime, timedelta, timezone
from tqdm import tqdm
tqdm.pandas()
### DATE SETTINGS ###
# Function to convert date string to Unix timestamp
def unix_timestamp_from_date(date_str, date_format="%Y-%m-%d"):
    dt = datetime.strptime(date_str, date_format)
    unix_timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
    return int(unix_timestamp)
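# Example: unix_timestamp_from_date('2023-09-01') -> 1693526400 (midnight UTC)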
# Get the current date and time
current_datetime = datetime.now()
# Pin the analysis to a predefined date for reproducibility; uncomment the next
# line to use today's date instead
# current_date = current_datetime.strftime("%Y-%m-%d")
current_date = '2023-09-01'
# Calculate the date 30 days before the pinned current date (deriving it from
# the pinned date keeps from_date and to_date consistent)
from_datetime = datetime.strptime(current_date, "%Y-%m-%d") - timedelta(days=30)
# Format the from_date
from_date_str = from_datetime.strftime("%Y-%m-%d")
# Convert to Unix timestamps
from_date = unix_timestamp_from_date(from_date_str)
to_date = unix_timestamp_from_date(current_date)
### FINNHUB API SETTINGS ###
# API key setup (replace 'YOUR_API_KEY' with your actual key)
api_key = 'YOUR_API_KEY'
# DJIA Tickers and Companies
symbols = ['AAPL', 'MSFT', 'JNJ', 'PG', 'V', 'RTX', 'UNH', 'VZ', 'CSCO', 'KO',
           'DOW', 'TRV', 'JPM', 'INTC', 'WBA', 'CVX', 'CAT', 'MMM', 'GS',
           'NKE', 'HD', 'MRK', 'DIS', 'IBM', 'MCD', 'BA', 'AMGN', 'CRM', 'XOM', 'PFE']
companies = ['Apple', 'Microsoft', 'Johnson & Johnson', 'Procter & Gamble',
             'Visa', 'Raytheon', 'UnitedHealth', 'Verizon', 'Cisco', 'Coca-Cola',
             'Dow Chemical', 'Travelers', 'JPMorgan Chase', 'Intel', 'Walgreens',
             'Chevron', 'Caterpillar', '3M', 'Goldman Sachs', 'Nike', 'Home Depot',
             'Merck', 'Disney', 'IBM', 'McDonalds', 'Boeing', 'Amgen', 'Salesforce',
             'Exxon Mobil', 'Pfizer']
# Initialize an empty list to store news headlines as DataFrames
df_list = []
# Loop through each DJIA company
for symbol, company in zip(symbols, companies):
    # API request to get news headlines
    url = f'https://finnhub.io/api/v1/company-news?symbol={symbol}&from={current_date}&to={current_date}&token={api_key}'
    response = requests.get(url)
    # Error checking for API response
    if response.status_code != 200:
        print(f"Failed to get data for {symbol}")
        continue
    # Extract news headlines
    news_data = response.json()
    # Skip symbols with no headlines for the day; an empty response would
    # otherwise raise a KeyError on the 'datetime' column below
    if not news_data:
        print(f"No news for {symbol} on {current_date}")
        continue
    news_df = pd.DataFrame(news_data)
    # Convert Unix timestamps to human-readable datetime
    news_df['datetime'] = pd.to_datetime(news_df['datetime'], unit='s')
    # Add 'company' and 'symbol' columns
    news_df['company'] = company
    news_df['symbol'] = symbol
    # Keep only the columns we need
    # news_df = news_df[['company', 'symbol', 'datetime', 'headline']]
    # Add to list of DataFrames
    df_list.append(news_df)
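    # A light client-side throttle; this assumes Finnhub's free tier allows
    # roughly 60 calls per minute, so tune or drop it to match your plan
    time.sleep(1)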
# Concatenate all the collected DataFrames
df_headlines = pd.concat(df_list, ignore_index=True)
# Create new column 'analysis' by concatenating 'headline' and 'summary'
# (fillna guards against missing summaries, which would otherwise yield NaN)
df_headlines['analysis'] = df_headlines['headline'].fillna('') + ' ' + df_headlines['summary'].fillna('')
### SENTIMENT ANALYSIS SETTINGS ###
# Initialize finBERT model and tokenizer
# For finBERT, we used the model identifier from Hugging Face
model_name = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Function to compute sentiment
def compute_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)
    # Label order varies by model, so read it from the config instead of
    # hardcoding a negative/neutral/positive assumption
    return model.config.id2label[int(probs.argmax())]
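# Quick illustrative check (the exact output depends on the downloaded weights):
# print(compute_sentiment("Company beats quarterly earnings expectations"))  # e.g. 'positive'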
# Reminder: df_headlines is our DataFrame with news headlines
# Add a new column for sentiment
df_headlines['sentiment'] = df_headlines['analysis'].progress_apply(compute_sentiment)
print(f"Data for {current_date} has been fetched and inserted into the dataframe.")
display(df_headlines)
# Extract the date part of the already-converted 'datetime' column
df_headlines['date'] = df_headlines['datetime'].dt.date
# Group by 'company' and 'date', then count sentiment occurrences
grouped_df = df_headlines.groupby(['company', 'date', 'sentiment']).size().reset_index(name='count')
# Collect one majority-sentiment row per (company, date); DataFrame.append was
# removed in pandas 2.0, so rows are gathered in a list and built at the end
rows = []
# Loop through unique companies and dates
for (company, date), group in grouped_df.groupby(['company', 'date']):
    total_count = group['count'].sum()
    # A sentiment wins only with a strict majority (> 50%); otherwise default
    # to "neutral"
    sentiment_score = "neutral"
    for idx, row in group.iterrows():
        if row['count'] / total_count > 0.5:
            sentiment_score = row['sentiment']
            break
    rows.append({'company': company, 'date': date, 'sentiment_score': sentiment_score})
result_df = pd.DataFrame(rows)
display(result_df)
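# A more compact alternative to the loop above (a sketch, equivalent under the
# same strict-majority rule):
def majority_sentiment(s):
    # Share of each sentiment within one (company, date) group, sorted descending
    shares = s.value_counts(normalize=True)
    return shares.index[0] if shares.iloc[0] > 0.5 else 'neutral'
# result_df_alt = (df_headlines.groupby(['company', 'date'])['sentiment']
#                  .agg(majority_sentiment).reset_index(name='sentiment_score'))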
import finnhub
import os
import time
from datetime import datetime, timezone
from datetime import date
from zoneinfo import ZoneInfo
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
# Function to convert date string to Unix timestamp
def unix_timestamp_from_date(date_str, date_format="%Y-%m-%d"):
    dt = datetime.strptime(date_str, date_format)
    unix_timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
    return int(unix_timestamp)
start_date = '2023-08-01'
#current_date = '2023-09-01'
# Convert to Unix timestamps
from_date = unix_timestamp_from_date(start_date)
to_date = unix_timestamp_from_date(current_date)
# DJIA Tickers and Companies
symbols = ['AAPL', 'MSFT', 'JNJ', 'PG', 'V', 'RTX', 'UNH', 'VZ', 'CSCO', 'KO',
           'DOW', 'TRV', 'JPM', 'INTC', 'WBA', 'CVX', 'CAT', 'MMM', 'GS',
           'NKE', 'HD', 'MRK', 'DIS', 'IBM', 'MCD', 'BA', 'AMGN', 'CRM', 'XOM', 'PFE']
companies = ['Apple', 'Microsoft', 'Johnson & Johnson', 'Procter & Gamble',
             'Visa', 'Raytheon', 'UnitedHealth', 'Verizon', 'Cisco', 'Coca-Cola',
             'Dow Chemical', 'Travelers', 'JPMorgan Chase', 'Intel', 'Walgreens',
             'Chevron', 'Caterpillar', '3M', 'Goldman Sachs', 'Nike', 'Home Depot',
             'Merck', 'Disney', 'IBM', 'McDonalds', 'Boeing', 'Amgen', 'Salesforce',
             'Exxon Mobil', 'Pfizer']
# Set up client
finnhub_client = finnhub.Client(api_key='YOUR_API_KEY')  # replace with your actual key
df_list = []
resolution = 'D'
# Request daily candles for each company
for symbol, company in zip(symbols, companies):
    res = finnhub_client.stock_candles(symbol, resolution, from_date, to_date)
    # Skip symbols with no candle data; an empty payload would otherwise raise
    # a KeyError on the 't' column below
    if res.get('s') != 'ok':
        print(f"No candle data for {symbol}")
        continue
    # Ditch the status code
    res.pop('s')
    stock_data = res
    stock_df = pd.DataFrame(stock_data)
    # Convert Unix timestamps to human-readable datetime
    stock_df['t'] = pd.to_datetime(stock_df['t'], unit='s')
    # Add 'company' and 'symbol' columns
    stock_df['company'] = company
    stock_df['symbol'] = symbol
    # Rename the close-price and timestamp columns
    stock_df['prices'] = stock_df.pop('c')
    stock_df['dates'] = stock_df.pop('t')
    # Keep only the columns we need
    stock_df = stock_df[['company', 'symbol', 'dates', 'prices']]
    # Add to list of DataFrames
    df_list.append(stock_df)
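    time.sleep(1)  # same rate-limit assumption as in the news loop above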
df_stock_data = pd.concat(df_list, ignore_index=True)
display(df_stock_data)
# Sort by ticker and date first so returns are computed over consecutive days
df_stock_data.sort_values(['symbol', 'dates'], inplace=True)
df_stock_data['Daily Return'] = df_stock_data.groupby('symbol')['prices'].pct_change()
# Define the short-term and long-term periods for EMA
short_term = 12
long_term = 26
# Calculate the short-term and long-term exponential moving averages (EMAs).
# Note: the EMAs here are taken over daily returns; the textbook MACD uses
# closing prices, so this is a returns-based variant of the indicator.
df_stock_data['ShortEMA'] = df_stock_data.groupby('symbol')['Daily Return'].transform(lambda x: x.ewm(span=short_term).mean())
df_stock_data['LongEMA'] = df_stock_data.groupby('symbol')['Daily Return'].transform(lambda x: x.ewm(span=long_term).mean())
# Calculate the MACD line (the difference between short-term and long-term EMAs)
df_stock_data['MACD'] = df_stock_data['ShortEMA'] - df_stock_data['LongEMA']
# Define the signal line period
signal_period = 9
# Calculate the signal line (9-day EMA of the MACD)
df_stock_data['SignalLine'] = df_stock_data.groupby('symbol')['MACD'].transform(lambda x: x.ewm(span=signal_period).mean())
# Calculate the MACD histogram (the difference between MACD and Signal Line)
df_stock_data['MACD_Histogram'] = df_stock_data['MACD'] - df_stock_data['SignalLine']
# Day-over-day change in the MACD histogram; .diff() is used instead of
# pct_change() so the sign stays meaningful when the histogram crosses zero
df_stock_data['Delta Histogram'] = df_stock_data.groupby('symbol')['MACD_Histogram'].diff()
def get_stock_recommendation(row):
    # NaN deltas (the first row for each symbol) fail the comparison and fall
    # through to "Sell"
    if row['Delta Histogram'] > 0.00:
        return "Buy"
    else:
        return "Sell"
df_stock_data['stock_rec'] = df_stock_data.apply(get_stock_recommendation, axis=1)
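# An equivalent vectorized form (NaN deltas also map to 'Sell', matching the
# apply-based version above):
# df_stock_data['stock_rec'] = np.where(df_stock_data['Delta Histogram'] > 0, 'Buy', 'Sell')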
# Sort by date and ticker
#df_stock_data = df_stock_data.sort_values(by=['symbol', 'dates'])
# Drop duplicates keeping last occurrence
df_stock_data = df_stock_data.drop_duplicates(subset='company', keep='last')
display(df_stock_data)
# Extract the date part of 'dates' (the column is already datetime64)
df_stock_data['dates'] = df_stock_data['dates'].dt.date
display(df_stock_data)
# .copy() avoids a SettingWithCopyWarning when adding the key column
clean_stock_df = df_stock_data[['company', 'symbol', 'dates', 'stock_rec']].copy()
clean_stock_df['primary_key'] = clean_stock_df['company'] + '_' + clean_stock_df['dates'].astype(str)
display(clean_stock_df)
clean_result_df = result_df[['company', 'date', 'sentiment_score']].copy()
clean_result_df['primary_key'] = clean_result_df['company'] + '_' + clean_result_df['date'].astype(str)
display(clean_result_df)
merged_df = pd.merge(clean_stock_df, clean_result_df, how='inner', on='primary_key')
merged_df = merged_df[['primary_key', 'symbol', 'date', 'sentiment_score', 'stock_rec']]
display(merged_df)
def total_rec(row):
    if row['stock_rec'] == 'Buy' and row['sentiment_score'] == 'positive':
        return "Strong Buy"
    elif row['stock_rec'] == 'Buy' and row['sentiment_score'] == 'neutral':
        return "Buy"
    elif row['stock_rec'] == 'Buy' and row['sentiment_score'] == 'negative':
        return "Hold"
    elif row['stock_rec'] == 'Sell' and row['sentiment_score'] == 'positive':
        return "Hold"
    elif row['stock_rec'] == 'Sell' and row['sentiment_score'] == 'neutral':
        return "Sell"
    else:
        return "Strong Sell"
merged_df['total_rec'] = merged_df.apply(total_rec, axis=1)
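# Equivalent lookup-table form (a sketch; same mapping as total_rec above):
# rec_map = {('Buy', 'positive'): 'Strong Buy', ('Buy', 'neutral'): 'Buy',
#            ('Buy', 'negative'): 'Hold', ('Sell', 'positive'): 'Hold',
#            ('Sell', 'neutral'): 'Sell', ('Sell', 'negative'): 'Strong Sell'}
# merged_df['total_rec'] = [rec_map[(s, r)] for s, r in
#                           zip(merged_df['stock_rec'], merged_df['sentiment_score'])]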
display(merged_df)
# Append DataFrame to existing CSV (header=False assumes the file already
# exists with a matching header row)
merged_df.to_csv('/datasets/ucfgdrive/COP_DJIA_Dataset.csv', mode='a', header=False, index=False)
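# sqlite3 is imported above but never used; as one alternative sink, the merged
# recommendations can be appended to a local SQLite table. A minimal sketch
# (the file name 'cop_djia.db' and table name 'recommendations' are assumptions):
conn = sqlite3.connect('cop_djia.db')
merged_df.assign(date=merged_df['date'].astype(str)).to_sql(
    'recommendations', conn, if_exists='append', index=False)
conn.close()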