
# Importing necessary libraries
import requests
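import time  # used below for simple client-side throttling of API calls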
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import finnhub
import sqlite3
from datetime import datetime, timedelta, timezone
from tqdm import tqdm
tqdm.pandas()
### DATE SETTINGS ###
# Function to convert date string to Unix timestamp
def unix_timestamp_from_date(date_str, date_format="%Y-%m-%d"):
    dt = datetime.strptime(date_str, date_format)
    unix_timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
    return int(unix_timestamp)
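# Example: unix_timestamp_from_date('2023-09-01') -> 1693526400 (midnight UTC)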
# Get the current date and time
current_datetime = datetime.now()
# Pin the analysis to a predefined date for reproducibility; uncomment the next
# line to use today's date instead
# current_date = current_datetime.strftime("%Y-%m-%d")
current_date = '2023-09-01'
# Calculate the date 30 days before the pinned current date (deriving it from
# the pinned date keeps from_date and to_date consistent)
from_datetime = datetime.strptime(current_date, "%Y-%m-%d") - timedelta(days=30)
# Format the from_date
from_date_str = from_datetime.strftime("%Y-%m-%d")
# Convert to Unix timestamps
from_date = unix_timestamp_from_date(from_date_str)
to_date = unix_timestamp_from_date(current_date)
### FINNHUB API SETTINGS ###
# API key setup (replace 'YOUR_API_KEY' with your actual key)
api_key = 'YOUR_API_KEY'
# DJIA Tickers and Companies
symbols = ['AAPL', 'MSFT', 'JNJ', 'PG', 'V', 'RTX', 'UNH', 'VZ', 'CSCO', 'KO',
           'DOW', 'TRV', 'JPM', 'INTC', 'WBA', 'CVX', 'CAT', 'MMM', 'GS',
           'NKE', 'HD', 'MRK', 'DIS', 'IBM', 'MCD', 'BA', 'AMGN', 'CRM', 'XOM', 'PFE']
companies = ['Apple', 'Microsoft', 'Johnson & Johnson', 'Procter & Gamble',
             'Visa', 'Raytheon', 'UnitedHealth', 'Verizon', 'Cisco', 'Coca-Cola',
             'Dow Chemical', 'Travelers', 'JPMorgan Chase', 'Intel', 'Walgreens',
             'Chevron', 'Caterpillar', '3M', 'Goldman Sachs', 'Nike', 'Home Depot',
             'Merck', 'Disney', 'IBM', 'McDonalds', 'Boeing', 'Amgen', 'Salesforce',
             'Exxon Mobil', 'Pfizer']
# Initialize an empty list to store news headlines as DataFrames
df_list = []
# Loop through each DJIA company
for symbol, company in zip(symbols, companies):
    # API request to get news headlines
    url = f'https://finnhub.io/api/v1/company-news?symbol={symbol}&from={current_date}&to={current_date}&token={api_key}'
    response = requests.get(url)
    # Error checking for API response
    if response.status_code != 200:
        print(f"Failed to get data for {symbol}")
        continue
    # Extract news headlines
    news_data = response.json()
    # Skip symbols with no headlines for the day; an empty response would
    # otherwise raise a KeyError on the 'datetime' column below
    if not news_data:
        print(f"No news for {symbol} on {current_date}")
        continue
    news_df = pd.DataFrame(news_data)
    # Convert Unix timestamps to human-readable datetime
    news_df['datetime'] = pd.to_datetime(news_df['datetime'], unit='s')
    # Add 'company' and 'symbol' columns
    news_df['company'] = company
    news_df['symbol'] = symbol
    # Keep only the columns we need
    # news_df = news_df[['company', 'symbol', 'datetime', 'headline']]
    # Add to list of DataFrames
    df_list.append(news_df)
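    # A light client-side throttle; this assumes Finnhub's free tier allows
    # roughly 60 calls per minute, so tune or drop it to match your plan
    time.sleep(1)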
# Concatenate all the collected DataFrames
df_headlines = pd.concat(df_list, ignore_index=True)
# Create new column 'analysis' by concatenating 'headline' and 'summary'
# (fillna guards against missing summaries, which would otherwise yield NaN)
df_headlines['analysis'] = df_headlines['headline'].fillna('') + ' ' + df_headlines['summary'].fillna('')
### SENTIMENT ANALYSIS SETTINGS ###
# Initialize finBERT model and tokenizer
# For finBERT, we used the model identifier from Hugging Face
model_name = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Function to compute sentiment
def compute_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)
    # Label order varies by model, so read it from the config instead of
    # hardcoding a negative/neutral/positive assumption
    return model.config.id2label[int(probs.argmax())]
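# Quick illustrative check (the exact output depends on the downloaded weights):
# print(compute_sentiment("Company beats quarterly earnings expectations"))  # e.g. 'positive'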
# Reminder: df_headlines is our DataFrame with news headlines
# Add a new column for sentiment
df_headlines['sentiment'] = df_headlines['analysis'].progress_apply(compute_sentiment)
print(f"Data for {current_date} has been fetched and inserted into the dataframe.")
display(df_headlines)
# Extract the date part of the already-converted 'datetime' column
df_headlines['date'] = df_headlines['datetime'].dt.date
# Group by 'company' and 'date', then count sentiment occurrences
grouped_df = df_headlines.groupby(['company', 'date', 'sentiment']).size().reset_index(name='count')
# Collect one majority-sentiment row per (company, date); DataFrame.append was
# removed in pandas 2.0, so rows are gathered in a list and built at the end
rows = []
# Loop through unique companies and dates
for (company, date), group in grouped_df.groupby(['company', 'date']):
    total_count = group['count'].sum()
    # A sentiment wins only with a strict majority (> 50%); otherwise default
    # to "neutral"
    sentiment_score = "neutral"
    for idx, row in group.iterrows():
        if row['count'] / total_count > 0.5:
            sentiment_score = row['sentiment']
            break
    rows.append({'company': company, 'date': date, 'sentiment_score': sentiment_score})
result_df = pd.DataFrame(rows)
display(result_df)
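# A more compact alternative to the loop above (a sketch, equivalent under the
# same strict-majority rule):
def majority_sentiment(s):
    # Share of each sentiment within one (company, date) group, sorted descending
    shares = s.value_counts(normalize=True)
    return shares.index[0] if shares.iloc[0] > 0.5 else 'neutral'
# result_df_alt = (df_headlines.groupby(['company', 'date'])['sentiment']
#                  .agg(majority_sentiment).reset_index(name='sentiment_score'))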
import finnhub
import os
import time
from datetime import datetime, timezone
from datetime import date
from zoneinfo import ZoneInfo
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
# Function to convert date string to Unix timestamp
def unix_timestamp_from_date(date_str, date_format="%Y-%m-%d"):
    dt = datetime.strptime(date_str, date_format)
    unix_timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
    return int(unix_timestamp)
start_date = '2023-08-01'
#current_date = '2023-09-01'
# Convert to Unix timestamps
from_date = unix_timestamp_from_date(start_date)
to_date = unix_timestamp_from_date(current_date)
# DJIA Tickers and Companies
symbols = ['AAPL', 'MSFT', 'JNJ', 'PG', 'V', 'RTX', 'UNH', 'VZ', 'CSCO', 'KO',
           'DOW', 'TRV', 'JPM', 'INTC', 'WBA', 'CVX', 'CAT', 'MMM', 'GS',
           'NKE', 'HD', 'MRK', 'DIS', 'IBM', 'MCD', 'BA', 'AMGN', 'CRM', 'XOM', 'PFE']
companies = ['Apple', 'Microsoft', 'Johnson & Johnson', 'Procter & Gamble',
             'Visa', 'Raytheon', 'UnitedHealth', 'Verizon', 'Cisco', 'Coca-Cola',
             'Dow Chemical', 'Travelers', 'JPMorgan Chase', 'Intel', 'Walgreens',
             'Chevron', 'Caterpillar', '3M', 'Goldman Sachs', 'Nike', 'Home Depot',
             'Merck', 'Disney', 'IBM', 'McDonalds', 'Boeing', 'Amgen', 'Salesforce',
             'Exxon Mobil', 'Pfizer']
# Set up client
finnhub_client = finnhub.Client(api_key='YOUR_API_KEY')  # replace with your actual key
df_list = []
resolution = 'D'
# Request daily candles for each company
for symbol, company in zip(symbols, companies):
    res = finnhub_client.stock_candles(symbol, resolution, from_date, to_date)
    # Skip symbols with no candle data; an empty payload would otherwise raise
    # a KeyError on the 't' column below
    if res.get('s') != 'ok':
        print(f"No candle data for {symbol}")
        continue
    # Ditch the status code
    res.pop('s')
    stock_data = res
    stock_df = pd.DataFrame(stock_data)
    # Convert Unix timestamps to human-readable datetime
    stock_df['t'] = pd.to_datetime(stock_df['t'], unit='s')
    # Add 'company' and 'symbol' columns
    stock_df['company'] = company
    stock_df['symbol'] = symbol
    # Rename the close-price and timestamp columns
    stock_df['prices'] = stock_df.pop('c')
    stock_df['dates'] = stock_df.pop('t')
    # Keep only the columns we need
    stock_df = stock_df[['company', 'symbol', 'dates', 'prices']]
    # Add to list of DataFrames
    df_list.append(stock_df)
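    time.sleep(1)  # same rate-limit assumption as in the news loop above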
df_stock_data = pd.concat(df_list, ignore_index=True)
display(df_stock_data)
# Sort by ticker and date first so returns are computed over consecutive days
df_stock_data.sort_values(['symbol', 'dates'], inplace=True)
df_stock_data['Daily Return'] = df_stock_data.groupby('symbol')['prices'].pct_change()
# Define the short-term and long-term periods for EMA
short_term = 12
long_term = 26
# Calculate the short-term and long-term exponential moving averages (EMAs).
# Note: the EMAs here are taken over daily returns; the textbook MACD uses
# closing prices, so this is a returns-based variant of the indicator.
df_stock_data['ShortEMA'] = df_stock_data.groupby('symbol')['Daily Return'].transform(lambda x: x.ewm(span=short_term).mean())
df_stock_data['LongEMA'] = df_stock_data.groupby('symbol')['Daily Return'].transform(lambda x: x.ewm(span=long_term).mean())
# Calculate the MACD line (the difference between short-term and long-term EMAs)
df_stock_data['MACD'] = df_stock_data['ShortEMA'] - df_stock_data['LongEMA']
# Define the signal line period
signal_period = 9
# Calculate the signal line (9-day EMA of the MACD)
df_stock_data['SignalLine'] = df_stock_data.groupby('symbol')['MACD'].transform(lambda x: x.ewm(span=signal_period).mean())
# Calculate the MACD histogram (the difference between MACD and Signal Line)
df_stock_data['MACD_Histogram'] = df_stock_data['MACD'] - df_stock_data['SignalLine']
# Day-over-day change in the MACD histogram; .diff() is used instead of
# pct_change() so the sign stays meaningful when the histogram crosses zero
df_stock_data['Delta Histogram'] = df_stock_data.groupby('symbol')['MACD_Histogram'].diff()
def get_stock_recommendation(row):
    # NaN deltas (the first row for each symbol) fail the comparison and fall
    # through to "Sell"
    if row['Delta Histogram'] > 0.00:
        return "Buy"
    else:
        return "Sell"
df_stock_data['stock_rec'] = df_stock_data.apply(get_stock_recommendation, axis=1)
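# An equivalent vectorized form (NaN deltas also map to 'Sell', matching the
# apply-based version above):
# df_stock_data['stock_rec'] = np.where(df_stock_data['Delta Histogram'] > 0, 'Buy', 'Sell')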
# Sort by date and ticker
#df_stock_data = df_stock_data.sort_values(by=['symbol', 'dates'])
# Drop duplicates keeping last occurrence
df_stock_data = df_stock_data.drop_duplicates(subset='company', keep='last')
display(df_stock_data)
# Extract the date part of 'dates' (the column is already datetime64)
df_stock_data['dates'] = df_stock_data['dates'].dt.date
display(df_stock_data)
# .copy() avoids a SettingWithCopyWarning when adding the key column
clean_stock_df = df_stock_data[['company', 'symbol', 'dates', 'stock_rec']].copy()
clean_stock_df['primary_key'] = clean_stock_df['company'] + '_' + clean_stock_df['dates'].astype(str)
display(clean_stock_df)
clean_result_df = result_df[['company', 'date', 'sentiment_score']].copy()
clean_result_df['primary_key'] = clean_result_df['company'] + '_' + clean_result_df['date'].astype(str)
display(clean_result_df)
merged_df = pd.merge(clean_stock_df, clean_result_df, how='inner', on='primary_key')
merged_df = merged_df[['primary_key', 'symbol', 'date', 'sentiment_score', 'stock_rec']]
display(merged_df)
def total_rec(row):
    if row['stock_rec'] == 'Buy' and row['sentiment_score'] == 'positive':
        return "Strong Buy"
    elif row['stock_rec'] == 'Buy' and row['sentiment_score'] == 'neutral':
        return "Buy"
    elif row['stock_rec'] == 'Buy' and row['sentiment_score'] == 'negative':
        return "Hold"
    elif row['stock_rec'] == 'Sell' and row['sentiment_score'] == 'positive':
        return "Hold"
    elif row['stock_rec'] == 'Sell' and row['sentiment_score'] == 'neutral':
        return "Sell"
    else:
        return "Strong Sell"
merged_df['total_rec'] = merged_df.apply(total_rec, axis=1)
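# Equivalent lookup-table form (a sketch; same mapping as total_rec above):
# rec_map = {('Buy', 'positive'): 'Strong Buy', ('Buy', 'neutral'): 'Buy',
#            ('Buy', 'negative'): 'Hold', ('Sell', 'positive'): 'Hold',
#            ('Sell', 'neutral'): 'Sell', ('Sell', 'negative'): 'Strong Sell'}
# merged_df['total_rec'] = [rec_map[(s, r)] for s, r in
#                           zip(merged_df['stock_rec'], merged_df['sentiment_score'])]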
display(merged_df)
# Append DataFrame to existing CSV (header=False assumes the file already
# exists with a matching header row)
merged_df.to_csv('/datasets/ucfgdrive/COP_DJIA_Dataset.csv', mode='a', header=False, index=False)
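# sqlite3 is imported above but never used; as one alternative sink, the merged
# recommendations can be appended to a local SQLite table. A minimal sketch
# (the file name 'cop_djia.db' and table name 'recommendations' are assumptions):
conn = sqlite3.connect('cop_djia.db')
merged_df.assign(date=merged_df['date'].astype(str)).to_sql(
    'recommendations', conn, if_exists='append', index=False)
conn.close()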