import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd
import pytz
from google.cloud import storage
from google.oauth2.service_account import Credentials
# Record the run time once; derive a display string and a path-safe string from it
now = datetime.now(pytz.timezone('Asia/Ho_Chi_Minh'))
print(f"Run at: {now.strftime('%Y-%m-%d %H:%M:%S')}")
# Parameters
# Path-safe timestamp (no spaces or colons) for the GCS object name
ts = now.strftime('%Y%m%d-%H%M%S')
elyra_path = 'analytics-training-samples/02-reproducibility/elyra-assignment'
pipeline_filename = f'{elyra_path}/starter.pipeline'
output_filename = f'{elyra_path}/data/02_primary/cleaned_stories.csv'
gs_bucket = 'cleaned-data-deepnote'
gs_filename = f'cleaned_stories/{ts}/data-stories.csv'
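# Optional pre-flight check (a sketch, not part of the original flow): fail
# fast if the assignment directory or pipeline file is missing before the
# %%bash cells below run.
assert os.path.isdir(elyra_path), f'Missing pipeline directory: {elyra_path}'
assert os.path.isfile(pipeline_filename), f'Missing pipeline file: {pipeline_filename}'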
# Export variables so the %%bash cells below can read them
os.environ['ELYRA_PATH'] = elyra_path
os.environ['PIPELINE_FILENAME'] = pipeline_filename
%%bash
# Verify the assignment directory is present before running the pipeline
ls ${ELYRA_PATH}
%%bash
# Execute the Elyra pipeline; it writes the cleaned CSV under data/02_primary
elyra-pipeline run ${PIPELINE_FILENAME}
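# Minimal post-run check (an assumption about the pipeline, not an Elyra API):
# the run is expected to have written output_filename; fail early if it did not.
assert os.path.isfile(output_filename), f'Pipeline did not produce {output_filename}'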
# Load the pipeline output and visualize missingness on a random 100-row sample
df = pd.read_csv(output_filename)
msno.matrix(df.sample(100))
plt.title(f'Missing matrix: {ts}', size=16);
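# Complementary numeric view (a sketch): per-column missing counts to pair
# with the visual matrix above, computed on the full DataFrame rather than
# the 100-row sample.
print(df.isna().sum().sort_values(ascending=False))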
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the given GCS bucket."""
    # Reuse the DEMO_ANALYTICS_HUB_SERVICE_ACCOUNT that Deepnote already
    # provides for BigQuery access to build GCS credentials.
    # json.loads is safer than eval for parsing the service-account JSON.
    credentials = Credentials.from_service_account_info(
        json.loads(os.environ['DEMO_ANALYTICS_HUB_SERVICE_ACCOUNT'])
    )
    storage_client = storage.Client(credentials=credentials)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f'Uploaded {source_file_name} to gs://{bucket_name}/{destination_blob_name}')
upload_blob(gs_bucket, output_filename, gs_filename)
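# Optional upload verification (a sketch, reusing the same service-account
# credentials as upload_blob): confirm the object is now visible in the bucket.
credentials = Credentials.from_service_account_info(
    json.loads(os.environ['DEMO_ANALYTICS_HUB_SERVICE_ACCOUNT'])
)
client = storage.Client(credentials=credentials)
assert client.bucket(gs_bucket).blob(gs_filename).exists(), 'Uploaded blob not found in GCS'
print(f'Verified gs://{gs_bucket}/{gs_filename}')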