import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd
import pytz
from google.cloud import storage
from google.oauth2.service_account import Credentials
# Record the run time once; derive a display string and a path-safe string from it
now = datetime.now(pytz.timezone('Asia/Ho_Chi_Minh'))
print(f"Run at: {now.strftime('%Y-%m-%d %H:%M:%S')}")
# Parameters
# Path-safe timestamp (no spaces or colons) for the GCS object name
ts = now.strftime('%Y%m%d-%H%M%S')
elyra_path = 'analytics-training-samples/02-reproducibility/elyra-assignment'
pipeline_filename = f'{elyra_path}/starter.pipeline'
output_filename = f'{elyra_path}/data/02_primary/cleaned_stories.csv'
gs_bucket = 'cleaned-data-deepnote'
gs_filename = f'cleaned_stories/{ts}/data-stories.csv'
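# Optional pre-flight check (a sketch, not part of the original flow): fail
# fast if the assignment directory or pipeline file is missing before the
# %%bash cells below run.
assert os.path.isdir(elyra_path), f'Missing pipeline directory: {elyra_path}'
assert os.path.isfile(pipeline_filename), f'Missing pipeline file: {pipeline_filename}'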
# Export variables so the %%bash cells below can read them
os.environ['ELYRA_PATH'] = elyra_path
os.environ['PIPELINE_FILENAME'] = pipeline_filename
%%bash
# Verify the assignment directory is present before running the pipeline
ls ${ELYRA_PATH}
%%bash
# Execute the Elyra pipeline; it writes the cleaned CSV under data/02_primary
elyra-pipeline run ${PIPELINE_FILENAME}
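# Minimal post-run check (an assumption about the pipeline, not an Elyra API):
# the run is expected to have written output_filename; fail early if it did not.
assert os.path.isfile(output_filename), f'Pipeline did not produce {output_filename}'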
# Load the pipeline output and visualize missingness on a random 100-row sample
df = pd.read_csv(output_filename)
msno.matrix(df.sample(100))
plt.title(f'Missing matrix: {ts}', size=16);
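# Complementary numeric view (a sketch): per-column missing counts to pair
# with the visual matrix above, computed on the full DataFrame rather than
# the 100-row sample.
print(df.isna().sum().sort_values(ascending=False))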
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the given GCS bucket."""
    # Reuse the DEMO_ANALYTICS_HUB_SERVICE_ACCOUNT that Deepnote already
    # provides for BigQuery access to build GCS credentials.
    # json.loads is safer than eval for parsing the service-account JSON.
    credentials = Credentials.from_service_account_info(
        json.loads(os.environ['DEMO_ANALYTICS_HUB_SERVICE_ACCOUNT'])
    )
    storage_client = storage.Client(credentials=credentials)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f'Uploaded {source_file_name} to gs://{bucket_name}/{destination_blob_name}')
upload_blob(gs_bucket, output_filename, gs_filename)
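# Optional upload verification (a sketch, reusing the same service-account
# credentials as upload_blob): confirm the object is now visible in the bucket.
credentials = Credentials.from_service_account_info(
    json.loads(os.environ['DEMO_ANALYTICS_HUB_SERVICE_ACCOUNT'])
)
client = storage.Client(credentials=credentials)
assert client.bucket(gs_bucket).blob(gs_filename).exists(), 'Uploaded blob not found in GCS'
print(f'Verified gs://{gs_bucket}/{gs_filename}')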