import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from google.oauth2 import service_account
from google.cloud import bigquery
bq_credentials = service_account.Credentials.from_service_account_info(json.loads(os.environ['BIGQUERY_INTEGRATION_SERVICE_ACCOUNT']))
bq_client = bigquery.Client(credentials=bq_credentials, project=bq_credentials.project_id)
query = bq_client.query('SELECT "Connection successful"')
for row in query.result():
print(row[0])
Connection successful
query = """
SELECT
Store,
StoreType
FROM
`principal-fact-279009.stores_sales.stores`
LIMIT
100
"""
query_job = bq_client.query(query)
df = query_job.to_dataframe()
df
query = """
SELECT
Date, Sales, Customers
FROM
`principal-fact-279009.stores_sales.sales`
WHERE
Store=1
"""
query_job = bq_client.query(query)
df = query_job.to_dataframe()
df
query = """
SELECT
Date, Sales, Customers
FROM
`principal-fact-279009.stores_sales.sales`
WHERE
Store=100
"""
query_job = bq_client.query(query)
df = query_job.to_dataframe()
df
sales_df = pd.read_csv("/work/train.csv", parse_dates=True)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3156: DtypeWarning: Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
sales_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Store 1017209 non-null int64
1 DayOfWeek 1017209 non-null int64
2 Date 1017209 non-null object
3 Sales 1017209 non-null int64
4 Customers 1017209 non-null int64
5 Open 1017209 non-null int64
6 Promo 1017209 non-null int64
7 StateHoliday 1017209 non-null object
8 SchoolHoliday 1017209 non-null int64
dtypes: int64(7), object(2)
memory usage: 69.8+ MB
sales_df["StateHoliday"].value_counts()
sales_df["StateHoliday"] = sales_df["StateHoliday"].astype("string")
sales_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Store 1017209 non-null int64
1 DayOfWeek 1017209 non-null int64
2 Date 1017209 non-null object
3 Sales 1017209 non-null int64
4 Customers 1017209 non-null int64
5 Open 1017209 non-null int64
6 Promo 1017209 non-null int64
7 StateHoliday 1017209 non-null string
8 SchoolHoliday 1017209 non-null int64
dtypes: int64(7), object(1), string(1)
memory usage: 69.8+ MB
sales_df.to_csv("/work/sales.csv", header=False, index=False)