import pandas as pd
#Simple Calculation
#Count
count_1 = df['var1'].count() #Count of one column (example: cases)
count_2 = df[['var1','var2']].count() #Count of more than one columns (example: cases, deaths)
count_all = df.count() #Count all varialbes in the table
#mean
mean_value = df['var1'].mean()
# mean = round(df['var1'].mean(),2) #If decimal places are needed
#standand deviation
std_value = df['var1'].std()
print('Descriptive statistics of cases')
print('Count:',count_1)
print('Mean',mean_value )
print('Stand Deviation',std_value)
Descriptive statistics
import pandas as pd
#Descriptive Statistics of one varialbe
descriptive_stats_1 = df['var1'].describe()
#Descriptive Statistics of more than one varialbes
descriptive_stats_2 = df[['var1','deaths']].describe()
#Descriptive Statistics of all numbercal varialbes in a dataframe
descriptive_stats_all = df.describe()
Extract a subset by columns and rows
subset_1 = df[['var1','var2','var3']] #Subset by column name(s)
subset_2 = df[df["var1"] > 100] #Select rows by value (example: value =100)
Data aggregation
#Collapse
#Collapse data by one varialbe, one aggregation method
df_1 = df.groupby(['var1'], dropna=True).sum().reset_index()
#Collapse data by one varialbe, by more than one aggregation methods
df_2 = df.groupby(['var1']).agg({'var2':['sum'], 'var3':['mean']}).reset_index() #var2, var3 need to be numeric data
Get Started
Google Colab Installation
Step 1: Go to your Google Workspace Marketplace and search for "colab"
Step 2: Locate "Colaboratory" and click add to drive
Step 3: Access Colab as using a regular Google doc
Basic Layout in Google Colab
Data Import
Import CSV & Excel files from local directory
Import csv files from local directory
pandas tutorial: pandas.read_csv
#Uplocad csv file from your local directory
from google.colab import files
uploaded = files.upload()
import pandas as pd
df = pd.read_csv('FILENAME.csv') #The filename of the uplpaded csv file
#df.shape
df.head()
#Uplocad Excel file from your local directory
from google.colab import files
uploaded = files.upload()
import pandas as pd
df = pd.read_excel('FILENAME.xlsx') #The filename of the uplpaded csv file
#df.shape
df.head()
Import data from URL
#Example: NYT Github COVID-19 data
#https://raw.githubusercontent.com/nytimes/covid-19-data/master/live/us-counties.csv
url = 'The URL of data'
df = pd.read_csv(url)
#df.shape
df.head()
Google Drive Import
Read data by Google Sheets Name
#gspread setup
!pip install --upgrade gspread
#Authenticate access to your Google Drive
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())
import pandas as pd
worksheet = gc.open('Google Sheets NAME').sheet1
rows = worksheet.get_all_values() # get_all_values gives a list of rows.
df = pd.DataFrame.from_records(rows) # Convert to a DataFrame and render.
df.head()
Read data by Google Sheets ID
#gspread setup
!pip install --upgrade gspread
#Authenticate access to your Google Drive
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())
worksheet = gc.open_by_key('Google Sheets ID').worksheet('NAME OF A SHEET TAB') #Call by Sheet ID & Name
rows = worksheet.get_all_values() # get_all_values gives a list of rows.
df = pd.DataFrame.from_records(rows[1:], columns=rows[0]) # Convert to a DataFrame and render. 1st Row as Headers
df.head()
Read CSV file from Google Drive - by sharing link
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth) #Copy and paste Google Authentication code
link = 'SHARING LINK' #The sharing link of the data file stored on your Google Drive
id = link.split("/")[-2]
#print(id)
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('covid_county.csv')
df = pd.read_csv('covid_county.csv')
df.head()
Read CSV file from Google Drive - by mounting Google Drive
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/PATH TO THE FILE'
df = pd.read_csv(path)
df.head()
Visualization
Exemplar data: COVID-19 case and death data (date: 202012006)
from google.colab import files
df.to_csv('FILENAME.csv')
files.download('FILENAME.csv') #Download winder pops out
Export data to CSV file to Google Drive
from google.colab import drive
drive.mount('/content/drive') #Copy and paste Google Authentication code
df.to_csv('/content/drive/PATH TO Google Drive FOLDER')
Export plot image to local directory
#Export bar chart
bar_image = bar_2.get_figure().savefig('bar_image.png')
from google.colab import files
files.download('bar_image.png') #Download window pops out
#Export line chart
line_image = line_1.get_figure().savefig('line_image.png')
from google.colab import files
files.download('line_image.png') #Download window pops out
#Export scatter plot chart
scat_image = scatter.get_figure().savefig('scatter_image.png')
from google.colab import files
files.download('scatter_image.png') #Download window pops out
Example: COVID-19 Num. of Cases 2020-12-06 (United States on State Level)
Example: COVI19 Num. of Cases 2020-12-06 (United States on State Level)
Example: COVID-19 Num. of Cases & Deaths 2020-12-06 (United States State Level)