Data Wrangling

Simple calculation using Pandas

import pandas as pd
# Simple calculations on a DataFrame named `df`.

# Count (pandas `count` tallies non-missing values only)
count_1 = df['var1'].count()   # Count of one column (example: cases)
count_2 = df[['var1', 'var2']].count()   # Count of more than one column (example: cases, deaths)
count_all = df.count()    # Count of all variables in the table

# Mean
mean_value = df['var1'].mean()
# If a fixed number of decimal places is needed:
#     mean_value = round(df['var1'].mean(), 2)

# Standard deviation
std_value = df['var1'].std()


# Report the results; every label ends with a colon so the output lines up
# consistently.
print('Descriptive statistics of cases')
print('Count:', count_1)
print('Mean:', mean_value)
print('Standard deviation:', std_value)

Descriptive statistics

import pandas as pd
# Descriptive statistics of one variable
descriptive_stats_1 = df['var1'].describe()
# Descriptive statistics of more than one variable (example: cases, deaths)
descriptive_stats_2 = df[['var1', 'var2']].describe()
# Descriptive statistics of all numerical variables in a dataframe
descriptive_stats_all = df.describe()

Extract a subset by columns and rows

# Column subset: pass a list of column names to keep only those columns.
selected_columns = ['var1', 'var2', 'var3']
subset_1 = df[selected_columns]

# Row subset: build a boolean mask and keep the rows where it is True
# (here: rows whose var1 is strictly greater than 100).
var1_above_100 = df["var1"] > 100
subset_2 = df[var1_above_100]

Data aggregation

# Collapse (aggregate) the data by group.

# One grouping variable, one aggregation method: sum the remaining columns
# within each var1 group, then turn the group key back into a regular column.
grouped_by_var1 = df.groupby(['var1'], dropna=True)
df_1 = grouped_by_var1.sum().reset_index()

# One grouping variable, a different aggregation method per column.
# var2 and var3 need to be numeric data.
aggregation_spec = {'var2': ['sum'], 'var3': ['mean']}
df_2 = df.groupby(['var1']).agg(aggregation_spec).reset_index()

Last updated