# READING AND WRITING DATA FRAMES
with open(‘/path/to/file.csv’) as f:
df = pd.read_csv(f)
# alternatively
df = pd.read_csv(‘/path/to/file.csv’, sep=';', encoding='latin-1', nrows=1000, skiprows=[1,3])
# writing data; no index takes away first index column
df.to_csv(‘/path/to/new/file.csv’, index=None)
# read and write excel
# single sheet read and write
pd.read_excel(‘file.xlsx’)
pd.to_excel(‘path/to/file.xlsx’, sheet_name=‘sheet1’)
# read multiple sheets from same file
xslx = pd.ExcelFile(‘file.xlsx’)
df = pd.read_excel(xlsx, ‘sheet1’)
# CREATING DATA OBJECTS
# series
pd.Series(['4 cups', '1 cup', '2 large', '1 can'], index=['Flour', 'Milk', 'Eggs', 'Spam'])
# data frame
data = {'Apples':[20,30], 'Bananas':[10,15]}
pd.DataFrame(data, columns=['Apples', 'Bananas'], index=['2017 Sales', '2018 Sales'])
# VIEWING DATA
# data frame shape
df.shape
# quick statistical description
df.describe()
# first and last row preview
df.head()
df.tail()
# get specified row (from index)
df.loc[8]
# get column1 of 8th row
df.loc[8, 'column1']
# get subset of rows
df.loc[range(4,8)]
# FILTERING
# filter by value in given column
df[df[‘col’] == ‘val’]
# use logical ops to further specify filters
df[(df['language']=='french') | (df['year']==1989) & ~(df['city']=='London')]
# return language column elements that take on one of the given values
df[df['language'].isin(['english','french'])]
# UPDATING DATA; first index are rows, second is column
data.loc[df['language']=='french', 'language'] = 'French'
# MEDIUM LEVEL FUNCTIONS
# counting occurrences
df['langauge'].value_counts()
# map: applies function to each column element; returns same size sequence
df['language'].map(len)
# chain maps together; eg col values -> col of value lens -> lambda func
df['langauge'].map(len).map(lambda x: x/100)
# apply function along given axis; eg sum up whole columns
df.apply(sum)
# applymap applies a map to all elements, returns same size dataframe
df.applymap(lambda x: x+2)
# get correlation relations between columns
df.corr()
# show scatter matrix across columns
pd.plotting.scatter_matrix(df, figsize=(12,8))
# ADVANCED LEVEL FUNCTIONS
# join matrices by specified columns like an SQL join
df.merge(df2, on=['column1', 'column2'], how='inner')
# groupby unique column values
df.groupby('language')
# iterating over rows
for i, row in df.iterrows():
print('Row {} has value {}'.format(i,row))