Pandas

Python data management library

Basic operations

# READING AND WRITING DATA FRAMES
with open(‘/path/to/file.csv’) as f:
    df = pd.read_csv(f)
# alternatively
df = pd.read_csv(‘/path/to/file.csv’, sep=';', encoding='latin-1', nrows=1000, skiprows=[1,3])

# writing data; no index takes away first index column
df.to_csv(‘/path/to/new/file.csv’, index=None)

# read and write excel
# single sheet read and write
pd.read_excel(‘file.xlsx’)
pd.to_excel(‘path/to/file.xlsx’, sheet_name=‘sheet1’)

# read multiple sheets from same file
xslx = pd.ExcelFile(‘file.xlsx’)
df = pd.read_excel(xlsx, ‘sheet1’)

# CREATING DATA OBJECTS
# series
pd.Series(['4 cups', '1 cup', '2 large', '1 can'], index=['Flour', 'Milk', 'Eggs', 'Spam'])

# data frame
data = {'Apples':[20,30], 'Bananas':[10,15]}
pd.DataFrame(data, columns=['Apples', 'Bananas'], index=['2017 Sales', '2018 Sales'])

# VIEWING DATA
# data frame shape
df.shape

# quick statistical description
df.describe()

# first and last row preview
df.head()
df.tail()

# get specified row (from index)
df.loc[8]

# get column1 of 8th row
df.loc[8, 'column1']

# get subset of rows
df.loc[range(4,8)]

# FILTERING
# filter by value in given column 
df[df[‘col’] == ‘val’]

# use logical ops to further specify filters
df[(df['language']=='french') | (df['year']==1989) & ~(df['city']=='London')]

# return language column elements that take on one of the given values
df[df['language'].isin(['english','french'])]

# UPDATING DATA; first index are rows, second is column
data.loc[df['language']=='french', 'language'] = 'French'

# MEDIUM LEVEL FUNCTIONS
# counting occurrences
df['langauge'].value_counts()

# map: applies function to each column element; returns same size sequence
df['language'].map(len)

# chain maps together; eg col values -> col of value lens -> lambda func
df['langauge'].map(len).map(lambda x: x/100)

# apply function along given axis; eg sum up whole columns
df.apply(sum)

# applymap applies a map to all elements, returns same size dataframe
df.applymap(lambda x: x+2)

# get correlation relations between columns
df.corr()

# show scatter matrix across columns
pd.plotting.scatter_matrix(df, figsize=(12,8))

# ADVANCED LEVEL FUNCTIONS
# join matrices by specified columns like an SQL join
df.merge(df2, on=['column1', 'column2'], how='inner')

# groupby unique column values
df.groupby('language')

# iterating over rows
for i, row in df.iterrows():
    print('Row {} has value {}'.format(i,row))

Creation

# create dataframe from raw data:
data = {'Apples':[35,41], 'Bananas':[21,34]}
pd.DataFrame(data, columns=['Apples','Bananas'], index=['2017 Sales', '2018 Sales'])

# create series:
pd.Series(['4 cups', '1 cup', '2 large', '1 can'], index=['Flour', 'Milk', 'Eggs', 'Spam'])