Skip to content

Pandas

Import

The minimum pandas imports are:

import pandas as pd
import numpy as np

Import / Export Data

Search for files

# List files in a directory
dirlist = os.listdir(inputDir)
print("Files from directory ", inputDir)
for file in dirlist:
  print("  * ", file)

# Glob search for specific files
wkbks = glob(os.path.join(os.pardir, 'input', 'xlsx_files_all', 'Ir*.xls'))
wkbks = sorted(wkbks)

Excel

# import excel file
df = pd.read_excel(filename)

# import one sheet from excel file
df = pd.read_excel(filename,sheet_name='Sheet2')

# define first column as the index
df = pd.read_excel(filename,sheet_name='Sheet1', index_col=0)

# import specific columns
df = pd.read_excel(filename, sheet_name='Sheet1', header=None, skiprows=1, usecols='B,D')

# export excel file
df.to_excel(filename, index=False)

CSV

# import csv file
df = pd.read_csv(filename)

# export first 10 rows
df[:10].to_csv(filename, index=False)

H5

H5 (HDF5) is a file format to efficiently store multiple pandas dataframes. During export a dataframe name (key) has to be given; the same key is needed to reimport the dataframe. The H5 format supports compression.

# Import H5 File
if filename:
   with pd.HDFStore(filename) as hdf:
     df = hdf.select(datasetname)
# Import H5 file
pd.read_hdf(filename, datasetname)

# Export H5 File
with pd.HDFStore(filename, mode='w') as hdf:
  hdf.put(datasetname, df, format='table', data_columns=True)

Copy

# Copy dataframe
df = df.copy(deep=True)

Access information

# General information
df.info()

# Get begin data
df.head()
df.head(10)

# Get end data
df.tail()
df.tail(10)

# Get columns
df.columns
df.columns.tolist()

# Get number of columns and rows, cells
df.shape[0]     # rows
df.shape[1]     # columns
df.size         # elements (rows * columns)

# Count unique values in a column
len(df['colname'].unique())

Index

# set index
df = df.set_index('colname')

# reset index
df = df.reset_index()

Statistic

df.describe()

# count values in a column
pd.value_counts(df['colname'])

# Count non-NA values
df.count(axis=0)

# sum
df.sum(axis=0) # 0 for column wise total
df.sum(axis=1) # row wise sum

# sum based on criteria
df[df['colname'] == value].sum()
df[df['colname'].map(lambda x: x.endswith('sa')) & (df['colname2'] > 5.1)].sum()

# Average
df[df['colname'] == value].mean()
df[df['colname'].map(lambda x: x.endswith('sa')) & (df['colname2'] > 5.1)].mean()

# Max / Min
df[df['colname'] == value].max()
df[df['colname'] == value].min()

Pivot

A pivot table is similar to the GroupBy methods. It allows you to group a table and quickly summarize the grouped data.

Datetime

df['colname'] = pd.to_datetime(df['colname'], format='%d/%m/%Y %H:%M:%S')

for the format values see ./datetime

Access Data

# Access specific columns
df['colname']
df['colname'].tolist()
df.iloc[:,[4]]
df.loc[:,['colname']]

# Access Multiple columns
df[['colname1', 'colname2']]

# Access rows (index starts at 0)
df.loc[20:30]
df.loc[20:30, ['colname']]
# get 3 first rows
df.iloc[0:3]

# Get unique values
df['colname'].unique()

# Get random sample of rows while maintaining index
df.sample(frac=0.25)

Filter

# Filter for values in column
df[df['colname'] == value]
df[df['colname'].isin([value1, value2])]

# Filter all NOT in list
df[~df['colname'].isin(values)]

# Filter numeric conditions
df[df['colname'] == 5.1]
df[df['colname'] > 5.1]

# Custom Filter
df[df['colname'].map(lambda x: x.endswith('sa'))]

# Combine filters
df[df['colname'].map(lambda x: x.endswith('sa')) & (df['SepalLength'] > 5.1)]

# Contain filter
df[df['colname'].str.contains('set')]

# drop duplicates
df.drop_duplicates(subset=['colname'])

# get duplicates
df_duplicates = df[df.duplicated()]

Sorting

# Sort data by value
df.sort_values('colname', ascending=False)

# Sort data by value (with all options spelled out)
df.sort_values('colname', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')

Aggregating

# groupby and count
df.groupby('colname').count()

# groupby and aggregate
df.groupby(['colname']).agg({
  'colname1': "sum",
  'colname2': "count",
  'colname3': "first",
  'colname4': "last",

}).reset_index()

Add

# add new column with initial value True
df['colname'] = True

Remove

# remove columns
df.drop(['colname'], axis = 1)

# keep only a few columns
df = df[['colname1', 'colname2']]

Iterate over row indices

for idx, row in df[:2].iterrows():
  print(idx,row)