Pandas Reference
0. Session & Basics
0.1 Session Init
import pandas as pd
import numpy as np
# Standard convention for importing Pandas
# Often used alongside NumPy for vectorized operations
TERMINAL OUTPUT
(No output)
0.2 Context/Config
# Configure display options for terminal output
# (options persist for the session; pd.reset_option undoes them)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)
# Get current configuration value
print(pd.get_option('display.max_rows'))
TERMINAL OUTPUT
500
0.3 Show/Display
# Small demo frame to illustrate the display helpers below
df = pd.DataFrame({'A': range(100), 'B': range(100)})
# Peek at the first 5 rows (head(n) defaults to n=5)
print(df.head(5))
# Convert entire DataFrame to string for full inspection
# (Careful with large DataFrames -- this materializes every row)
# print(df.to_string())
TERMINAL OUTPUT
A B
0 0 0
1 1 1
2 2 2
3 3 3
4 4 4
1. DataFrames & I/O
1.1 Read (CSV/Parquet)
# Read CSV with automatic type inference
# (dtype=, parse_dates=, usecols= give explicit control when needed)
df_csv = pd.read_csv("data.csv")
# Read Parquet (requires the pyarrow or fastparquet engine installed)
df_parquet = pd.read_parquet("data.parquet")
TERMINAL OUTPUT
(No output)
1.2 Write
# Write to CSV without the index column (row labels are dropped)
df.to_csv("output.csv", index=False)
# Write to Parquet with Snappy compression
df.to_parquet("output.parquet", compression='snappy')
TERMINAL OUTPUT
(No output)
1.3 Create from Local
# Create from dictionary of lists (column-oriented input)
df_dict = pd.DataFrame({
"name": ["Alice", "Bob"],
"age": [25, 30]
})
# Create from list of records / dicts (row-oriented input)
df_list = pd.DataFrame([
{"name": "Alice", "age": 25},
{"name": "Bob", "age": 30}
])
TERMINAL OUTPUT
(No output)
2. Schema & Types
2.1 StructType/Field
# Define explicit dtypes during creation or via dictionary
# ('string' is the dedicated pandas string dtype, distinct from object)
dtypes = {
'name': 'string',
'age': 'int32',
'salary': 'float64'
}
# Build an empty frame with the given columns, then cast each column
df = pd.DataFrame(columns=dtypes.keys()).astype(dtypes)
TERMINAL OUTPUT
(No output)
2.2 Casting
# Convert column types using astype (raises if a value cannot convert)
df['age'] = df['age'].astype('int64')
# Handle errors during numeric conversion:
# errors='coerce' turns unparseable values into NaN instead of raising
df['score'] = pd.to_numeric(df['score'], errors='coerce')
TERMINAL OUTPUT
(No output)
2.3 Inspecting Schema
# Summary of dtypes, memory usage, and non-null counts
# (info() prints directly to stdout and returns None -- no print() needed)
df.info()
# Get series of dtypes for all columns
print(df.dtypes)
TERMINAL OUTPUT
<class 'pandas.core.frame.DataFrame'>
...
dtypes: float64(1), int64(1), string(1)
memory usage: ...
name string
age int64
salary float64
dtype: object
3. Selection & Filtering
3.1 Select/Alias
# Selection by label (loc) or position (iloc)
# NOTE: loc label slices include both endpoints; iloc follows
# Python's half-open convention (stop excluded)
df_subset = df.loc[:, ['name', 'age']]
df_pos = df.iloc[0:5, 0:2]
# Rename columns (Aliasing) -- returns a new frame, original untouched
df_renamed = df.rename(columns={'name': 'user_name', 'age': 'user_age'})
TERMINAL OUTPUT
(No output)
3.2 Filter/Where
# Boolean indexing with a vectorized comparison mask
df_filtered = df[df['age'] > 21]
# Query syntax (highly readable for complex conditions;
# column names are referenced directly inside the string)
df_query = df.query("age > 21 and salary < 50000")
TERMINAL OUTPUT
(No output)
3.3 Drop/Drop Duplicates
# Drop specific columns (returns a new DataFrame)
df_less = df.drop(columns=['salary'])
# Remove duplicate rows based on subset of columns,
# keeping the first occurrence within each duplicate group
df_unique = df.drop_duplicates(subset=['name'], keep='first')
TERMINAL OUTPUT
(No output)
4. Column Operations
4.1 withColumn
# Direct assignment (Vectorized by default; mutates df in place)
df['is_adult'] = df['age'] >= 18
# Assign multiple columns at once (assign returns a new DataFrame)
df = df.assign(
age_months = df['age'] * 12,
tax = df['salary'] * 0.2
)
TERMINAL OUTPUT
(No output)
4.2 when/otherwise
# Using numpy.where as a vectorized if/else for conditional logic
df['category'] = np.where(df['age'] >= 18, 'Adult', 'Minor')
# mask replaces values WHERE the condition is True
# (Series.where is the complement: replaces where the condition is False)
df['salary'] = df['salary'].mask(df['salary'] < 0, 0)
TERMINAL OUTPUT
(No output)
4.3 String/Date Math
# String accessor (.str) for vectorized string operations
df['name_upper'] = df['name'].str.upper()
# Convert to datetime, then do date math via the .dt accessor
df['created_at'] = pd.to_datetime(df['timestamp'])
df['days_since'] = (pd.Timestamp.now() - df['created_at']).dt.days
TERMINAL OUTPUT
(No output)
5. Aggregations & Grouping
5.1 groupBy
# Group by one or more columns; the GroupBy object is lazy --
# nothing is computed until an aggregation is applied to it
group = df.groupby('category')
TERMINAL OUTPUT
(No output)
5.2 Aggregate Functions
# Multiple aggregations using agg() -- the dict maps column -> function(s)
df_agg = df.groupby('category').agg({
'age': ['mean', 'min', 'max'],
'salary': 'sum'
})
# Flatten multi-index columns after aggregation:
# each col is a tuple like ('age', 'mean') -> 'age_mean'
df_agg.columns = ['_'.join(col) for col in df_agg.columns]
TERMINAL OUTPUT
(No output)
5.3 Pivot
# Create a spreadsheet-style pivot table:
# rows = category, columns = year, cells = mean salary
# (fill_value replaces missing combinations instead of NaN)
pivot = df.pivot_table(
index='category',
columns='year',
values='salary',
aggfunc='mean',
fill_value=0
)
TERMINAL OUTPUT
(No output)
6. Joins & Set Operations
6.1 Inner/Outer/Left Joins
# Database-style joins using merge (how: 'inner'/'left'/'right'/'outer')
df_joined = pd.merge(df1, df2, on='id', how='left')
# Joining on different column names (both key columns are kept in
# the result)
df_joined_alt = pd.merge(df1, df2, left_on='u_id', right_on='id')
TERMINAL OUTPUT
(No output)
6.2 Broadcast Join
# In Pandas (single-node), "broadcasting" is achieved by mapping
# a small lookup dictionary for maximum performance
lookup = df_small.set_index('id')['value'].to_dict()
# map() does a dict lookup per row; ids missing from lookup become NaN
df_large['val'] = df_large['id'].map(lookup)
TERMINAL OUTPUT
(No output)
6.3 Union/Intersect
# Vertical concatenation (Union); ignore_index renumbers rows 0..n-1
df_union = pd.concat([df1, df2], axis=0, ignore_index=True)
# Find common rows (Intersect): with no 'on' given, merge joins on
# every column name the two frames share
df_intersect = pd.merge(df1, df2, how='inner')
TERMINAL OUTPUT
(No output)
7. Window Functions
7.1 WindowSpec
# Rolling window for moving averages
# (the first window-1 rows are NaN until the window fills)
df['moving_avg'] = df['salary'].rolling(window=7).mean()
# Expanding window for cumulative sums (window grows from the start)
df['cum_sum'] = df['salary'].expanding().sum()
TERMINAL OUTPUT
(No output)
7.2 Ranking
# Compute numerical rank (1 through N); method='min' gives every
# member of a tie the smallest rank in the group
df['salary_rank'] = df['salary'].rank(ascending=False, method='min')
# Percentile rank (rank divided by the number of valid values)
df['salary_pct'] = df['salary'].rank(pct=True)
TERMINAL OUTPUT
(No output)
7.3 Lead/Lag
# Shift values down to reference the previous row (Lag);
# the first row becomes NaN
df['prev_salary'] = df['salary'].shift(1)
# Shift values backwards to reference the next row (Lead)
df['next_salary'] = df['salary'].shift(-1)
TERMINAL OUTPUT
(No output)
8. UDFs & Advanced
8.1 Standard UDFs
# DataFrame.apply runs a function row-wise (axis=1) or col-wise (axis=0);
# Series.apply, used here, maps the function element-wise (no axis arg).
# Slow for large datasets; use vectorization where possible
df['processed'] = df['name'].apply(lambda x: x[::-1])
TERMINAL OUTPUT
(No output)
8.2 Vectorized UDFs
# Use NumPy functions directly on Series for maximum speed
# (log1p computes log(1 + x), stable for small x)
df['log_salary'] = np.log1p(df['salary'])
# Vectorized string slicing via the .str accessor
df['prefix'] = df['name'].str[:3]
TERMINAL OUTPUT
(No output)
8.3 RDD Interop
# Convert to underlying NumPy array (similar to RDD mapping)
# NOTE(review): mixed-dtype frames upcast to a common dtype (often
# object) here, so per-column dtypes are lost in the round-trip
raw_array = df.to_numpy()
# Round-trip from NumPy back to Pandas; re-cast dtypes if needed
df_new = pd.DataFrame(raw_array, columns=df.columns)
TERMINAL OUTPUT
(No output)
9. Performance & Tuning
9.1 Cache/Persist
# Optimize memory usage by downcasting and using categories
# (category stores each distinct value once plus small integer codes)
df['category'] = df['category'].astype('category')
df['age'] = pd.to_numeric(df['age'], downcast='unsigned')
# This reduces memory footprint, effectively "persisting"
# a more efficient representation.
TERMINAL OUTPUT
(No output)
9.2 Repartition/Coalesce
# Pandas is single-threaded; simulate partitioning for
# parallel processing or chunking large files
chunks = np.array_split(df, 4)
# Process in chunks to avoid OOM -- chunksize yields one
# DataFrame per chunk lazily instead of loading the whole file
# for chunk in pd.read_csv("huge.csv", chunksize=100000):
# process(chunk)
TERMINAL OUTPUT
(No output)
9.3 Explain Plan
# Inspect deep memory usage (includes object/string contents).
# info() writes its report to stdout and returns None, so do NOT
# wrap it in print() -- that would emit a spurious trailing "None".
df.info(memory_usage='deep')
# Profile execution time for a block
# %timeit df.groupby('A').sum() (IPython/Jupyter only)
TERMINAL OUTPUT
<class 'pandas.core.frame.DataFrame'>
...
memory usage: 1.2 MB