Pandas Tutorial: Complete Guide to Data Analysis in Python
Master Pandas for data analysis. Learn DataFrames, data cleaning, manipulation, grouping, merging, and visualization with practical examples.
Moshiour Rahman
What is Pandas?
Pandas is the most popular Python library for data manipulation and analysis. It provides powerful data structures like DataFrames that make working with structured data intuitive and efficient.
Installation
pip install pandas numpy
import pandas as pd
import numpy as np
Core Data Structures
Series
A one-dimensional labeled array:
# Create Series
s = pd.Series([1, 3, 5, 7, 9])
print(s)
# 0    1
# 1    3
# 2    5
# 3    7
# 4    9
# dtype: int64
# With custom index
s = pd.Series([1, 3, 5], index=['a', 'b', 'c'])
print(s['a']) # 1
# From dictionary
data = {'apple': 100, 'banana': 200, 'cherry': 150}
s = pd.Series(data)
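# Dictionary keys become the index:
# apple     100
# banana    200
# cherry    150
# dtype: int64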
DataFrame
A two-dimensional labeled data structure:
# Create DataFrame from dictionary
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [25, 30, 35, 28],
    'city': ['NYC', 'LA', 'Chicago', 'Houston'],
    'salary': [50000, 60000, 70000, 55000]
}
df = pd.DataFrame(data)
print(df)
#       name  age     city  salary
# 0    Alice   25      NYC   50000
# 1      Bob   30       LA   60000
# 2  Charlie   35  Chicago   70000
# 3    David   28  Houston   55000
# From list of dictionaries
data = [
    {'name': 'Alice', 'age': 25},
    {'name': 'Bob', 'age': 30}
]
df = pd.DataFrame(data)
# From NumPy array
arr = np.random.randn(5, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
Reading and Writing Data
CSV Files
# Read CSV
df = pd.read_csv('data.csv')
# With options
df = pd.read_csv(
    'data.csv',
    sep=',',                                # Delimiter
    header=0,                               # Row to use as header
    index_col='id',                         # Column to use as index
    usecols=['id', 'name', 'age', 'date'],  # Columns to load (must include index/date columns)
    dtype={'age': int},                     # Data types
    na_values=['NA', 'N/A'],                # Values to treat as NaN
    parse_dates=['date'],                   # Parse date columns
    nrows=1000                              # Read only first 1000 rows
)
# Write CSV
df.to_csv('output.csv', index=False)
Excel Files
# Read Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# Read all sheets
dfs = pd.read_excel('data.xlsx', sheet_name=None)
# Write Excel
df.to_excel('output.xlsx', sheet_name='Results', index=False)
# Write multiple sheets
with pd.ExcelWriter('output.xlsx') as writer:
    df1.to_excel(writer, sheet_name='Data1')
    df2.to_excel(writer, sheet_name='Data2')
JSON
# Read JSON
df = pd.read_json('data.json')
# Write JSON
df.to_json('output.json', orient='records')
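For newline-delimited JSON (one record per line, as in many log files), pass lines=True; the .jsonl filenames here are illustrative:
# Read/write JSON Lines
df = pd.read_json('data.jsonl', lines=True)
df.to_json('output.jsonl', orient='records', lines=True)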
SQL Database
import sqlite3
# Connect to database
conn = sqlite3.connect('database.db')
# Read from SQL
df = pd.read_sql('SELECT * FROM users', conn)
df = pd.read_sql_query('SELECT * FROM users WHERE age > 25', conn)
# Write to SQL
df.to_sql('users', conn, if_exists='replace', index=False)
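When a query depends on runtime values, it is safer to pass them as parameters than to format them into the SQL string. A minimal sketch using sqlite3's ? placeholders:
# Parameterized query (sqlite3 uses ? placeholders)
min_age = 25
df = pd.read_sql_query('SELECT * FROM users WHERE age > ?', conn, params=(min_age,))
conn.close()  # Close the connection when finished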
Data Exploration
# Basic info
df.head() # First 5 rows
df.tail(10) # Last 10 rows
df.shape # (rows, columns)
df.columns # Column names
df.dtypes # Data types
df.info() # Summary info
df.describe() # Statistical summary
# Check for missing values
df.isnull().sum()
df.isna().any()
# Unique values
df['city'].unique()
df['city'].nunique() # Count of unique
df['city'].value_counts()
# Memory usage
df.memory_usage(deep=True)
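head() and tail() always show the same rows; a random sample is often a more representative peek:
# Random sample of rows
df.sample(5)         # 5 random rows
df.sample(frac=0.1)  # Random 10% of rows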
Data Selection
Column Selection
# Single column (returns Series)
df['name']
# Multiple columns (returns DataFrame)
df[['name', 'age']]
# Using dot notation (only works when the column name is a valid
# Python identifier and doesn't clash with a DataFrame attribute)
df.name
Row Selection
# By index position (iloc)
df.iloc[0] # First row
df.iloc[0:5] # First 5 rows
df.iloc[-1] # Last row
df.iloc[[0, 2, 4]] # Specific rows
# By label (loc)
df.loc[0] # Row with index 0
df.loc[0:4] # Rows 0 to 4 (inclusive!)
df.loc[df['age'] > 25] # Boolean indexing
# Combined selection
df.iloc[0:5, 0:2] # First 5 rows, first 2 columns
df.loc[0:4, ['name', 'age']]
Boolean Indexing
# Single condition
df[df['age'] > 25]
# Multiple conditions
df[(df['age'] > 25) & (df['salary'] > 50000)]
df[(df['city'] == 'NYC') | (df['city'] == 'LA')]
# Using isin()
df[df['city'].isin(['NYC', 'LA', 'Chicago'])]
# Using query() - more readable
df.query('age > 25 and salary > 50000')
df.query('city in ["NYC", "LA"]')
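query() can also reference local Python variables with the @ prefix, which keeps thresholds out of the query string itself:
# Reference a local variable inside query()
min_age = 25
df.query('age > @min_age')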
Data Cleaning
Handling Missing Values
# Check missing values
df.isnull().sum()
df.isna().sum()
# Drop rows with any missing values
df.dropna()
# Drop rows where specific columns are missing
df.dropna(subset=['name', 'age'])
# Fill missing values
df.fillna(0) # Fill with 0
df.fillna({'age': 0, 'salary': 50000}) # Different values
df['age'].fillna(df['age'].mean()) # Fill with mean
df.ffill() # Forward fill (fillna(method='ffill') is deprecated)
df.bfill() # Backward fill
# Interpolate
df['value'].interpolate(method='linear')
Data Type Conversion
# Convert data types
df['age'] = df['age'].astype(int)
df['salary'] = df['salary'].astype(float)
df['date'] = pd.to_datetime(df['date'])
# Convert to categorical (saves memory)
df['city'] = df['city'].astype('category')
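To see the effect, compare the column's memory footprint both ways; a quick sketch using the city column from the earlier examples:
as_object = df['city'].astype('object').memory_usage(deep=True)
as_category = df['city'].astype('category').memory_usage(deep=True)
# Categorical is usually much smaller for low-cardinality columns
print(as_object, as_category)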
String Operations
# String methods
df['name'].str.lower()
df['name'].str.upper()
df['name'].str.strip()
df['name'].str.replace('old', 'new')
df['name'].str.contains('pattern')
df['name'].str.split(' ')
df['name'].str.len()
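To pull substrings out with a regular expression, use str.extract; with expand=False it returns a Series (the pattern here simply grabs the first word):
# Extract the first word of each name
df['first_name'] = df['name'].str.extract(r'^(\w+)', expand=False)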
Removing Duplicates
# Check duplicates
df.duplicated().sum()
# Remove duplicates
df.drop_duplicates()
df.drop_duplicates(subset=['name', 'email'])
df.drop_duplicates(keep='first') # Keep first occurrence
Data Transformation
Adding/Modifying Columns
# Add new column
df['bonus'] = df['salary'] * 0.1
# Using assign (returns new DataFrame)
df = df.assign(
    bonus=lambda x: x['salary'] * 0.1,
    total=lambda x: x['salary'] + x['bonus']
)
# Apply function to column
df['age_group'] = df['age'].apply(lambda x: 'young' if x < 30 else 'senior')
# Apply function to multiple columns
df['full_name'] = df.apply(lambda row: f"{row['first']} {row['last']}", axis=1)
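Row-wise apply is flexible but slow on large DataFrames; a plain vectorized expression usually does the same job much faster (assuming the same first/last columns):
# Vectorized alternative to the row-wise apply above
df['full_name'] = df['first'] + ' ' + df['last']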
Rename Columns
# Rename specific columns
df.rename(columns={'old_name': 'new_name'})
# Rename all columns
df.columns = ['col1', 'col2', 'col3']
# Function-based rename
df.rename(columns=str.lower)
df.rename(columns=lambda x: x.replace(' ', '_'))
Sorting
# Sort by column
df.sort_values('age')
df.sort_values('age', ascending=False)
# Sort by multiple columns
df.sort_values(['city', 'age'], ascending=[True, False])
# Sort by index
df.sort_index()
Grouping and Aggregation
GroupBy
# Basic groupby
df.groupby('city')['salary'].mean()
# Multiple aggregations
df.groupby('city')['salary'].agg(['mean', 'min', 'max', 'count'])
# Multiple columns
df.groupby(['city', 'department'])['salary'].mean()
# Custom aggregations
df.groupby('city').agg({
    'salary': ['mean', 'sum'],
    'age': 'mean',
    'name': 'count'
})
# Named aggregations
df.groupby('city').agg(
    avg_salary=('salary', 'mean'),
    total_salary=('salary', 'sum'),
    employee_count=('name', 'count')
)
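Unlike agg, transform returns a result aligned to the original rows, which is handy for attaching group-level statistics as a new column:
# Each row gets the mean salary of its own city
df['city_avg_salary'] = df.groupby('city')['salary'].transform('mean')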
Pivot Tables
# Simple pivot table
df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc='mean'
)
# Multiple aggregations
df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc=['mean', 'count'],
    fill_value=0,
    margins=True  # Add row/column totals
)
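For plain frequency counts, pd.crosstab is a convenient shortcut (it uses the same city and department columns as the pivot examples):
# Count of rows for each city/department combination
pd.crosstab(df['city'], df['department'])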
Merging and Joining
Merge (SQL-style joins)
# Sample DataFrames
df1 = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie']
})
df2 = pd.DataFrame({
    'id': [1, 2, 4],
    'salary': [50000, 60000, 70000]
})
# Inner join (default)
pd.merge(df1, df2, on='id')
# Left join
pd.merge(df1, df2, on='id', how='left')
# Right join
pd.merge(df1, df2, on='id', how='right')
# Outer join
pd.merge(df1, df2, on='id', how='outer')
# Different key column names (illustrative: assumes the left frame has a 'user_id' column)
pd.merge(df1, df2, left_on='user_id', right_on='id')
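When debugging a join, indicator=True adds a _merge column showing where each row came from:
# _merge is 'left_only', 'right_only', or 'both'
pd.merge(df1, df2, on='id', how='outer', indicator=True)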
Concatenation
# Vertical concatenation (stack rows)
pd.concat([df1, df2], ignore_index=True)
# Horizontal concatenation (side by side)
pd.concat([df1, df2], axis=1)
Time Series
# Create datetime index
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
# Date components
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek
# Resampling
df.resample('ME').mean() # Monthly mean ('M' before pandas 2.2)
df.resample('W').sum() # Weekly sum
df.resample('QE').first() # Quarterly first value ('Q' before pandas 2.2)
# Rolling windows
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_sum'] = df['value'].rolling(window=30).sum()
# Shift/Lag
df['previous_day'] = df['value'].shift(1)
df['next_day'] = df['value'].shift(-1)
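# Day-over-day change built from the shifted column; pct_change wraps
# the same idea (assumes the same 'value' column as above)
df['change'] = df['value'] - df['value'].shift(1)
df['pct_change'] = df['value'].pct_change()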
# Date range
dates = pd.date_range(start='2024-01-01', end='2024-12-31', freq='D')
dates = pd.date_range(start='2024-01-01', periods=100, freq='D')
Quick Reference
| Operation | Code |
|---|---|
| Read CSV | pd.read_csv('file.csv') |
| Write CSV | df.to_csv('file.csv') |
| First n rows | df.head(n) |
| Column selection | df['col'] or df[['col1', 'col2']] |
| Row selection | df.iloc[0] or df.loc[label] |
| Filter rows | df[df['col'] > value] |
| Drop missing | df.dropna() |
| Fill missing | df.fillna(value) |
| Group by | df.groupby('col').mean() |
| Sort | df.sort_values('col') |
| Merge | pd.merge(df1, df2, on='key') |
Summary
Pandas is essential for any data work in Python. Start with:
- Loading data (read_csv, read_excel)
- Exploring data (head, info, describe)
- Cleaning data (dropna, fillna, astype)
- Selecting data (loc, iloc, boolean indexing)
- Transforming data (apply, groupby, merge)
Practice with real datasets and you’ll quickly become proficient at data manipulation.