pandas
Essential library for Python data analysis. Structured data manipulation with DataFrame, support for various data sources like CSV/JSON/SQL. Foundation tool for data cleaning, transformation, aggregation, and visualization.
GitHub Overview
pandas-dev/pandas
Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
Overview
Pandas is the core Python library for data manipulation and analysis.
Details
Pandas is the central library for data manipulation and analysis in Python; development began in 2008, led by Wes McKinney. As an essential tool for processing, transforming, and analyzing structured data, it is a major productivity lever for data scientists. It provides two primary data structures, Series and DataFrame, and supports reading from and writing to many data sources, including CSV, Excel, JSON, and SQL databases. Data cleaning, filtering, grouping, aggregation, and joining are all expressed through an intuitive API, and it integrates seamlessly with other Python libraries such as NumPy, Matplotlib, and scikit-learn. Used across business analysis, financial data analysis, research, and machine learning preprocessing, it is the foundation of Python's data science ecosystem.
Pros and Cons
Pros
- Intuitive API: Simple and understandable data manipulation
- Rich Data Source Support: Many formats including CSV, Excel, JSON, SQL
- High Performance: Efficient operations based on NumPy
- Flexible Data Structures: DataFrame and Series for representing various data
- Missing Value Handling: Appropriate handling of NaN values
- Group Operations: Powerful GroupBy functionality
- Ecosystem Integration: Seamless connection with other Python libraries
Cons
- Memory Usage: Can consume large amounts of memory
- Performance: Speed degradation with very large datasets
- Learning Curve: Can be difficult for beginners due to many features
- Data Type Constraints: Mixed-type columns fall back to the slow, generic object dtype
- Index Complexity: MultiIndex can be difficult to understand
- Thread Safety: Limitations in multi-threaded environments
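The memory-usage caveat above can often be mitigated by choosing tighter dtypes. A minimal sketch (column names and sizes here are illustrative, not from the source): converting a repetitive string column to the category dtype and downcasting integers typically shrinks a frame considerably.

```python
import numpy as np
import pandas as pd

# Illustrative frame: a low-cardinality string column plus small integers
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'city': rng.choice(['Tokyo', 'Osaka', 'Nagoya'], 100_000),
    'value': rng.integers(0, 1_000, 100_000),
})

before = df.memory_usage(deep=True).sum()

# Repeated strings -> category dtype; integers -> smallest fitting int type
df['city'] = df['city'].astype('category')
df['value'] = pd.to_numeric(df['value'], downcast='integer')

after = df.memory_usage(deep=True).sum()
print(f"before: {before:,} bytes, after: {after:,} bytes")
```

The category dtype stores each distinct string once plus small integer codes, which is why it pays off exactly when a column has few unique values.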
Key Links
- Pandas Official Site
- Pandas Official Documentation
- Pandas GitHub Repository
- Pandas Tutorials
- Pandas API Reference
- Pandas Community
Code Examples
Hello World
import pandas as pd
import numpy as np
# Check Pandas version
print(f"Pandas version: {pd.__version__}")
# Create Series
series = pd.Series([1, 2, 3, 4, 5], name='sample_series')
print("Series:")
print(series)
# Create DataFrame
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, 30, 35, 28],
    'city': ['Tokyo', 'Osaka', 'Nagoya', 'Fukuoka'],
    'salary': [50000, 60000, 70000, 55000]
}
df = pd.DataFrame(data)
print("\nDataFrame:")
print(df)
# Basic information
print(f"\nData shape: {df.shape}")
print(f"Column names: {df.columns.tolist()}")
print(f"Data types:\n{df.dtypes}")
Data Reading and Saving
import pandas as pd
# Read CSV file
# df = pd.read_csv('data.csv', encoding='utf-8')
# Read Excel file
# df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# Read JSON file
# df = pd.read_json('data.json')
# Read from SQL database
# import sqlite3
# conn = sqlite3.connect('database.db')
# df = pd.read_sql_query('SELECT * FROM table_name', conn)
# Create sample data
data = {
    'product': ['A', 'B', 'C', 'D', 'E'],
    'price': [100, 150, 200, 120, 180],
    'quantity': [10, 5, 8, 15, 12],
    'category': ['Electronics', 'Clothing', 'Books', 'Electronics', 'Clothing']
}
df = pd.DataFrame(data)
print("Sample data:")
print(df)
# Save data
# Save as CSV
# df.to_csv('output.csv', index=False, encoding='utf-8')
# Save as Excel
# df.to_excel('output.xlsx', index=False, sheet_name='data')
# Save as JSON
# df.to_json('output.json', orient='records', ensure_ascii=False, indent=2)
# Save as Parquet (fast & compressed)
# df.to_parquet('output.parquet')
print("Data saving options are commented out")
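The file reads and writes above are commented out because they need real paths. A small round trip through an in-memory io.StringIO buffer exercises the same to_csv/read_csv API in a runnable form:

```python
import io
import pandas as pd

df = pd.DataFrame({
    'product': ['A', 'B', 'C'],
    'price': [100, 150, 200],
})

# Write CSV to an in-memory buffer instead of a file on disk
buffer = io.StringIO()
df.to_csv(buffer, index=False)

# Rewind the buffer and read the CSV back into a new DataFrame
buffer.seek(0)
restored = pd.read_csv(buffer)
print(restored)
```

The same pattern works for testing any text-based reader/writer pair without touching the filesystem.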
Data Exploration and Selection
import pandas as pd
import numpy as np
# Create sample data
np.random.seed(42)
df = pd.DataFrame({
    'id': range(1, 101),
    'name': [f'User{i:03d}' for i in range(1, 101)],
    'age': np.random.randint(18, 65, 100),
    'salary': np.random.randint(30000, 100000, 100),
    'department': np.random.choice(['Sales', 'Development', 'HR', 'Marketing'], 100),
    'join_date': pd.date_range(start='2020-01-01', periods=100, freq='D')
})
# Data overview
print("Data overview:")
print(df.info())
# Statistical information
print("\nStatistical information:")
print(df.describe())
# First 5 rows
print("\nFirst 5 rows:")
print(df.head())
# Select specific columns
print("\nName and age only:")
print(df[['name', 'age']].head())
# Filter by condition
print("\nUsers 30 years or older:")
filtered = df[df['age'] >= 30]
print(filtered[['name', 'age', 'department']].head())
# Multiple conditions
print("\nUsers 30+ years old in Development:")
complex_filter = df[(df['age'] >= 30) & (df['department'] == 'Development')]
print(complex_filter[['name', 'age', 'salary']].head())
# Sorting
print("\nSort by age (descending):")
sorted_df = df.sort_values('age', ascending=False)
print(sorted_df[['name', 'age', 'department']].head())
Data Processing and Transformation
import pandas as pd
import numpy as np
# Sample data
df = pd.DataFrame({
    'product': ['A', 'B', 'C', 'D', 'E', 'A', 'B'],
    'price': [100, 150, 200, 120, 180, 110, 160],
    'quantity': [10, 5, 8, 15, 12, 8, 6],
    'date': pd.date_range('2023-01-01', periods=7)
})
print("Original data:")
print(df)
# Add new columns
df['total_value'] = df['price'] * df['quantity']
df['price_category'] = df['price'].apply(
    lambda x: 'High' if x >= 150 else ('Medium' if x >= 120 else 'Low')
)
print("\nAfter adding columns:")
print(df)
# String operations
df['product_code'] = 'PROD_' + df['product']
df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')
# Handling missing values
df_with_nan = df.copy()
df_with_nan.loc[2, 'price'] = np.nan
df_with_nan.loc[4, 'quantity'] = np.nan
print("\nWith missing values:")
print(df_with_nan)
# Fill missing values
df_filled = df_with_nan.fillna({
    'price': df_with_nan['price'].mean(),
    'quantity': df_with_nan['quantity'].median()
})
print("\nAfter filling missing values:")
print(df_filled)
# Remove duplicates
df_no_duplicates = df.drop_duplicates(subset=['product'])
print("\nAfter removing duplicates (product-based):")
print(df_no_duplicates)
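The price_category lambda above calls Python once per row; on large frames the same binning is usually done vectorized with pd.cut. A sketch on a fresh frame, with bin edges chosen to mirror the lambda's thresholds:

```python
import pandas as pd

prices_df = pd.DataFrame({'price': [100, 150, 200, 120, 180, 110, 160]})

# Same rule as the lambda: <120 -> Low, 120-149 -> Medium, >=150 -> High.
# pd.cut bins are right-inclusive by default: (0,119], (119,149], (149,inf]
prices_df['price_category'] = pd.cut(
    prices_df['price'],
    bins=[0, 119, 149, float('inf')],
    labels=['Low', 'Medium', 'High'],
)
print(prices_df)
```

Besides speed, pd.cut returns an ordered categorical, so sorting and grouping by the bins behaves sensibly.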
Grouping and Aggregation
import pandas as pd
import numpy as np
# Sample data (sales data)
np.random.seed(42)
df = pd.DataFrame({
    'date': pd.date_range('2023-01-01', periods=365),
    'region': np.random.choice(['Tokyo', 'Osaka', 'Nagoya', 'Fukuoka'], 365),
    'product': np.random.choice(['A', 'B', 'C'], 365),
    'sales': np.random.randint(1000, 10000, 365),
    'quantity': np.random.randint(1, 50, 365)
})
# Add date-related columns
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
print("Sample data:")
print(df.head())
# Regional aggregation
print("\nRegional aggregation:")
region_summary = df.groupby('region').agg({
    'sales': ['sum', 'mean', 'count'],
    'quantity': ['sum', 'mean']
}).round(2)
print(region_summary)
# Product and monthly aggregation
print("\nMonthly sales by product:")
monthly_product = df.groupby(['product', 'month'])['sales'].sum().unstack(fill_value=0)
print(monthly_product.head())
# Custom aggregation function
def custom_stats(x):
    return pd.Series({
        'total': x.sum(),
        'average': x.mean(),
        'max': x.max(),
        'min': x.min(),
        'range': x.max() - x.min()
    })
print("\nCustom aggregation:")
custom_summary = df.groupby('region')['sales'].apply(custom_stats)
print(custom_summary)
# Pivot table
print("\nPivot table (region × product):")
pivot_table = pd.pivot_table(
    df,
    values='sales',
    index='region',
    columns='product',
    aggfunc='sum',
    fill_value=0
)
print(pivot_table)
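groupby().agg collapses each group to one row; groupby().transform instead returns a value for every original row, which is what you want for group-relative features. A minimal sketch on a tiny illustrative frame:

```python
import pandas as pd

sales_df = pd.DataFrame({
    'region': ['Tokyo', 'Tokyo', 'Osaka', 'Osaka'],
    'sales': [100, 300, 200, 200],
})

# transform broadcasts the group mean back onto every row of the group
sales_df['region_mean'] = sales_df.groupby('region')['sales'].transform('mean')

# Each row's deviation from its own region's mean
sales_df['vs_region_mean'] = sales_df['sales'] - sales_df['region_mean']
print(sales_df)
```

Because transform preserves the original shape, the result can be assigned straight back as a new column with no merging step.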
Data Joining and Merging
import pandas as pd
# Create sample data
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, 30, 35, 28]
})
df2 = pd.DataFrame({
    'id': [1, 2, 3, 5],
    'department': ['Sales', 'Development', 'HR', 'Marketing'],
    'salary': [50000, 60000, 55000, 65000]
})
print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)
# Inner join
print("\nInner join:")
inner_join = pd.merge(df1, df2, on='id', how='inner')
print(inner_join)
# Left join
print("\nLeft join:")
left_join = pd.merge(df1, df2, on='id', how='left')
print(left_join)
# Outer join
print("\nOuter join:")
outer_join = pd.merge(df1, df2, on='id', how='outer')
print(outer_join)
# Concatenate DataFrames
df3 = pd.DataFrame({
    'id': [6, 7],
    'name': ['Eve', 'Frank'],
    'age': [32, 29]
})
print("\nRow-wise concatenation:")
concatenated = pd.concat([df1, df3], ignore_index=True)
print(concatenated)
# Column-wise concatenation
df4 = pd.DataFrame({
    'country': ['Japan', 'Japan', 'Japan', 'Japan'],
    'city': ['Tokyo', 'Osaka', 'Nagoya', 'Fukuoka']
})
print("\nColumn-wise concatenation:")
column_concat = pd.concat([df1, df4], axis=1)
print(column_concat)
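When a merge produces surprising row counts, merge's indicator and validate parameters help audit it: indicator=True adds a _merge column recording each row's origin, and validate raises if the keys are not unique as expected. A sketch reusing frames shaped like df1 and df2 above:

```python
import pandas as pd

df1 = pd.DataFrame({'id': [1, 2, 3, 4], 'name': ['Alice', 'Bob', 'Charlie', 'Diana']})
df2 = pd.DataFrame({'id': [1, 2, 3, 5], 'salary': [50000, 60000, 55000, 65000]})

# _merge will be 'both', 'left_only', or 'right_only' per row;
# validate='one_to_one' raises MergeError if either key has duplicates
audit = pd.merge(df1, df2, on='id', how='outer', indicator=True, validate='one_to_one')
print(audit)

# Rows that failed to match are now trivial to isolate
left_only = audit[audit['_merge'] == 'left_only']
print(left_only)
```

Checking _merge counts after an outer join is a quick sanity test that two sources line up the way you assumed.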
Time Series Data Processing
import pandas as pd
import numpy as np
# Create time series data
np.random.seed(42)
dates = pd.date_range('2023-01-01', periods=365, freq='D')
values = np.cumsum(np.random.randn(365)) + 100
ts_df = pd.DataFrame({
    'date': dates,
    'value': values
})
# Set date as index
ts_df.set_index('date', inplace=True)
print("Time series data:")
print(ts_df.head())
# Add date-related columns
ts_df['year'] = ts_df.index.year
ts_df['month'] = ts_df.index.month
ts_df['quarter'] = ts_df.index.quarter
ts_df['day_of_week'] = ts_df.index.day_name()
# Moving averages
ts_df['ma_7'] = ts_df['value'].rolling(window=7).mean()
ts_df['ma_30'] = ts_df['value'].rolling(window=30).mean()
# Day-over-day and month-over-month changes
ts_df['value_diff'] = ts_df['value'].diff()
ts_df['value_pct_change'] = ts_df['value'].pct_change()
print("\nAfter adding time series features:")
print(ts_df.head(10))
# Monthly aggregation
print("\nMonthly summary:")
monthly_summary = ts_df.resample('M').agg({
    'value': ['mean', 'max', 'min', 'std']
}).round(2)
print(monthly_summary.head())
# Extract data for specific period
print("\nJanuary 2023 data:")
january_data = ts_df.loc['2023-01']
print(january_data.head())
# Statistical information
print("\nTime series statistics:")
print(f"Period: {ts_df.index.min()} to {ts_df.index.max()}")
print(f"Mean value: {ts_df['value'].mean():.2f}")
print(f"Max value: {ts_df['value'].max():.2f}")
print(f"Min value: {ts_df['value'].min():.2f}")
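A common next step with a series like this is building lag features with shift, e.g. as model inputs for forecasting. A minimal sketch on a fresh frame (the window sizes are illustrative):

```python
import numpy as np
import pandas as pd

idx = pd.date_range('2023-01-01', periods=10, freq='D')
ts = pd.DataFrame({'value': np.arange(10, dtype=float)}, index=idx)

# shift(n) moves values down n rows, so each row sees the value n days earlier
ts['lag_1'] = ts['value'].shift(1)
ts['lag_7'] = ts['value'].shift(7)

# Early rows have no history yet, so their lag columns are NaN
print(ts.head(8))
```

The NaN rows at the start are normal; drop them (dropna) or start training after the longest lag has data.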