Pandas Cheatsheet

The pandas operations you actually use every day, grouped by intent. One-liner explanations, no filler.

#language-tooling #python #pandas #data

Read & write

import pandas as pd

pd.read_csv("data.csv")              # CSV
pd.read_parquet("data.parquet")      # Parquet — faster, typed, smaller
pd.read_json("data.json", lines=True) # JSON Lines
pd.read_sql(query, conn)             # From SQLAlchemy connection

df.to_parquet("out.parquet")         # Prefer Parquet over CSV for anything serious
df.to_csv("out.csv", index=False)    # `index=False` 99% of the time

Inspect

df.head(10)        # First 10 rows
df.tail(5)         # Last 5 rows
df.shape           # (rows, cols)
df.dtypes          # Column types
df.info()          # Memory + nulls + types in one shot
df.describe()      # Summary stats for numeric columns
df["col"].value_counts()      # Frequency table
df.isna().sum()               # Nulls per column
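
A few quick refinements, assuming a column named "status":

df["status"].value_counts(normalize=True)   # Shares instead of counts
df["status"].value_counts(dropna=False)     # Count NaN as its own row
df.nunique()                                # Distinct values per column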

Select

df["col"]                    # One column → Series
df[["a", "b"]]               # Many columns → DataFrame
df.loc[df["age"] > 30]       # Boolean mask
df.loc[10:20, ["a", "b"]]    # Label-based slice (endpoint 20 included)
df.iloc[10:20, 0:3]          # Position-based slice (end-exclusive, like Python)
df.query("age > 30 and city == 'Paris'")  # SQL-like expression

Filter

df[df["status"].isin(["paid", "shipped"])]
df[df["name"].str.startswith("A")]
df[df["created_at"].between("2026-01-01", "2026-03-31")]
df.dropna(subset=["email"])           # Drop rows missing email
df.fillna({"age": 0, "country": "NA"})  # Fill specific columns

Transform

df["age_years"] = df["age_days"] / 365
df["full_name"] = df["first"] + " " + df["last"]
df["bucket"] = pd.cut(df["age"], bins=[0, 18, 35, 65, 120])
df["score_z"] = (df["score"] - df["score"].mean()) / df["score"].std()

df.assign(profit=lambda d: d.revenue - d.cost)  # Method chaining
df.rename(columns={"old": "new"})
df.drop(columns=["junk"])
df.astype({"id": "int64", "price": "float32"})

Group & aggregate

df.groupby("country")["revenue"].sum()
df.groupby("country").agg(
    total=("revenue", "sum"),
    avg=("revenue", "mean"),
    n=("id", "count"),
)
df.groupby(["country", "year"])["revenue"].mean().unstack()
df.pivot_table(index="country", columns="month",
               values="revenue", aggfunc="sum")

Merge & join

pd.merge(orders, users, on="user_id", how="left")
pd.merge(a, b, left_on="uid", right_on="id", how="inner")
pd.concat([df1, df2], axis=0)        # Stack rows
pd.concat([df1, df2], axis=1)        # Stack columns
df1.join(df2, on="key")              # df1["key"] against df2's index
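
Merges fail silently when keys duplicate or don't match; validate and indicator make that loud (a sketch reusing the orders/users frames above):

pd.merge(
    orders, users, on="user_id", how="left",
    validate="many_to_one",   # Raise if users has duplicate user_id
    indicator=True,           # Adds _merge: left_only / right_only / both
)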

Time series

df["ts"] = pd.to_datetime(df["ts"])
df = df.set_index("ts").sort_index()
df.resample("D")["sales"].sum()              # Daily totals
df["sales"].rolling("7D").mean()             # 7-day moving average
df["sales"].shift(1)                          # Lag 1 day
df["sales"].pct_change()                      # Day-over-day change
df.between_time("09:00", "17:00")             # Working hours
df.tz_localize("UTC").tz_convert("Europe/Paris")

Performance one-liners

df.memory_usage(deep=True).sum() / 1e6        # MB used
df["cat_col"] = df["cat_col"].astype("category")  # Save memory on low-cardinality strings
df.to_parquet("out.parquet")                  # Smaller, faster than CSV

When pandas is too slow, the answer is usually Polars or DuckDB, not “more pandas tricks”.
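
For example, DuckDB can run SQL directly against an in-memory pandas frame (a sketch, assuming pip install duckdb and country/revenue columns):

import duckdb
duckdb.sql("SELECT country, SUM(revenue) AS total FROM df GROUP BY country").df()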