Pandas Interview Questions and Answers

These questions span beginner-to-advanced Pandas — what’s actually tested in data science, data engineering, and machine learning engineer interviews.

Fundamentals

Q1. What is the difference between a Pandas Series and DataFrame?

import pandas as pd

# Series — one-dimensional labeled array
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s.dtype)   # int64
print(s['b'])    # 20

# DataFrame — two-dimensional labeled data structure
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Carol'],
    'dept': ['Eng', 'HR', 'Eng'],
    'salary': [95000, 72000, 88000]
})

# Each column is a Series
print(type(df['salary']))  # <class 'pandas.core.series.Series'>
print(df.shape)            # (3, 3)
print(df.dtypes)

A DataFrame is a dict of Series that share a common index. Operations on DataFrames typically operate column-wise (axis=0 along rows, axis=1 along columns).

Q2. What is the difference between .loc[] and .iloc[]?

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Carol', 'Dave'],
    'score': [88, 92, 79, 95]
}, index=[10, 20, 30, 40])

# .loc — label-based: uses index labels and column names
df.loc[10]           # Row with index label 10
df.loc[10:30]        # Rows with labels 10 through 30 (INCLUSIVE)
df.loc[10, 'name']   # Specific cell

# .iloc — integer position-based: uses 0-based positions
df.iloc[0]           # First row (position 0)
df.iloc[0:2]         # Rows at positions 0,1 (EXCLUSIVE upper bound)
df.iloc[0, 0]        # First row, first column

# Common mistake: mixing them up when index is integers
# df.loc[0] would raise KeyError — index starts at 10!
# df.iloc[0] returns first row — always position-based

Q3. How do you handle missing data in Pandas?

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'age': [25, np.nan, 30, np.nan, 28],
    'salary': [50000, 60000, np.nan, 75000, 65000],
    'dept': ['Eng', None, 'HR', 'Eng', None]
})

# Detect missing
print(df.isnull().sum())         # Count nulls per column
print(df.isnull().sum() / len(df))  # Null rate

# Drop rows with any null
df.dropna()
# Drop rows where ALL columns are null
df.dropna(how='all')
# Drop columns with more than 50% nulls
df.dropna(axis=1, thresh=int(len(df) * 0.5))

# Fill with a constant
df['dept'].fillna('Unknown')

# Fill numeric with mean/median/mode
df['age'].fillna(df['age'].mean())
df['salary'].fillna(df['salary'].median())

# Forward fill (time series — carry last valid value forward)
df['salary'].ffill()

# Interpolate
df['age'].interpolate(method='linear')

Data Manipulation

Q4. Explain groupby and provide a complex example.

groupby splits data into groups, applies a function, and combines results:

import pandas as pd

df = pd.DataFrame({
    'dept': ['Eng', 'Eng', 'HR', 'HR', 'Eng', 'Finance'],
    'level': ['Senior', 'Junior', 'Senior', 'Junior', 'Senior', 'Senior'],
    'salary': [120000, 75000, 95000, 65000, 115000, 105000],
    'bonus': [15000, 5000, 10000, 4000, 12000, 9000]
})

# Simple aggregation
print(df.groupby('dept')['salary'].mean())

# Multiple aggregations
stats = df.groupby('dept')['salary'].agg(['mean', 'max', 'count'])

# Named aggregations (clean syntax)
result = df.groupby(['dept', 'level']).agg(
    avg_salary=('salary', 'mean'),
    max_bonus=('bonus', 'max'),
    headcount=('salary', 'count')
).reset_index()

print(result)

# Transform — returns same shape as original (for feature engineering)
df['dept_avg_salary'] = df.groupby('dept')['salary'].transform('mean')
df['pct_above_dept_avg'] = df['salary'] / df['dept_avg_salary'] - 1

# Filter groups
big_depts = df.groupby('dept').filter(lambda x: len(x) >= 2)

Q5. What is the difference between merge, join, and concat?

import pandas as pd

employees = pd.DataFrame({
    'emp_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Carol', 'Dave'],
    'dept_id': [10, 10, 20, 30]
})

departments = pd.DataFrame({
    'dept_id': [10, 20, 40],
    'dept_name': ['Engineering', 'HR', 'Finance']
})

# MERGE — SQL-style join (most powerful and explicit)
inner = pd.merge(employees, departments, on='dept_id', how='inner')
left  = pd.merge(employees, departments, on='dept_id', how='left')
outer = pd.merge(employees, departments, on='dept_id', how='outer')

# JOIN — index-based merge (shorthand when joining on index)
employees.set_index('dept_id').join(departments.set_index('dept_id'), how='left')

# CONCAT — stack DataFrames vertically or horizontally
quarterly = [q1_df, q2_df, q3_df, q4_df]
annual = pd.concat(quarterly, ignore_index=True)

# Horizontal concat (cbind equivalent)
combined = pd.concat([df1, df2], axis=1)

Key difference: merge joins on column values; join joins on index; concat stacks DataFrames together.

Q6. How does apply work and when should you avoid it?

import pandas as pd

df = pd.DataFrame({'text': ['Hello World', 'foo bar baz', 'UPPER CASE']})

# Apply a function to each element
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

# Apply to rows (axis=1)
df['length_category'] = df.apply(
    lambda row: 'long' if row['word_count'] > 2 else 'short',
    axis=1
)

# Apply to entire column (returns Series or DataFrame)
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

df_numeric = pd.DataFrame({'a': [1,2,3,4], 'b': [10,20,30,40]})
df_numeric.apply(normalize)  # Applies per column

When to avoid apply: for element-wise string/numeric operations, vectorized alternatives are 10–100× faster:

# SLOW — Python loop disguised as apply
df['upper'] = df['text'].apply(str.upper)

# FAST — vectorized string method
df['upper'] = df['text'].str.upper()

# SLOW
df['total'] = df.apply(lambda r: r['a'] + r['b'], axis=1)
# FAST
df['total'] = df['a'] + df['b']

Performance & Optimization

Q7. How do you optimize Pandas for large datasets?

import pandas as pd

# 1. Read only needed columns
df = pd.read_csv('large_file.csv', usecols=['date', 'amount', 'user_id'])

# 2. Specify dtypes upfront to reduce memory
dtypes = {
    'user_id': 'int32',       # vs int64 — halves memory
    'amount': 'float32',      # vs float64
    'category': 'category'    # string → categorical (for low-cardinality columns)
}
df = pd.read_csv('large_file.csv', dtype=dtypes, parse_dates=['date'])

# 3. Check memory usage
df.info(memory_usage='deep')
df.memory_usage(deep=True)

# 4. Categorical dtype for string columns with few unique values
df['status'] = df['status'].astype('category')
# Memory: 'object' dtype stores full string per row; 'category' stores integers

# 5. Use chunking for files too large for RAM
results = []
for chunk in pd.read_csv('huge.csv', chunksize=100_000):
    summary = chunk.groupby('dept')['salary'].sum()
    results.append(summary)
final = pd.concat(results).groupby(level=0).sum()

# 6. Avoid chained indexing (SettingWithCopyWarning)
# BAD
df[df['age'] > 30]['salary'] = 0  # May not work!
# GOOD
df.loc[df['age'] > 30, 'salary'] = 0

Q8. What is vectorization in Pandas and why does it matter?

Vectorization means applying an operation across an entire array at once using NumPy’s C-level loops instead of Python’s interpreted for-loop:

import pandas as pd
import numpy as np

df = pd.DataFrame({'x': np.random.randn(1_000_000)})

# Python loop — slow (1M Python iterations)
%timeit df['x'].apply(lambda v: v ** 2)  # ~300ms

# Vectorized NumPy — fast (single C-level operation)
%timeit df['x'] ** 2                      # ~4ms  (75× faster)
%timeit np.square(df['x'])                # ~3ms

Always prefer:

Arithmetic operators (+, -, *, /) over apply(lambda x: x + 1)
str accessor for string ops: df['col'].str.upper(), str.contains(), str.extract()
dt accessor for datetime: df['date'].dt.year, dt.dayofweek
NumPy ufuncs for math: np.log(), np.sqrt(), np.where()

Advanced Topics

Q9. How do you perform time series operations in Pandas?

import pandas as pd
import numpy as np

# Create time series
dates = pd.date_range('2025-01-01', periods=365, freq='D')
df = pd.DataFrame({'date': dates, 'revenue': np.random.randint(1000, 5000, 365)})
df = df.set_index('date')

# Resample — aggregate to different frequency
weekly = df.resample('W').sum()
monthly = df.resample('ME').agg({'revenue': ['sum', 'mean', 'max']})

# Rolling window
df['7d_avg'] = df['revenue'].rolling(window=7).mean()
df['30d_std'] = df['revenue'].rolling(window=30).std()

# Expanding window (cumulative)
df['cumulative'] = df['revenue'].expanding().sum()

# Lag features
df['prev_day'] = df['revenue'].shift(1)
df['yoy_growth'] = df['revenue'].pct_change(periods=365)

# Date filtering
df['2025-03']           # March 2025
df['2025-Q1']           # Q1 2025
df.between_time('09:00', '17:00')  # Time-of-day filtering

Q10. What is the difference between pivot_table and crosstab?

import pandas as pd

df = pd.DataFrame({
    'dept': ['Eng', 'Eng', 'HR', 'HR', 'Eng'],
    'level': ['Senior', 'Junior', 'Senior', 'Junior', 'Senior'],
    'salary': [120, 75, 95, 65, 115],
    'gender': ['F', 'M', 'F', 'F', 'M']
})

# pivot_table — aggregate data (like SQL GROUP BY + PIVOT)
pivot = pd.pivot_table(
    df,
    values='salary',
    index='dept',
    columns='level',
    aggfunc=['mean', 'count'],
    fill_value=0,
    margins=True          # Add row/column totals
)

# crosstab — frequency table (counts by default)
ct = pd.crosstab(df['dept'], df['gender'], margins=True)
# Can use values + aggfunc for non-count aggregations
ct_salary = pd.crosstab(df['dept'], df['level'], values=df['salary'], aggfunc='mean')

pivot_table is more flexible (any aggregation); crosstab is a convenience wrapper for frequency counts.

Q11. What are common Pandas pitfalls and how do you avoid them?

1. Chained indexing:

# BAD — unpredictable behavior (SettingWithCopyWarning)
df[df['age'] > 25]['name'] = 'Updated'

# GOOD — use .loc
df.loc[df['age'] > 25, 'name'] = 'Updated'

2. Modifying a copy vs. the original:

subset = df[df['dept'] == 'Eng']  # May be a view or copy!
subset = df[df['dept'] == 'Eng'].copy()  # Explicit copy — safe to modify

3. Object dtype for numeric data:

df = pd.read_csv('data.csv')
# "1,234" formatted numbers read as strings!
df['revenue'] = pd.to_numeric(df['revenue'].str.replace(',', ''), errors='coerce')

4. Date parsing:

# Let Pandas parse dates during read
df = pd.read_csv('data.csv', parse_dates=['date'], date_format='%Y-%m-%d')

5. Iterating with iterrows (very slow):

# BAD — 100× slower than vectorized
for idx, row in df.iterrows():
    df.at[idx, 'total'] = row['a'] + row['b']

# GOOD
df['total'] = df['a'] + df['b']