Pandas 实用手册

常见的 pandas 用法模式及其在 DataStore 中的对应写法。大多数 pandas 代码无需修改即可在 DataStore 中直接运行！

数据加载

读取 CSV 文件

# Pandas
import pandas as pd
df = pd.read_csv("data.csv")

# DataStore - same!
from chdb import datastore as pd
df = pd.read_csv("data.csv")

读取多个文件

# Pandas
import glob
dfs = [pd.read_csv(f) for f in glob.glob("data/*.csv")]
df = pd.concat(dfs)

# DataStore - more efficient with glob pattern
df = pd.read_csv("data/*.csv")

筛选

单个条件

# Pandas and DataStore - identical
df[df['age'] > 25]
df[df['city'] == 'NYC']
df[df['name'].str.contains('John')]

多个条件

# AND
df[(df['age'] > 25) & (df['city'] == 'NYC')]

# OR
df[(df['age'] < 18) | (df['age'] > 65)]

# NOT
df[~(df['status'] == 'inactive')]

使用 query() 方法

# Pandas and DataStore - identical
df.query('age > 25 and city == "NYC"')
df.query('salary > 50000')

isin()

# Pandas and DataStore - identical
df[df['city'].isin(['NYC', 'LA', 'SF'])]

between() 方法

# Pandas and DataStore - identical
df[df['age'].between(18, 65)]

选择列

单列

# Pandas and DataStore - identical
df['name']
df.name  # attribute access

多列

# Pandas and DataStore - identical
df[['name', 'age', 'city']]

选择与过滤

# Pandas and DataStore - identical
df[df['age'] > 25][['name', 'salary']]

# DataStore also supports SQL-style
df.filter(df['age'] > 25).select('name', 'salary')

排序

单列

# Pandas and DataStore - identical
df.sort_values('salary')
df.sort_values('salary', ascending=False)

多列

# Pandas and DataStore - identical
df.sort_values(['city', 'salary'], ascending=[True, False])

获取最大/最小的 N 条记录

# Pandas and DataStore - identical
df.nlargest(10, 'salary')
df.nsmallest(5, 'age')

GroupBy 和聚合

简单的 GroupBy

# Pandas and DataStore - identical
df.groupby('city')['salary'].mean()
df.groupby('city')['salary'].sum()
df.groupby('city').size()  # count

多种聚合

# Pandas and DataStore - identical
df.groupby('city')['salary'].agg(['sum', 'mean', 'count'])

df.groupby('city').agg({
    'salary': ['sum', 'mean'],
    'age': ['min', 'max']
})

命名聚合

# Pandas and DataStore - identical
df.groupby('city').agg(
    total_salary=('salary', 'sum'),
    avg_salary=('salary', 'mean'),
    employee_count=('id', 'count')
)

多个分组键

# Pandas and DataStore - identical
df.groupby(['city', 'department'])['salary'].mean()

数据连接

内连接

# Pandas
pd.merge(df1, df2, on='id')

# DataStore - same API
pd.merge(df1, df2, on='id')

# DataStore also supports
df1.join(df2, on='id')

左连接

# Pandas and DataStore - identical
pd.merge(df1, df2, on='id', how='left')

按不同列进行连接

# Pandas and DataStore - identical
pd.merge(df1, df2, left_on='emp_id', right_on='id')

拼接（concat）

# Pandas and DataStore - identical
pd.concat([df1, df2, df3])
pd.concat([df1, df2], axis=1)

字符串操作

大小写转换

# Pandas and DataStore - identical
df['name'].str.upper()
df['name'].str.lower()
df['name'].str.title()

子字符串

# Pandas and DataStore - identical
df['name'].str[:3]        # First 3 characters
df['name'].str.slice(0, 3)

查找

# Pandas and DataStore - identical
df['name'].str.contains('John')
df['name'].str.startswith('A')
df['name'].str.endswith('son')

替换

# Pandas and DataStore - identical
df['text'].str.replace('old', 'new')
df['text'].str.replace(r'\d+', '', regex=True)  # Remove digits

分割

# Pandas and DataStore - identical
df['name'].str.split(' ')
df['name'].str.split(' ', expand=True)

长度

# Pandas and DataStore - identical
df['name'].str.len()

日期时间操作

提取各个组件

# Pandas and DataStore - identical
df['date'].dt.year
df['date'].dt.month
df['date'].dt.day
df['date'].dt.dayofweek
df['date'].dt.hour

格式设置

# Pandas and DataStore - identical
df['date'].dt.strftime('%Y-%m-%d')

缺失数据

检查缺失值

# Pandas and DataStore - identical
df['col'].isna()
df['col'].notna()
df.isna().sum()

丢弃缺失值

# Pandas and DataStore - identical
df.dropna()
df.dropna(subset=['col1', 'col2'])

填充缺失值

# Pandas and DataStore - identical
df.fillna(0)
df.fillna({'col1': 0, 'col2': 'Unknown'})
df.fillna(method='ffill')

创建新列

简单赋值操作

# Pandas and DataStore - identical
df['total'] = df['price'] * df['quantity']
df['age_group'] = df['age'] // 10 * 10

使用 assign() 方法

# Pandas and DataStore - identical
df = df.assign(
    total=df['price'] * df['quantity'],
    is_adult=df['age'] >= 18
)

条件赋值（where/mask）

# Pandas and DataStore - identical
df['status'] = df['age'].where(df['age'] >= 18, 'minor')

使用 apply() 编写自定义逻辑

# Works, but triggers pandas execution
df['category'] = df['amount'].apply(lambda x: 'high' if x > 1000 else 'low')

# DataStore alternative (stays lazy)
df['category'] = (
    df.when(df['amount'] > 1000, 'high')
      .otherwise('low')
)

数据重塑

数据透视表

# Pandas and DataStore - identical
df.pivot_table(
    values='amount',
    index='region',
    columns='product',
    aggfunc='sum'
)

Melt（逆透视）

# Pandas and DataStore - identical
df.melt(
    id_vars=['name'],
    value_vars=['score1', 'score2', 'score3'],
    var_name='test',
    value_name='score'
)

Explode（展开）

# Pandas and DataStore - identical
df.explode('tags')  # Expand array column

窗口函数

滚动窗口

# Pandas and DataStore - identical
df['rolling_avg'] = df['price'].rolling(window=7).mean()
df['rolling_sum'] = df['amount'].rolling(window=30).sum()

扩张窗口

# Pandas and DataStore - identical
df['cumsum'] = df['amount'].expanding().sum()
df['cummax'] = df['amount'].expanding().max()

Shift（位移）

# Pandas and DataStore - identical
df['prev_value'] = df['value'].shift(1)   # Lag
df['next_value'] = df['value'].shift(-1)  # Lead

差分

# Pandas and DataStore - identical
df['change'] = df['value'].diff()
df['pct_change'] = df['value'].pct_change()

输出

输出为 CSV

# Pandas and DataStore - identical
df.to_csv("output.csv", index=False)

输出为 Parquet

# Pandas and DataStore - identical
df.to_parquet("output.parquet")

转换为 pandas DataFrame

# DataStore specific
pandas_df = ds.to_df()
pandas_df = ds.to_pandas()

DataStore 附加功能

查看 SQL 语句

# DataStore only
print(ds.to_sql())

查看执行计划

# DataStore only
ds.explain()

ClickHouse 函数

# DataStore only - extra accessors
df['domain'] = df['url'].url.domain()
df['json_value'] = df['data'].json.get_string('key')
df['ip_valid'] = df['ip'].ip.is_ipv4_string()

通用 URI

# DataStore only - read from anywhere
ds = DataStore.uri("s3://bucket/data.parquet")
ds = DataStore.uri("mysql://user:pass@host/db/table")

数据加载​

读取 CSV 文件​

读取多个文件​

筛选​

单个条件​

多个条件​

使用 query() 方法​

isin()​

between() 方法​

选择列​

单列​

多列​

选择与过滤​

排序​

单列​

多列​

获取前/后 N 个​

GroupBy 和聚合​

简单的 GroupBy​

多种聚合​

命名聚合​

多个分组键​

数据连接​

内连接​

左连接​

按不同列进行连接​

连接​

字符串操作​

大小写转换​

子字符串​

查找​

替换​

分割​

长度​

日期时间操作​

提取各个组件​

格式设置​

缺失数据​

检查缺失值​

丢弃缺失值​

填充缺失值​

创建新列​

简单赋值操作​

使用 assign() 方法​

条件筛选（where/mask）​

使用 apply() 编写自定义逻辑​

数据重塑​

数据透视表​

Melt（逆透视）​

Explode（展开）​

窗口函数​

滚动窗口​

扩张窗口​

Shift（位移）​

差分​

输出​

输出为 CSV​

输出为 Parquet​

转换为 pandas DataFrame​

DataStore 附加功能​

查看 SQL 语句​

执行计划解析​

ClickHouse 函数​

通用 URI​

数据加载

读取 CSV 文件

读取多个文件

筛选

单个条件

多个条件

使用 query() 方法

isin()

between() 方法

选择列

单列

多列

选择与过滤

排序

单列

多列

获取前/后 N 个

GroupBy 和聚合

简单的 GroupBy

多种聚合

命名聚合

多个分组键

数据连接

内连接

左连接

按不同列进行连接

连接

字符串操作

大小写转换

子字符串

查找

替换

分割

长度

日期时间操作

提取各个组件

格式设置

缺失数据

检查缺失值

丢弃缺失值

填充缺失值

创建新列

简单赋值操作

使用 assign() 方法

条件筛选（where/mask）

使用 apply() 编写自定义逻辑

数据重塑

数据透视表

Melt（逆透视）

Explode（展开）

窗口函数

滚动窗口

扩张窗口

Shift（位移）

差分

输出

输出为 CSV

输出为 Parquet

转换为 pandas DataFrame

DataStore 附加功能

查看 SQL 语句

执行计划解析

ClickHouse 函数

通用 URI