首先,可以在读取 CSV 时通过 usecols 参数(例如 usecols=['id', 'col1'])只读取需要的列,并从中得到唯一常量值的列表。
然后分块读取 CSV:从每个块中筛选出 id 等于某个常量的行,用 concat 将这些子集拼接起来,再对结果执行 groupby。
如果使用 col1 列更合适,
就改为 constants = df['col1'].unique().tolist()。
这取决于您的数据。
或者,您也可以只读取一列:df = pd.read_csv(io.StringIO(temp), sep=",", usecols=['id']),
这同样取决于您的数据。
import pandas as pd
import numpy as np
import io
# Demo: aggregate a CSV that is processed in chunks, one `id` value at a time.
#
# Steps:
#   1. Read only the key columns to discover the distinct `id` values.
#   2. For each `id`, stream the CSV in chunks, keep only the matching rows,
#      concatenate them, and run the groupby/sum on that subset.

# Test data (an in-memory CSV standing in for a real file).
temp = u"""id,col1,col2,col3
1,13,15,14
1,13,15,14
1,12,15,13
2,18,15,13
2,18,15,13
2,18,15,13
2,18,15,13
2,18,15,13
2,18,15,13
3,14,15,13
3,14,15,13
3,14,185,213"""

# Only the two key columns are parsed here; col2/col3 are skipped.
df = pd.read_csv(io.StringIO(temp), sep=",", usecols=['id', 'col1'])
# Drop duplicates; from the output you can choose the constants.
df = df.drop_duplicates()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df)
#   id  col1
#0   1    13
#2   1    12
#3   2    18
#9   3    14

# For example, a hand-written list of constants ...
constants = [1, 2, 3]
# ... or the unique values of column id converted to a list.
constants = df['id'].unique().tolist()
print(constants)
#[1, 2, 3]
# (the original Python 2 run displayed these as [1L, 2L, 3L])

for i in constants:
    # The chunk reader is consumed by one pass, so it is re-created for
    # every constant.
    iter_csv = pd.read_csv(io.StringIO(temp), delimiter=",", chunksize=10)
    # Concat the subset of rows with id == constant.
    # NOTE: this rebinds the module-level `df` to the current subset.
    df = pd.concat([chunk[chunk['id'] == i] for chunk in iter_csv])
    # Your groupby function.
    data = df.reset_index(drop=True).groupby(["id", "col1"], as_index=False).sum()
    print(data.to_csv(index=False))
#id,col1,col2,col3
#1,12,15,13
#1,13,30,28
#
#id,col1,col2,col3
#2,18,90,78
#
#id,col1,col2,col3
#3,14,215,239