import string
import numpy as np
import pandas as pd
生成具有各种 dtypes 的样本 DF
df = pd.DataFrame({
'int32': np.random.randint(0, 10**6, 10),
'int64': np.random.randint(10**7, 10**9, 10).astype(np.int64)*10,
'float': np.random.rand(10),
'string': np.random.choice([c*10 for c in string.ascii_uppercase], 10),
})
In [71]: df
Out[71]:
float int32 int64 string
0 0.649978 848354 5269162190 DDDDDDDDDD
1 0.346963 490266 6897476700 OOOOOOOOOO
2 0.035069 756373 6711566750 ZZZZZZZZZZ
3 0.066692 957474 9085243570 FFFFFFFFFF
4 0.679182 665894 3750794810 MMMMMMMMMM
5 0.861914 630527 6567684430 TTTTTTTTTT
6 0.697691 825704 8005182860 FFFFFFFFFF
7 0.474501 942131 4099797720 QQQQQQQQQQ
8 0.645817 951055 8065980030 VVVVVVVVVV
9 0.083500 349709 7417288920 EEEEEEEEEE
制作更大的 DF(10 * 100.000 = 1.000.000 行)
df = pd.concat([df] * 10**5, ignore_index=True)
创建(或打开现有的)HDFStore 文件
store = pd.HDFStore('d:/temp/example.h5')
将我们的数据框保存到 h5
(HDFStore)文件中,索引[int32,int64,string]列:
store.append('store_key', df, data_columns=['int32','int64','string'])
显示 HDFStore 详细信息
In [78]: store.get_storer('store_key').table
Out[78]:
/store_key/table (Table(10,)) ''
description := {
"index": Int64Col(shape=(), dflt=0, pos=0),
"values_block_0": Float64Col(shape=(1,), dflt=0.0, pos=1),
"int32": Int32Col(shape=(), dflt=0, pos=2),
"int64": Int64Col(shape=(), dflt=0, pos=3),
"string": StringCol(itemsize=10, shape=(), dflt=b'', pos=4)}
byteorder := 'little'
chunkshape := (1724,)
autoindex := True
colindexes := {
"index": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"int32": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"string": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"int64": Index(6, medium, shuffle, zlib(1)).is_csi=False}
显示索引列
In [80]: store.get_storer('store_key').table.colindexes
Out[80]:
{
"int32": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"index": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"string": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"int64": Index(6, medium, shuffle, zlib(1)).is_csi=False}
关闭(刷新到磁盘)我们的商店文件
store.close()