Reading hepfiles#

Note: If you have not yet run through the write_hepfile tutorial, do that first to generate its output file. That output file is used as the input here!

Reading the Entire File#

[1]:
# import the load function
from hepfile import load

We begin with a file, and load it into an empty data dictionary:

[2]:
# path to the hepfile generated by the write_hepfile tutorial
infile = 'output_from_scratch.hdf5'
# load the whole file: `data` holds everything read from the file,
# `event` is an empty per-event dictionary (see the printed outputs below)
data, event = load(infile)

data is a dictionary containing counters, indices, and data for all the features we care about. event is an empty dictionary waiting to be filled by data from some new event.

[3]:
# dump the full dictionary: bookkeeping maps/lists (keys wrapped in underscores),
# the per-group counters and indices, and the dataset arrays themselves
print(data)
{'_MAP_DATASETS_TO_COUNTERS_': {'_SINGLETONS_GROUP_': '_SINGLETONS_GROUP_/COUNTER', 'jet': 'jet/njet', 'muons': 'muons/nmuon', 'jet/e': 'jet/njet', 'jet/px': 'jet/njet', 'jet/py': 'jet/njet', 'jet/pz': 'jet/njet', 'jet/algorithm': 'jet/njet', 'jet/words': 'jet/njet', 'muons/e': 'muons/nmuon', 'muons/px': 'muons/nmuon', 'muons/py': 'muons/nmuon', 'muons/pz': 'muons/nmuon', 'METpx': '_SINGLETONS_GROUP_/COUNTER', 'METpy': '_SINGLETONS_GROUP_/COUNTER'}, '_MAP_DATASETS_TO_INDEX_': {'_SINGLETONS_GROUP_': '_SINGLETONS_GROUP_/COUNTER_INDEX', 'jet': 'jet/njet_INDEX', 'muons': 'muons/nmuon_INDEX', 'jet/e': 'jet/njet_INDEX', 'jet/px': 'jet/njet_INDEX', 'jet/py': 'jet/njet_INDEX', 'jet/pz': 'jet/njet_INDEX', 'jet/algorithm': 'jet/njet_INDEX', 'jet/words': 'jet/njet_INDEX', 'muons/e': 'muons/nmuon_INDEX', 'muons/px': 'muons/nmuon_INDEX', 'muons/py': 'muons/nmuon_INDEX', 'muons/pz': 'muons/nmuon_INDEX', 'METpx': '_SINGLETONS_GROUP_/COUNTER_INDEX', 'METpy': '_SINGLETONS_GROUP_/COUNTER_INDEX'}, '_LIST_OF_COUNTERS_': ['_SINGLETONS_GROUP_/COUNTER', 'jet/njet', 'muons/nmuon'], '_LIST_OF_DATASETS_': ['METpx', 'METpy', '_SINGLETONS_GROUP_', '_SINGLETONS_GROUP_/COUNTER', 'jet', 'jet/algorithm', 'jet/e', 'jet/njet', 'jet/px', 'jet/py', 'jet/pz', 'jet/words', 'muons', 'muons/e', 'muons/nmuon', 'muons/px', 'muons/py', 'muons/pz'], '_META_': {}, '_NUMBER_OF_BUCKETS_': 10000, '_SINGLETONS_GROUP_': array(['METpx', 'METpy'], dtype='<U5'), '_SINGLETONS_GROUP_/COUNTER': array([1, 1, 1, ..., 1, 1, 1]), '_SINGLETONS_GROUP_/COUNTER_INDEX': array([   0,    1,    2, ..., 9997, 9998, 9999]), 'jet/njet': array([17, 17, 17, ..., 17, 17, 17]), 'jet/njet_INDEX': array([     0,     17,     34, ..., 169949, 169966, 169983]), 'muons/nmuon': array([0, 0, 0, ..., 0, 0, 0]), 'muons/nmuon_INDEX': array([0, 0, 0, ..., 0, 0, 0]), 'METpx': array([0.3857817 , 0.12295702, 0.86312515, ..., 0.03597181, 0.16764084,
       0.02867685], dtype=float32), 'METpy': array([0.21301576, 0.92661446, 0.17768492, ..., 0.05196636, 0.3594888 ,
       0.80965936], dtype=float32), 'jet/algorithm': array([-1,  0,  0, ...,  0,  0, -1]), 'jet/e': array([0.51882523, 0.06649859, 0.8128549 , ..., 0.27712795, 0.40421703,
       0.99385214], dtype=float32), 'jet/px': array([0.72576076, 0.16795638, 0.55153126, ..., 0.49512222, 0.76267385,
       0.88863903], dtype=float32), 'jet/py': array([0.13361722, 0.6626456 , 0.12324467, ..., 0.7238448 , 0.06445312,
       0.40084764], dtype=float32), 'jet/pz': array([0.4981057 , 0.4544982 , 0.05688357, ..., 0.48145318, 0.6470511 ,
       0.5942736 ], dtype=float32), 'jet/words': array([b'aloha', b'hi', b'ciao', ..., b'bye', b'bye', b'bye'],
      dtype=object), 'muons/e': array([], dtype=float32), 'muons/px': array([], dtype=float32), 'muons/py': array([], dtype=float32), 'muons/pz': array([], dtype=float32), '_GROUPS_': {'_SINGLETONS_GROUP_': ['METpx', 'METpy'], 'jet': ['algorithm', 'e', 'njet', 'px', 'py', 'pz', 'words'], 'muons': ['e', 'nmuon', 'px', 'py', 'pz']}, '_MAP_DATASETS_TO_DATA_TYPES_': {'METpx': dtype('float32'), 'METpy': dtype('float32'), '_SINGLETONS_GROUP_': dtype('<U5'), '_SINGLETONS_GROUP_/COUNTER': dtype('int64'), 'jet/algorithm': dtype('int64'), 'jet/e': dtype('float32'), 'jet/njet': dtype('int64'), 'jet/px': dtype('float32'), 'jet/py': dtype('float32'), 'jet/pz': dtype('float32'), 'jet/words': dtype('O'), 'muons/e': dtype('float32'), 'muons/nmuon': dtype('int64'), 'muons/px': dtype('float32'), 'muons/py': dtype('float32'), 'muons/pz': dtype('float32')}, '_PROTECTED_NAMES_': {'_SINGLETONS_GROUP_/COUNTER', '_PROTECTED_NAMES_', '_META_', '_SINGLETONSGROUPFORSTORAGE_', '_MAP_DATASETS_TO_DATA_TYPES_', '_GROUPS_', '_SINGLETONS_GROUP_', '_MAP_DATASETS_TO_COUNTERS_', '_LIST_OF_COUNTERS_', '_HEADER_'}}
[4]:
# every dataset and counter key is present, but each value is still None
print(event)
{'METpx': None, 'METpy': None, '_SINGLETONS_GROUP_/COUNTER': None, 'jet/algorithm': None, 'jet/e': None, 'jet/njet': None, 'jet/px': None, 'jet/py': None, 'jet/pz': None, 'jet/words': None, 'muons/e': None, 'muons/nmuon': None, 'muons/px': None, 'muons/py': None, 'muons/pz': None}

Reading Part of a File#

If you only want to read part of a file, you can load only certain groups. This is especially useful for very large datasets.

To do this, you can use the desired_groups and subset arguments to load:

[5]:
# load only the 'jet' group, restricted to the subset (5, 10);
# the awkward example further down shows this selects 5 events (events 5 through 9)
data,event = load(infile,desired_groups=['jet'],subset=(5,10))
[6]:
# inspect what was loaded: of the dataset arrays only the jet/* ones are present,
# while the counters and bookkeeping keys are still there
print(data.keys())
dict_keys(['_MAP_DATASETS_TO_COUNTERS_', '_MAP_DATASETS_TO_INDEX_', '_LIST_OF_COUNTERS_', '_LIST_OF_DATASETS_', '_META_', '_NUMBER_OF_BUCKETS_', '_SINGLETONS_GROUP_', '_SINGLETONS_GROUP_/COUNTER', '_SINGLETONS_GROUP_/COUNTER_INDEX', 'jet/njet', 'jet/njet_INDEX', 'muons/nmuon', 'muons/nmuon_INDEX', 'jet/algorithm', 'jet/e', 'jet/px', 'jet/py', 'jet/pz', 'jet/words', '_GROUPS_', '_MAP_DATASETS_TO_DATA_TYPES_', '_PROTECTED_NAMES_'])

Reading into Awkward Arrays#

Awkward arrays are a very fast datatype for heterogeneous datasets. It is relatively easy to read hepfiles into them: all you need to do is pass return_type='awkward' to load. Note: the event return value will still just be a simple dictionary.

[7]:
# same file, but ask load for an awkward array instead of a dictionary;
# `event` is still returned as a plain dictionary
data,event = load(infile, return_type='awkward')
[8]:
# Each .show() prints a truncated preview; the blank print() calls just
# separate the three previews in the output.
data.show() # display a preview of every event record
print()
data['jet'].show() # display just the jet group (item-style access)
print()
data.jet.px.show() # display the px field of the jet group (attribute-style access)
[{METpx: 0.386, METpy: 0.213, jet: {...}, muons: {...}},
 {METpx: 0.123, METpy: 0.927, jet: {...}, muons: {...}},
 {METpx: 0.863, METpy: 0.178, jet: {...}, muons: {...}},
 {METpx: 0.0628, METpy: 0.754, jet: {...}, muons: {...}},
 {METpx: 0.161, METpy: 0.408, jet: {...}, muons: {...}},
 {METpx: 0.217, METpy: 0.853, jet: {...}, muons: {...}},
 {METpx: 0.539, METpy: 0.761, jet: {...}, muons: {...}},
 {METpx: 0.631, METpy: 0.723, jet: {...}, muons: {...}},
 {METpx: 0.376, METpy: 0.846, jet: {...}, muons: {...}},
 {METpx: 0.091, METpy: 0.517, jet: {...}, muons: {...}},
 ...,
 {METpx: 0.0554, METpy: 0.0152, jet: {...}, muons: {...}},
 {METpx: 0.404, METpy: 0.41, jet: {...}, muons: {e: [], ...}},
 {METpx: 0.825, METpy: 0.688, jet: {...}, muons: {...}},
 {METpx: 0.392, METpy: 0.177, jet: {...}, muons: {...}},
 {METpx: 0.187, METpy: 0.311, jet: {...}, muons: {...}},
 {METpx: 0.873, METpy: 0.356, jet: {...}, muons: {...}},
 {METpx: 0.036, METpy: 0.052, jet: {...}, muons: {...}},
 {METpx: 0.168, METpy: 0.359, jet: {...}, muons: {...}},
 {METpx: 0.0287, METpy: 0.81, jet: {...}, muons: {...}}]

[{algorithm: [-1, 0, 0, -1, ..., 0, -1, 0], e: [0.519, ...], px: [...], ...},
 {algorithm: [-1, 0, -1, -1, ..., -1, 0, 0], e: [0.214, ...], px: [...], ...},
 {algorithm: [0, -1, 0, -1, ..., 0, 0, -1], e: [0.0682, ...], px: [...], ...},
 {algorithm: [0, 0, 0, 0, ..., -1, -1, 0], e: [0.518, ...], px: [...], ...},
 {algorithm: [-1, -1, 0, 0, ..., 0, 0, 0], e: [0.0882, ...], px: [...], ...},
 {algorithm: [-1, 0, -1, 0, ..., 0, -1, 0], e: [0.667, ...], px: [...], ...},
 {algorithm: [0, -1, 0, 0, ..., 0, 0, 0], e: [0.75, ...], px: [...], ...},
 {algorithm: [0, 0, 0, 0, ..., 0, -1, 0], e: [0.564, ...], px: [...], ...},
 {algorithm: [0, -1, 0, -1, ..., -1, 0, 0], e: [0.344, ...], px: [...], ...},
 {algorithm: [0, 0, 0, 0, ..., -1, -1, 0], e: [0.289, ...], px: [...], ...},
 ...,
 {algorithm: [0, -1, -1, 0, ..., 0, -1, 0], e: [0.132, ...], px: [...], ...},
 {algorithm: [-1, -1, -1, 0, ..., 0, -1, 0], e: [0.951, ...], px: [...], ...},
 {algorithm: [-1, 0, -1, 0, ..., 0, -1, 0], e: [0.124, ...], px: [...], ...},
 {algorithm: [0, 0, 0, 0, ..., -1, 0, -1], e: [0.935, ...], px: [...], ...},
 {algorithm: [-1, 0, -1, 0, ..., 0, 0, -1], e: [0.406, ...], px: [...], ...},
 {algorithm: [-1, 0, 0, 0, ..., 0, -1, 0], e: [0.321, ...], px: [...], ...},
 {algorithm: [-1, -1, -1, 0, ..., -1, 0, 0], e: [0.946, ...], px: [...], ...},
 {algorithm: [0, -1, -1, -1, ..., -1, 0, 0], e: [0.766, ...], px: [...], ...},
 {algorithm: [0, -1, 0, -1, ..., 0, 0, -1], e: [0.453, ...], px: [...], ...}]

[[0.726, 0.168, 0.552, 0.459, 0.521, ..., 0.4, 0.597, 0.735, 0.908, 0.135],
 [0.582, 0.0818, 0.678, 0.416, 0.555, ..., 0.57, 0.233, 0.202, 0.543, 0.586],
 [0.224, 0.952, 0.747, 0.64, 0.0682, ..., 0.648, 0.796, 0.483, 0.546, 0.558],
 [0.939, 0.043, 0.0537, 0.326, 0.188, ..., 0.433, 0.829, 0.315, 0.217, 0.967],
 [0.0541, 0.392, 0.289, 0.738, 0.00133, ..., 0.699, 0.154, 0.552, 0.984, 0.798],
 [0.891, 0.684, 0.719, 1, 0.843, 0.973, ..., 0.74, 0.663, 0.204, 0.787, 0.666],
 [0.867, 0.442, 0.319, 0.476, 0.162, ..., 0.285, 0.848, 0.907, 0.99, 0.627],
 [0.47, 0.867, 0.454, 0.656, 0.66, ..., 0.381, 0.67, 0.232, 0.814, 0.883],
 [0.769, 0.144, 0.452, 0.582, 0.921, ..., 0.513, 0.978, 0.605, 0.103, 0.788],
 [0.584, 0.447, 0.955, 0.419, 0.613, ..., 0.688, 0.236, 0.206, 0.87, 0.596],
 ...,
 [0.61, 0.415, 0.617, 0.395, 0.282, ..., 0.282, 0.0597, 0.916, 0.809, 0.181],
 [0.0727, 0.447, 0.205, 0.481, 0.0703, ..., 0.735, 0.507, 0.891, 0.665, 0.753],
 [0.932, 0.0572, 0.802, 0.535, 0.87, ..., 0.682, 0.424, 0.708, 0.168, 0.122],
 [0.184, 0.374, 0.975, 0.0478, 0.616, ..., 0.55, 0.737, 0.537, 0.585, 0.989],
 [0.924, 0.544, 0.229, 0.996, 0.531, ..., 0.917, 0.961, 0.642, 0.081, 0.0539],
 [0.352, 0.569, 0.966, 0.96, 0.986, ..., 0.673, 0.964, 0.839, 0.187, 0.715],
 [0.732, 0.823, 0.751, 0.523, 0.934, ..., 0.339, 0.706, 0.499, 0.0917, 0.904],
 [0.605, 0.777, 0.791, 0.568, 0.073, ..., 0.0806, 0.614, 0.808, 0.152, 0.00686],
 [0.833, 0.473, 0.0947, 0.419, 0.0112, ..., 0.863, 0.846, 0.495, 0.763, 0.889]]
[9]:
# the event return is unchanged by return_type: a plain dict of None values
event
[9]:
{'METpx': None,
 'METpy': None,
 '_SINGLETONS_GROUP_/COUNTER': None,
 'jet/algorithm': None,
 'jet/e': None,
 'jet/njet': None,
 'jet/px': None,
 'jet/py': None,
 'jet/pz': None,
 'jet/words': None,
 'muons/e': None,
 'muons/nmuon': None,
 'muons/px': None,
 'muons/py': None,
 'muons/pz': None}

With the return_type='awkward' flag, you can still select a subset of the data in the same way!

[10]:
# combine the awkward return type with a partial read:
# only the 'jet' group, and only the subset (5, 10)
data,event = load(infile, return_type='awkward', desired_groups=['jet'], subset=(5,10))
[11]:
# Same three previews as for the full file; now each record holds only the
# jet group and there are just five records.
data.show() # display a preview of every event record
print()
data['jet'].show() # display just the jet group (item-style access)
print()
data.jet.px.show() # display the px field of the jet group (attribute-style access)
[{jet: {algorithm: [-1, 0, -1, ..., -1, 0], e: [...], px: [...], ...}},
 {jet: {algorithm: [0, -1, 0, ..., 0, 0, 0], e: [...], px: [...], ...}},
 {jet: {algorithm: [0, 0, 0, ..., 0, -1, 0], e: [...], px: [...], ...}},
 {jet: {algorithm: [0, -1, 0, ..., 0, 0], e: [...], px: [...], ...}},
 {jet: {algorithm: [0, 0, 0, ..., -1, 0], e: [...], px: [...], ...}}]

[{algorithm: [-1, 0, -1, 0, ..., 0, -1, 0], e: [0.667, ...], px: [...], ...},
 {algorithm: [0, -1, 0, 0, ..., 0, 0, 0], e: [0.75, ...], px: [...], ...},
 {algorithm: [0, 0, 0, 0, ..., 0, -1, 0], e: [0.564, ...], px: [...], ...},
 {algorithm: [0, -1, 0, -1, ..., -1, 0, 0], e: [0.344, ...], px: [...], ...},
 {algorithm: [0, 0, 0, 0, ..., -1, -1, 0], e: [0.289, ...], px: [...], ...}]

[[0.891, 0.684, 0.719, 1, 0.843, 0.973, ..., 0.74, 0.663, 0.204, 0.787, 0.666],
 [0.867, 0.442, 0.319, 0.476, 0.162, ..., 0.285, 0.848, 0.907, 0.99, 0.627],
 [0.47, 0.867, 0.454, 0.656, 0.66, ..., 0.381, 0.67, 0.232, 0.814, 0.883],
 [0.769, 0.144, 0.452, 0.582, 0.921, ..., 0.513, 0.978, 0.605, 0.103, 0.788],
 [0.584, 0.447, 0.955, 0.419, 0.613, ..., 0.688, 0.236, 0.206, 0.87, 0.596]]
[12]:
# with desired_groups=['jet'], the event dictionary only carries the jet keys
event
[12]:
{'jet/algorithm': None,
 'jet/e': None,
 'jet/njet': None,
 'jet/px': None,
 'jet/py': None,
 'jet/pz': None,
 'jet/words': None}

Reading into a Dictionary of Pandas DataFrames#

To read into a dictionary of pandas dataframes, where each dataframe holds the data for a different group, all we need to do is pass return_type='pandas' to load.

[13]:
# ask load for a dictionary of pandas DataFrames, one DataFrame per group
data, event = load(infile, return_type='pandas')
[14]:
# the dictionary keys are the group names
print(f'Group Names: {data.keys()}')
Group Names: dict_keys(['_SINGLETONS_GROUP_', 'jet', 'muons'])
[15]:
print('jet information:')
# one row per jet; the event_num column records which event each jet belongs to
data['jet']
jet information:
[15]:
algorithm e px py pz words event_num
0 -1 0.518825 0.725761 0.133617 0.498106 b'aloha' 0
1 0 0.066499 0.167956 0.662646 0.454498 b'hi' 0
2 0 0.812855 0.551531 0.123245 0.056884 b'ciao' 0
3 -1 0.292169 0.459002 0.758781 0.953022 b'ciao' 0
4 0 0.512365 0.520725 0.240334 0.485343 b'bye' 0
... ... ... ... ... ... ... ...
169995 0 0.625886 0.863305 0.751234 0.550784 b'aloha' 9999
169996 0 0.069708 0.845667 0.879986 0.359886 b'bye' 9999
169997 0 0.277128 0.495122 0.723845 0.481453 b'bye' 9999
169998 0 0.404217 0.762674 0.064453 0.647051 b'bye' 9999
169999 -1 0.993852 0.888639 0.400848 0.594274 b'bye' 9999

170000 rows × 7 columns

Once again, we can use a subset of the data with specific groups. However, note how the event numbers get reset to 0-4 when we load a subset of 5 events (the 85 rows shown below). If this is a problem, you should look at converting the default output of load to a dictionary of pandas dataframes by hand using the hf.df_tools.hepfile_to_df function (where hf is the hepfile package, i.e. import hepfile as hf).

[16]:
# pandas return type combined with a partial read:
# only the 'jet' group, and only the subset (5, 10)
data,event = load(infile, return_type='pandas', desired_groups=['jet'], subset=(5,10))
[17]:
# note the event_num column: it is renumbered from 0 for the subset (values 0-4 here)
data['jet']
[17]:
algorithm e px py pz words event_num
0 -1 0.667102 0.891190 0.718513 0.768162 b'bye' 0
1 0 0.146582 0.683707 0.756508 0.472253 b'ciao' 0
2 -1 0.865275 0.718874 0.927169 0.794849 b'hi' 0
3 0 0.776318 0.999773 0.350176 0.440168 b'hi' 0
4 0 0.462614 0.843460 0.351398 0.929219 b'hi' 0
... ... ... ... ... ... ... ...
80 -1 0.544807 0.688257 0.273543 0.637789 b'ciao' 4
81 -1 0.239300 0.235639 0.579898 0.609811 b'bye' 4
82 -1 0.971387 0.206133 0.797268 0.155473 b'aloha' 4
83 -1 0.576240 0.870437 0.513300 0.039285 b'bye' 4
84 0 0.126184 0.596369 0.360929 0.931362 b'bye' 4

85 rows × 7 columns