这是一个脚本,可以在几秒钟内(在我的机器上)解析一百万个<instrumentConfiguration/>
元素(967MB
文件),40
而不会占??用大量内存。
吞吐量为24MB/s
。该cElementTree page (2005)
报告47MB/s
。
#!/usr/bin/env python
from itertools import imap, islice, izip
from operator import itemgetter
from xml.etree import cElementTree as etree
def parsexml(filename):
it = imap(itemgetter(1),
iter(etree.iterparse(filename, events=('start',))))
root = next(it) # get root element
for elem in it:
if elem.tag == '{http://psi.hupo.org/ms/mzml}instrumentConfiguration':
values = [('Id', elem.get('id')),
('Parameter1', next(it).get('name'))] # cvParam
componentList_count = int(next(it).get('count'))
for parent, child in islice(izip(it, it), componentList_count):
key = parent.tag.partition('}')[2]
value = child.get('name')
assert child.tag.endswith('cvParam')
values.append((key, value))
yield values
root.clear() # preserve memory
def print_values(it):
for line in (': '.join(val) for conf in it for val in conf):
print(line)
print_values(parsexml(filename))