提取简单的 CSS / XPath 路径其实很容易，下面代码得到的结果与 lxml 库给出的相同。
def get_element(node):
    """Return the CSS selector step that identifies *node* under its parent.

    The step is the bare tag name when *node* is the first element child,
    otherwise ``'<name>:nth-child(<k>)'`` where ``k`` is the 1-based position
    of *node* among its element siblings.

    Args:
        node: A tag-like object exposing ``name`` and ``previous_siblings``.

    Returns:
        The selector step as a string.
    """
    # CSS ``:nth-child`` counts element siblings only, so siblings without a
    # tag name (text nodes, comments) must not advance the position.
    # NOTE: an XPath positional predicate (``tag[k]``) would instead have to
    # count only siblings with the same tag name.
    position = 1 + sum(
        1
        for sibling in node.previous_siblings
        if getattr(sibling, 'name', None) is not None
    )
    if position > 1:
        return '%s:nth-child(%s)' % (node.name, position)
    return node.name
def get_css_path(node):
    """Build a CSS child-combinator path leading to *node*.

    Walks ``node.parents`` outward, collecting one selector step per
    ancestor via ``get_element``, and joins the steps with ``' > '`` from
    the outermost collected ancestor down to *node*.

    Args:
        node: A tag-like object exposing ``name``, ``parents`` and whatever
            ``get_element`` requires.

    Returns:
        The selector path as a string, e.g. ``'div:nth-child(2) > strong > i'``.
    """
    steps = [get_element(node)]
    for parent in node.parents:
        # Stop at <body>. Also stop at BeautifulSoup's synthetic root object
        # (named '[document]'), which is reached when the tree has no <body>
        # ancestor and is not a valid selector step.
        if parent.name in ('body', '[document]'):
            break
        steps.append(get_element(parent))
    # Steps were gathered innermost-first; selectors read outermost-first.
    steps.reverse()
    return ' > '.join(steps)
# Demo / sanity check: the <i> element sits inside the second top-level <div>.
markup = '<div></div><div><strong><i>bla</i></strong></div>'
soup = bs4.BeautifulSoup(markup)
expected_path = 'div:nth-child(2) > strong > i'
assert get_css_path(soup.i) == expected_path