分层数据：为每个节点有效地构建每个后代的列表

这是一种使用numpy一次遍历树的方法。

import numpy as np
import pandas as pd  # only used to return a dataframe


def list_ancestors(edges):
    """
    Take edge list of a rooted tree as a numpy array with shape (E, 2),
    child nodes in edges[:, 0], parent nodes in edges[:, 1]
    Return pandas dataframe of all descendant/ancestor node pairs

    Ex:
        df = pd.DataFrame({'child': [200, 201, 300, 301, 302, 400],
                           'parent': [100, 100, 200, 200, 201, 300]})

        df
           child  parent
        0    200     100
        1    201     100
        2    300     200
        3    301     200
        4    302     201
        5    400     300

        list_ancestors(df.values)

        returns

            descendant  ancestor
        0          200       100
        1          201       100
        2          300       200
        3          300       100
        4          301       200
        5          301       100
        6          302       201
        7          302       100
        8          400       300
        9          400       200
        10         400       100
    """
    ancestors = []
    for ar in trace_nodes(edges):
        ancestors.append(np.c_[np.repeat(ar[:, 0], ar.shape[1]-1),
                               ar[:, 1:].flatten()])
    return pd.DataFrame(np.concatenate(ancestors),
                        columns=['descendant', 'ancestor'])


def trace_nodes(edges):
    """
    Take edge list of a rooted tree as a numpy array with shape (E, 2),
    child nodes in edges[:, 0], parent nodes in edges[:, 1]
    Yield numpy array with cross-section of tree and associated
    ancestor nodes

    Ex:
        df = pd.DataFrame({'child': [200, 201, 300, 301, 302, 400],
                           'parent': [100, 100, 200, 200, 201, 300]})

        df
           child  parent
        0    200     100
        1    201     100
        2    300     200
        3    301     200
        4    302     201
        5    400     300

        trace_nodes(df.values)

        yields

        array([[200, 100],
               [201, 100]])

        array([[300, 200, 100],
               [301, 200, 100],
               [302, 201, 100]])

        array([[400, 300, 200, 100]])
    """
    mask = np.in1d(edges[:, 1], edges[:, 0])
    gen_branches = edges[~mask]
    edges = edges[mask]
    yield gen_branches
    while edges.size != 0:
        mask = np.in1d(edges[:, 1], edges[:, 0])
        next_gen = edges[~mask]
        gen_branches = numpy_col_inner_many_to_one_join(next_gen, gen_branches)
        edges = edges[mask]
        yield gen_branches


def numpy_col_inner_many_to_one_join(ar1, ar2):
    """
    Take two 2-d numpy arrays ar1 and ar2,
    with no duplicate values in first column of ar2
    Return inner join of ar1 and ar2 on
    last column of ar1, first column of ar2

    Ex:

        ar1 = np.array([[1,  2,  3],
                        [4,  5,  3],
                        [6,  7,  8],
                        [9, 10, 11]])

        ar2 = np.array([[ 1,  2],
                        [ 3,  4],
                        [ 5,  6],
                        [ 7,  8],
                        [ 9, 10],
                        [11, 12]])

        numpy_col_inner_many_to_one_join(ar1, ar2)

        returns

        array([[ 1,  2,  3,  4],
               [ 4,  5,  3,  4],
               [ 9, 10, 11, 12]])
    """
    ar1 = ar1[np.in1d(ar1[:, -1], ar2[:, 0])]
    ar2 = ar2[np.in1d(ar2[:, 0], ar1[:, -1])]
    if 'int' in ar1.dtype.name and ar1[:, -1].min() >= 0:
        bins = np.bincount(ar1[:, -1])
        counts = bins[bins.nonzero()[0]]
    else:
        counts = np.unique(ar1[:, -1], False, False, True)[1]
    left = ar1[ar1[:, -1].argsort()]
    right = ar2[ar2[:, 0].argsort()]
    return np.concatenate([left[:, :-1],
                           right[np.repeat(np.arange(right.shape[0]),
                                           counts)]], 1)

@ taky2提供的测试用例1和2，测试用例3和4分别比较了高大和阔树结构的性能-大多数用例可能在中间。

df = pd.DataFrame(
    {
        'child': [3102, 2010, 3011, 3000, 3033, 2110, 3111, 2100],
        'parent': [2010, 1000, 2010, 2110, 2100, 1000, 2110, 1000]
    }
)

df2 = pd.DataFrame(
    {
        'child': [4321, 3102, 4023, 2010, 5321, 4200, 4113, 6525, 4010, 4001,
                  3011, 5010, 3000, 3033, 2110, 6100, 3111, 2100, 6016, 4311],
        'parent': [3111, 2010, 3000, 1000, 4023, 3011, 3033, 5010, 3011, 3102,
                   2010, 4023, 2110, 2100, 1000, 5010, 2110, 1000, 5010, 3033]
    }
)

df3 = pd.DataFrame(np.r_[np.c_[np.arange(1, 501), np.arange(500)],
                         np.c_[np.arange(501, 1001), np.arange(500)]],
                   columns=['child', 'parent'])

df4 = pd.DataFrame(np.r_[np.c_[np.arange(1, 101), np.repeat(0, 100)],
                         np.c_[np.arange(1001, 11001),
                               np.repeat(np.arange(1, 101), 100)]],
                   columns=['child', 'parent'])

%timeit get_ancestry_dataframe_flat(df)
10 loops, best of 3: 53.4 ms per loop

%timeit add_children_of_children(df)
1000 loops, best of 3: 1.13 ms per loop

%timeit all_descendants_nx(df)
1000 loops, best of 3: 675 µs per loop

%timeit list_ancestors(df.values)
1000 loops, best of 3: 391 µs per loop

%timeit get_ancestry_dataframe_flat(df2)
10 loops, best of 3: 168 ms per loop

%timeit add_children_of_children(df2)
1000 loops, best of 3: 1.8 ms per loop

%timeit all_descendants_nx(df2)
1000 loops, best of 3: 1.06 ms per loop

%timeit list_ancestors(df2.values)
1000 loops, best of 3: 933 µs per loop

%timeit add_children_of_children(df3)
10 loops, best of 3: 156 ms per loop

%timeit all_descendants_nx(df3)
1 loop, best of 3: 952 ms per loop

%timeit list_ancestors(df3.values)
10 loops, best of 3: 104 ms per loop

%timeit add_children_of_children(df4)
1 loop, best of 3: 503 ms per loop

%timeit all_descendants_nx(df4)
1 loop, best of 3: 238 ms per loop

%timeit list_ancestors(df4.values)
100 loops, best of 3: 2.96 ms per loop

get_ancestry_dataframe_flat 由于时间和记忆的原因，未对情况3和4进行计时。

add_children_of_children修改为在内部标识根节点，但允许采用唯一的根。root_node = (set(dataframe.parent) - set(dataframe.child)).pop()添加第一行。

all_descendants_nx 修改为接受数据框作为参数，而不是从外部名称空间提取。

np.all(get_ancestry_dataframe_flat(df2).sort_values(['descendant', 'ancestor'])\
                                       .reset_index(drop=True) ==\
       list_ancestors(df2.values).sort_values(['descendant', 'ancestor'])\
                                 .reset_index(drop=True))
Out[20]: True

其他 2022/1/1 18:33:14 有470人围观

撰写回答

你尚未登录，登录后可以

和开发者交流问题的细节

关注并接收问题和回答的更新提醒

参与内容的编辑和改进，让解决方法与时俱进

请先登录

分层数据：为每个节点有效地构建每个后代的列表

撰写回答

推荐问题

分层ldd（1）

IE7 Z索引分层问题

具有分层类别，子类别的Elasticsearch聚合；限制水平

基于SpringBoot +方法的分层角色安全性：ServletContext是必需的

MySql中的分层数据

Elasticsearch分层排序

MySQL Closure Table分层数据库-如何以正确的顺序提取信息

在闭包表分层数据结构中对子树进行排序

Java Jar文件：使用资源错误：URI不分层

如何将平面数据结构显示为分层数据结构（Java）？

熊猫分层抽样

如何创建MySQL分层递归查询？

分层数据：为每个节点有效地构建每个后代的列表

Altair：无法对分层图进行分面

在SQL Server中使用FOR XML PATH查询以获取XML输出以获取分层数据

除Oracle以外的RDBMS中的CONNECT BY或分层查询

Pandas的分层样品

为分层定义的数据集创建展平的表/视图

没有Oracle CONNECT BY语法的SQL中的分层控制范围报告？

具有字符串路径的分层数据-查询节点，获取其节点的所有父级和第一级

分类汇总

您的鼓励是对我最大的支持