Cython函数比纯Python需要更多时间

使用Cython，CodeSurgeon已经给出了很好的答案。在这个答案中，我不想展示使用Numba的另一种方法。

我创建了三个版本。在naive_numba我只添加了一个功能装饰。在improved_Numba我手动组合的循环中（每个矢量化命令实际上都是一个循环）。在improved_Numba_p我已经并行化了功能。请注意，使用并行加速器时，显然存在一个错误，不允许定义常量值。还应注意，并行化版本仅对较大的输入阵列有利。但是，您还可以添加一个小的包装器，该包装器根据输入数组的大小调用单线程或并行化版本。

import numba as nb
import numpy as np
import time



@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

  delta = u-DustJ
  result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

  x= u/IceI;
  result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

  return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

#error_model='numpy' sets divison by 0 to NaN instead of throwing a exception, this allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

#there is obvIoUsly a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
  res=np.empty((u.shape[0]),dtype=u.dtype)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

u=np.array(np.random.rand(1000000),dtype=np.float32)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH) 
for i in range(1000):
  res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)

print(time.time()-t1)
print(time.time()-t1)

Arraysize np.random.rand(100)
Numpy             46.8µs
naive Numba       3.1µs
improved Numba:   1.62µs
improved_Numba_p: 17.45µs


#Arraysize np.random.rand(1000000)
Numpy             255.8ms
naive Numba       18.6ms
improved Numba:   6.13ms
improved_Numba_p: 3.54ms

如果np.float32足够，则必须在函数中显式声明所有常量值给float32。否则，Numba将使用float64。

@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  res=np.empty((u.shape[0]),dtype=u.dtype)
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

Arraysize np.random.rand(100).astype(np.float32)
Numpy             29.3µs
improved Numba:   1.33µs
improved_Numba_p: 18µs


Arraysize np.random.rand(1000000).astype(np.float32)
Numpy             117ms
improved Numba:   2.46ms
improved_Numba_p: 1.56ms

与@CodeSurgeon提供的Cython版本的比较并不十分公平，因为他没有使用启用的AVX2和FMA3指令编译该功能。Numba默认使用-march = native进行编译，这会在我的Core i7-4xxx上启用AVX2和FMA3指令。

但是，如果您不希望分发已编译的Cython版本的代码，就会产生这种感觉，因为如果启用了该优化功能，默认情况下，它将不会在Haswell之前的处理器（或所有Pentium和Celerons）上运行。应该可以编译多个代码路径，但这是编译器的依赖和更多的工作。

python 2022/1/1 18:34:09 有229人围观

撰写回答

你尚未登录，登录后可以

和开发者交流问题的细节

关注并接收问题和回答的更新提醒

参与内容的编辑和改进，让解决方法与时俱进

请先登录

Cython函数比纯Python需要更多时间

撰写回答

推荐问题

JavaScriptWays to circumvent the same-origin policy

Rails Redis设置maxmemory和maxmemory-policy

Webpack开发服务器React Content Security Policy错误

如何在另一个recyclerView中添加一个recyclerView

使用BouncyCastle从文件中读取椭圆曲线私钥

UnsatisfiedDependencyException：创建bean时出错（通过BeanNotOfRequiredTypeException）

Spring Boot Core依赖项被maven-dependency-plugin视为未使用

如何从SQL Server内存中清除SqlDependency？

如果没有什么过期日期，Redis可以将maxmemory-policy volatile-lru逐出吗？

Android recyclerview滚动到顶部

如何在log4j2中创建自定义RewritePolicy？

在fancybox中打开html页面

Java 为什么RecyclerView没有onItemClickListener（）？

使用cypress选择下拉列表项

Flutter中RecyclerView的替代品是什么？

如何将fancybox绑定到动态添加的元素？

安装elastic-search-curator时的Python pip包RequestsDependencyWarning

使用Cygwin从Windows运行fork（2）。可能？

无法使用KeycloakAutoConfiguration使用Spring Boot Security UnsatisfiedDependencyException设置Keycloak

fancybox2 / fancybox导致页面跳到顶部

分类汇总

您的鼓励是对我最大的支持