Programming Language Interoperability (Interop)
Contents
Programming Language Interoperability (Interop)#
Python#
using PythonCall
@pyeval "3+3"
Python int: 6
np = pyimport("numpy")
Python module: <module 'numpy' from '/usr/local/lib/python3.9/dist-packages/numpy/__init__.py'>
np.linalg.eigvals(np.random.rand(5, 5))
Python ndarray:
array([ 2.84550887+0.j , -0.41993296+0.34707055j,
-0.41993296-0.34707055j, 0.01357372+0.j ,
0.22480288+0.j ])
M = rand(5, 5)
np.linalg.eigvals(M)
Python ndarray:
array([ 2.80377766+0.j , 0.09378935+0.13467381j,
0.09378935-0.13467381j, -0.50506104+0.09779223j,
-0.50506104-0.09779223j])
@pyexec """
global sinpi, np
import numpy as np
def sinpi(x):
return np.sin(np.pi * x)
"""
py_sinpi(x) = pyconvert(Float64, @pyeval("sinpi")(x))
py_sinpi (generic function with 1 method)
py_sinpi(10)
-1.2246467991473533e-15
using BenchmarkTools
@btime py_sinpi(10);
@btime sinpi(10); # built-in Julia function
2.424 μs (3 allocations: 48 bytes)
1.695 ns (0 allocations: 0 bytes)
C#
c_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
double s = 0.0;
for (size_t i = 0; i < n; ++i) {
s += X[i];
}
return s;
}
""";
Compile to a shared library by piping c_code
to gcc:
using Libdl
const Clib = tempname() * "." * Libdl.dlext
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $Clib -`, "w") do f
print(f, c_code)
end
Clib
"/tmp/jl_2jmZCLKNKp.so"
Binding the function from the shared library:
c_sum(X::Array{Float64}) = @ccall Clib.c_sum(length(X)::Csize_t, X::Ptr{Float64})::Float64
c_sum (generic function with 1 method)
c_sum(rand(10))
4.8775442958142055
x = rand(10)
@btime c_sum($x);
7.047 ns (0 allocations: 0 bytes)
Mixing Julia, Python, and C#
Julia (real
), Python/numpy (py_sinpi
), C (c_sum
)
x = rand(10);
abs(py_sinpi(c_sum(x)))
0.8710573552611643
@btime abs(py_sinpi(c_sum($x)));
2.353 μs (3 allocations: 48 bytes)
See JuliaInterop for more, such as RCall.jl, JavaCall.jl, and MATLAB.jl.
Julia Microbenchmark: Summation#
Let’s look at and benchmark the sum function:
\[\mathrm{sum}(x) = \sum_{i=1}^n x_i\]
x = rand(10^7);
sum(x)
5.000210398449396e6
d = Dict() # to store the measurement results
Dict{Any, Any}()
Python#
using BenchmarkTools
using PythonCall
numpy#
np = pyimport("numpy")
Python module: <module 'numpy' from '/usr/local/lib/python3.9/dist-packages/numpy/__init__.py'>
numpy_sum = np.sum
Python function: <function sum at 0x7f7fe4408550>
b = @benchmark $numpy_sum($x)
BenchmarkTools.Trial: 747 samples with 1 evaluation.
Range (min … max): 6.490 ms … 9.029 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 6.623 ms ┊ GC (median): 0.00%
Time (mean ± σ): 6.682 ms ± 218.156 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▅█▃▃ ▁
▄▅▅████████▅▆▆▆▄▄▄▃▃▃▂▃▂▂▂▃▂▃▃▂▁▂▂▁▂▁▁▂▂▁▁▁▁▁▂▁▁▁▁▂▁▁▁▁▃▂▂▂ ▃
6.49 ms Histogram: frequency by time 7.52 ms <
Memory estimate: 928 bytes, allocs estimate: 23.
d["Python (numpy)"] = minimum(b.times) / 1e6
6.489832
hand-written#
@pyexec """
global mysum
def mysum(a):
s = 0.0
for x in a:
s = s + x
return s
"""
mysum_py = @pyeval("mysum")
Python function: <function mysum at 0x7f7f64bcaca0>
x_py = pylist(x);
b = @benchmark $mysum_py($x_py)
BenchmarkTools.Trial: 16 samples with 1 evaluation.
Range (min … max): 327.984 ms … 330.202 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 329.236 ms ┊ GC (median): 0.00%
Time (mean ± σ): 329.168 ms ± 585.419 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁ ▁ ▁▁ █ ▁ ▁ ▁ ▁▁ ▁ ▁ ▁ ▁ ▁
█▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁██▁▁▁▁█▁▁▁█▁▁▁▁█▁▁▁█▁██▁▁█▁▁█▁█▁▁▁▁▁▁█▁▁▁▁▁█ ▁
328 ms Histogram: frequency by time 330 ms <
Memory estimate: 16 bytes, allocs estimate: 1.
d["Python (hand-written)"] = minimum(b.times) / 1e6
327.983876
built-in#
# get the Python built-in "sum" function:
pysum = pybuiltins.sum
Python builtin_function_or_method: <built-in function sum>
b = @benchmark $pysum($x_py)
BenchmarkTools.Trial: 100 samples with 1 evaluation.
Range (min … max): 50.255 ms … 52.900 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 50.403 ms ┊ GC (median): 0.00%
Time (mean ± σ): 50.429 ms ± 261.018 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄▄ ▂ ▄ █ █ █▂▂▄▄▆ ▂
▄▄▁▁███▄█▄██▄███▆█▁▆███████▁█▆▁▄▄▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▄
50.3 ms Histogram: frequency by time 50.8 ms <
Memory estimate: 16 bytes, allocs estimate: 1.
d["Python (built-in)"] = minimum(b.times) / 1e6
50.255377
C#
hand-written#
c_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
double s = 0.0;
for (size_t i = 0; i < n; ++i) {
s += X[i];
}
return s;
}
""";
# compile to a shared library by piping C_code to gcc:
# (only works if you have gcc installed)
using Libdl
const Clib = tempname() * "." * Libdl.dlext
WARNING: redefinition of constant Clib. This may fail, cause incorrect answers, or produce other errors.
"/tmp/jl_etCdKLkncX.so"
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $Clib -`, "w") do f
print(f, c_code)
end
c_sum(X::Array{Float64}) = @ccall Clib.c_sum(length(X)::Csize_t, X::Ptr{Float64})::Float64
c_sum (generic function with 1 method)
c_sum(x) ≈ sum(x)
true
b = @benchmark c_sum($x)
BenchmarkTools.Trial: 354 samples with 1 evaluation.
Range (min … max): 14.021 ms … 14.806 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 14.086 ms ┊ GC (median): 0.00%
Time (mean ± σ): 14.134 ms ± 137.974 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▂▅▆▇▇▆▇█▇▄▁ ▁▂ ▁
██████████████▇▅▆▅▅▁▇▇▆▇▅▅▁▁▅▁▅▁▁▆▁▁▆▅▅▆▅▆▆▅▁▁▇▆▇▅▆▆█▁▁▁▅█▆▇ ▇
14 ms Histogram: log(frequency) by time 14.6 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
d["C"] = minimum(b.times) / 1e6
14.02061
hand-written (with -fast-math
)#
const Clib_fastmath = tempname() * "." * Libdl.dlext
# The same as above but with a -ffast-math flag added
open(`gcc -fPIC -O3 -msse3 -xc -shared -ffast-math -o $Clib_fastmath -`, "w") do f
print(f, c_code)
end
# define a Julia function that calls the C function:
c_sum_fastmath(X::Array{Float64}) = @ccall Clib_fastmath.c_sum(length(X)::Csize_t, X::Ptr{Float64})::Float64
c_sum_fastmath (generic function with 1 method)
b = @benchmark c_sum_fastmath($x)
BenchmarkTools.Trial: 589 samples with 1 evaluation.
Range (min … max): 8.342 ms … 9.601 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 8.490 ms ┊ GC (median): 0.00%
Time (mean ± σ): 8.492 ms ± 55.446 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▂▃▇▇█▅▇▅▁
▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▂▁▁▂▂▃▃▂▃▅▇█████████▇▅▃▂▁▁▁▁▁▁▂ ▃
8.34 ms Histogram: frequency by time 8.54 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
d["C (fastmath)"] = minimum(b.times) / 1e6
8.342244
Julia#
built-in#
b = @benchmark sum($x)
BenchmarkTools.Trial: 830 samples with 1 evaluation.
Range (min … max): 5.940 ms … 8.481 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 5.999 ms ┊ GC (median): 0.00%
Time (mean ± σ): 6.020 ms ± 112.819 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
█▅▆▄▂▄▇▅▂▃
▂▅███████████▇█▄▅▄▄▃▄▂▃▂▃▃▃▃▃▃▂▃▃▃▃▂▂▂▃▂▃▃▃▂▂▂▂▁▂▃▁▁▁▂▂▂▁▁▂ ▄
5.94 ms Histogram: frequency by time 6.31 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
d["Julia (built-in)"] = minimum(b.times) / 1e6
5.939751
built-in (with Vector{Any}
)#
x_any = Vector{Any}(x)
b = @benchmark sum($x_any)
BenchmarkTools.Trial: 19 samples with 1 evaluation.
Range (min … max): 269.164 ms … 280.384 ms ┊ GC (min … max): 0.00% … 5.39%
Time (median): 272.752 ms ┊ GC (median): 5.47%
Time (mean ± σ): 273.541 ms ± 3.135 ms ┊ GC (mean ± σ): 4.31% ± 2.28%
▃ ▃ █▃ ▃
▇█▁▁▁▁▁▁▁▁▁▁▁▁▁█▇▁██▁▁█▁▁▁▇▁▇▁▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▇▁▁▁▇ ▁
269 ms Histogram: frequency by time 280 ms <
Memory estimate: 152.59 MiB, allocs estimate: 9999999.
d["Julia (built-in, Any)"] = minimum(b.times) / 1e6
269.164236
hand-written#
function mysum(A)
s = zero(eltype(A)) # the correct type of zero for A
for a in A
s += a
end
return s
end
mysum (generic function with 1 method)
b = @benchmark mysum($x)
BenchmarkTools.Trial: 350 samples with 1 evaluation.
Range (min … max): 14.128 ms … 15.609 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 14.204 ms ┊ GC (median): 0.00%
Time (mean ± σ): 14.293 ms ± 178.717 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄▄█ ▂ █▃ ▂ ▁
██████████▆▆▂█▇▄▄▅▄▂▄▃▁▁▂▁▃▃▂▁▁▁▂▁▁▂▁▁▁▁▁▂▂▂▂▂▂▂▅▆▆▄▄▄▃▆██▇▆ ▄
14.1 ms Histogram: frequency by time 14.6 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
d["Julia (hand-written)"] = minimum(b.times) / 1e6
14.128444
hand-written (with @fastmath
)#
function mysum_fastmath(A)
s = zero(eltype(A)) # the correct type of zero for A
@fastmath for a in A
s += a
end
return s
end
mysum_fastmath (generic function with 1 method)
b = @benchmark mysum_fastmath($x)
BenchmarkTools.Trial: 834 samples with 1 evaluation.
Range (min … max): 5.868 ms … 6.362 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 5.955 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.987 ms ± 106.014 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁█▅▆▃█▃▂ ▂ ▁
▃▅████████▇▇█▆▇▇█▇█▇▆▇▇▅▄▄▃▅▃▃▄▃▃▂▂▃▂▁▂▂▁▂▁▁▁▂▂▃▃▂▂▁▃▂▄▃▄▃▄ ▄
5.87 ms Histogram: frequency by time 6.33 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
d["Julia (hand-written, fastmath)"] = minimum(b.times) / 1e6
5.867671
Summary#
for (key, value) in sort(collect(d), by=x -> x[2])
println(rpad(key, 30, "."), lpad(round(value, digits=2), 10, "."))
end
Julia (hand-written, fastmath)......5.87
Julia (built-in)....................5.94
Python (numpy)......................6.49
C (fastmath)........................8.34
C..................................14.02
Julia (hand-written)...............14.13
Python (built-in)..................50.26
Julia (built-in, Any).............269.16
Python (hand-written).............327.98
And of course, our hand-written Julia implementation is type-generic!
mysum_fastmath(rand(ComplexF64, 10))
4.305286014426977 + 5.647487693490065im
Supplement: What about other functions?#
Log#
@which log(1.2)
log(x::Float64) in Base.Math at special/log.jl:267
using BenchmarkTools
# uses the system C library
clog(x) = ccall(:log, Float64, (Float64,), x)
# uses LLVM's log
llvmlog(x) = ccall(Symbol("llvm.log.f64"), llvmcall, Float64, (Float64,), x)
@btime log(1.2)
@btime clog(1.2)
@btime llvmlog(1.2);
1.695 ns (0 allocations: 0 bytes)
7.391 ns (0 allocations: 0 bytes)
3.033 ns (0 allocations: 0 bytes)
Exp#
@which exp(1.2)
exp(x::Union{Float16, Float32, Float64}) in Base.Math at special/exp.jl:326
using BenchmarkTools
# uses the system C library
cexp(x) = ccall(:exp, Float64, (Float64,), x)
# uses LLVM's
llvmexp(x) = ccall(Symbol("llvm.exp.f64"), llvmcall, Float64, (Float64,), x)
@btime exp(1.2);
@btime cexp(1.2);
@btime llvmexp(1.2);
1.695 ns (0 allocations: 0 bytes)
8.647 ns (0 allocations: 0 bytes)
3.040 ns (0 allocations: 0 bytes)
Matrix multiplication#
N = 10
C = zeros(N, N);
A = rand(N, N);
B = rand(N, N);
using LinearAlgebra
mul!(C, A, B); # "built-in", calls underlying BLAS/LAPACK
C ≈ A * B
true
using BenchmarkTools
function mul_naive!(C, A, B)
for m in axes(A, 1)
for n in axes(B, 2)
Cmn = zero(eltype(C))
for k in axes(A, 2)
@inbounds Cmn += A[m, k] * B[k, n]
end
C[m, n] = Cmn
end
end
end
mul_naive! (generic function with 1 method)
mul_naive!(C, A, B)
C ≈ A * B
true
using LoopVectorization
function mul_turbo!(C, A, B)
@turbo for m in axes(A, 1)
for n in axes(B, 2)
Cmn = zero(eltype(C))
for k in axes(A, 2)
@inbounds Cmn += A[m, k] * B[k, n]
end
C[m, n] = Cmn
end
end
end
mul_turbo! (generic function with 1 method)
mul_turbo!(C, A, B)
C ≈ A * B
true
c_code = """
#include <stddef.h>
#include <math.h>
void gemm(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
for (long i = 0; i < M*N; i++){
C[i] = 0.0;
}
for (long n = 0; n < N; n++){
for (long k = 0; k < K; k++){
for (long m = 0; m < M; m++){
C[m + n*M] += A[m + k*M] * B[k + n*K];
}
}
}
return;
}
""";
# compile to a shared library by piping C_code to gcc:
# (only works if you have gcc installed)
using Libdl
const Clib_gemm = tempname() * "." * Libdl.dlext
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $Clib_gemm -`, "w") do f
print(f, c_code)
end
c_gemm(C::Array{Float64}, A::Array{Float64}, B::Array{Float64}) = @ccall Clib_gemm.gemm(C::Ptr{Float64}, A::Ptr{Float64}, B::Ptr{Float64}, size(A, 1)::Clong, size(A, 2)::Clong, size(B, 2)::Clong)::Cvoid
c_gemm (generic function with 1 method)
c_gemm(C, A, B)
C ≈ A * B
true
@btime mul_naive!($C, $A, $B);
@btime mul_turbo!($C, $A, $B);
@btime mul!($C, $A, $B);
@btime c_gemm($C, $A, $B)
803.176 ns (0 allocations: 0 bytes)
133.059 ns (0 allocations: 0 bytes)
236.029 ns (0 allocations: 0 bytes)
439.843 ns (0 allocations: 0 bytes)
Note for larger N
: BLAS is multithreaded for larger N
. In this case our mul_avx!
can be slower than mul!
.