Programming Language Interoperability (Interop)#

Python#

using PythonCall

@pyeval "3+3"

Python int: 6

np = pyimport("numpy")

Python module: <module 'numpy' from '/usr/local/lib/python3.9/dist-packages/numpy/__init__.py'>

np.linalg.eigvals(np.random.rand(5, 5))

Python ndarray:
array([ 2.84550887+0.j        , -0.41993296+0.34707055j,
       -0.41993296-0.34707055j,  0.01357372+0.j        ,
        0.22480288+0.j        ])

M = rand(5, 5)
np.linalg.eigvals(M)

Python ndarray:
array([ 2.80377766+0.j        ,  0.09378935+0.13467381j,
        0.09378935-0.13467381j, -0.50506104+0.09779223j,
       -0.50506104-0.09779223j])

@pyexec """
global sinpi, np
import numpy as np

def sinpi(x):
    return np.sin(np.pi * x)
"""

# Call the Python function `sinpi` (created by the `@pyexec` cell) on `x` and
# convert the Python result to a Julia `Float64`.
function py_sinpi(x)
    py_result = @pyeval("sinpi")(x)
    return pyconvert(Float64, py_result)
end

py_sinpi (generic function with 1 method)

py_sinpi(10)

-1.2246467991473533e-15

using BenchmarkTools
@btime py_sinpi(10);
@btime sinpi(10); # built-in Julia function

  2.424 μs (3 allocations: 48 bytes)

  1.695 ns (0 allocations: 0 bytes)

C#

c_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
""";

Compile to a shared library by piping c_code to gcc:

using Libdl
const Clib = tempname() * "." * Libdl.dlext

open(`gcc -fPIC -O3 -msse3 -xc -shared -o $Clib -`, "w") do f
    print(f, c_code)
end

Clib

"/tmp/jl_2jmZCLKNKp.so"

Binding the function from the shared library:

# Julia wrapper around the C function `c_sum` in the shared library at `Clib`:
# passes the element count and the array data, and returns the C `double` result.
function c_sum(X::Array{Float64})
    n = length(X)
    return @ccall Clib.c_sum(n::Csize_t, X::Ptr{Float64})::Float64
end

c_sum (generic function with 1 method)

c_sum(rand(10))

4.8775442958142055

x = rand(10)
@btime c_sum($x);

  7.047 ns (0 allocations: 0 bytes)

Mixing Julia, Python, and C#

Julia (abs), Python/numpy (py_sinpi), C (c_sum)

x = rand(10);

abs(py_sinpi(c_sum(x)))

0.8710573552611643

@btime abs(py_sinpi(c_sum($x)));

  2.353 μs (3 allocations: 48 bytes)

See JuliaInterop for more, such as RCall.jl, JavaCall.jl, and MATLAB.jl.

Julia Microbenchmark: Summation#

Let’s look at and benchmark the sum function:

\[\mathrm{sum}(x) = \sum_{i=1}^n x_i\]

x = rand(10^7);

sum(x)

5.000210398449396e6

d = Dict() # to store the measurement results

Dict{Any, Any}()

Python#

using BenchmarkTools
using PythonCall

numpy#

np = pyimport("numpy")

Python module: <module 'numpy' from '/usr/local/lib/python3.9/dist-packages/numpy/__init__.py'>

numpy_sum = np.sum

Python function: <function sum at 0x7f7fe4408550>

b = @benchmark $numpy_sum($x)

BenchmarkTools.Trial: 747 samples with 1 evaluation.
 Range (min … max):  6.490 ms …   9.029 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     6.623 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   6.682 ms ± 218.156 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▅█▃▃ ▁                                                   
  ▄▅▅████████▅▆▆▆▄▄▄▃▃▃▂▃▂▂▂▃▂▃▃▂▁▂▂▁▂▁▁▂▂▁▁▁▁▁▂▁▁▁▁▂▁▁▁▁▃▂▂▂ ▃
  6.49 ms         Histogram: frequency by time        7.52 ms <

 Memory estimate: 928 bytes, allocs estimate: 23.

d["Python (numpy)"] = minimum(b.times) / 1e6

6.489832

hand-written#

@pyexec """
global mysum

def mysum(a):
    s = 0.0
    for x in a:
        s = s + x
    return s
"""

mysum_py = @pyeval("mysum")

Python function: <function mysum at 0x7f7f64bcaca0>

x_py = pylist(x);

b = @benchmark $mysum_py($x_py)

BenchmarkTools.Trial: 16 samples with 1 evaluation.
 Range (min … max):  327.984 ms … 330.202 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     329.236 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   329.168 ms ± 585.419 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▁            ▁   ▁▁    █   ▁    ▁   ▁ ▁▁  ▁  ▁ ▁      ▁     ▁  
  █▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁██▁▁▁▁█▁▁▁█▁▁▁▁█▁▁▁█▁██▁▁█▁▁█▁█▁▁▁▁▁▁█▁▁▁▁▁█ ▁
  328 ms           Histogram: frequency by time          330 ms <

 Memory estimate: 16 bytes, allocs estimate: 1.

d["Python (hand-written)"] = minimum(b.times) / 1e6

327.983876

built-in#

# get the Python built-in "sum" function:
pysum = pybuiltins.sum

Python builtin_function_or_method: <built-in function sum>

b = @benchmark $pysum($x_py)

BenchmarkTools.Trial: 100 samples with 1 evaluation.
 Range (min … max):  50.255 ms …  52.900 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     50.403 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   50.429 ms ± 261.018 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▄▄  ▂  ▄  █  █  █▂▂▄▄▆  ▂                                 
  ▄▄▁▁███▄█▄██▄███▆█▁▆███████▁█▆▁▄▄▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▄
  50.3 ms         Histogram: frequency by time         50.8 ms <

 Memory estimate: 16 bytes, allocs estimate: 1.

d["Python (built-in)"] = minimum(b.times) / 1e6

50.255377

C#

hand-written#

c_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
""";

# compile to a shared library by piping c_code to gcc:
# (only works if you have gcc installed)
using Libdl
const Clib = tempname() * "." * Libdl.dlext

WARNING: redefinition of constant Clib. This may fail, cause incorrect answers, or produce other errors.

"/tmp/jl_etCdKLkncX.so"

open(`gcc -fPIC -O3 -msse3 -xc -shared -o $Clib -`, "w") do f
    print(f, c_code)
end

# Julia wrapper around the C function `c_sum` in the shared library at `Clib`:
# passes the element count and the array data, and returns the C `double` result.
function c_sum(X::Array{Float64})
    n = length(X)
    return @ccall Clib.c_sum(n::Csize_t, X::Ptr{Float64})::Float64
end

c_sum (generic function with 1 method)

c_sum(x) ≈ sum(x)

true

b = @benchmark c_sum($x)

BenchmarkTools.Trial: 354 samples with 1 evaluation.
 Range (min … max):  14.021 ms …  14.806 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     14.086 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   14.134 ms ± 137.974 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▂▅▆▇▇▆▇█▇▄▁ ▁▂                                      ▁         
  ██████████████▇▅▆▅▅▁▇▇▆▇▅▅▁▁▅▁▅▁▁▆▁▁▆▅▅▆▅▆▆▅▁▁▇▆▇▅▆▆█▁▁▁▅█▆▇ ▇
  14 ms         Histogram: log(frequency) by time      14.6 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

d["C"] = minimum(b.times) / 1e6

14.02061

hand-written (with `-ffast-math`)#

const Clib_fastmath = tempname() * "." * Libdl.dlext

# The same as above but with a -ffast-math flag added
open(`gcc -fPIC -O3 -msse3 -xc -shared -ffast-math -o $Clib_fastmath -`, "w") do f
    print(f, c_code)
end

# Julia wrapper around the C function `c_sum` in the shared library at
# `Clib_fastmath`: passes the element count and the array data to C.
function c_sum_fastmath(X::Array{Float64})
    n = length(X)
    return @ccall Clib_fastmath.c_sum(n::Csize_t, X::Ptr{Float64})::Float64
end

c_sum_fastmath (generic function with 1 method)

b = @benchmark c_sum_fastmath($x)

BenchmarkTools.Trial: 589 samples with 1 evaluation.
 Range (min … max):  8.342 ms …  9.601 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     8.490 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   8.492 ms ± 55.446 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                                        ▂▃▇▇█▅▇▅▁             
  ▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▂▁▁▂▂▃▃▂▃▅▇█████████▇▅▃▂▁▁▁▁▁▁▂ ▃
  8.34 ms        Histogram: frequency by time        8.54 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

d["C (fastmath)"] = minimum(b.times) / 1e6

8.342244

Julia#

built-in#

b = @benchmark sum($x)

BenchmarkTools.Trial: 830 samples with 1 evaluation.
 Range (min … max):  5.940 ms …   8.481 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.999 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   6.020 ms ± 112.819 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

     █▅▆▄▂▄▇▅▂▃                                                
  ▂▅███████████▇█▄▅▄▄▃▄▂▃▂▃▃▃▃▃▃▂▃▃▃▃▂▂▂▃▂▃▃▃▂▂▂▂▁▂▃▁▁▁▂▂▂▁▁▂ ▄
  5.94 ms         Histogram: frequency by time        6.31 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

d["Julia (built-in)"] = minimum(b.times) / 1e6

5.939751

built-in (with `Vector{Any}`)#

x_any = Vector{Any}(x)
b = @benchmark sum($x_any)

BenchmarkTools.Trial: 19 samples with 1 evaluation.
 Range (min … max):  269.164 ms … 280.384 ms  ┊ GC (min … max): 0.00% … 5.39%
 Time  (median):     272.752 ms               ┊ GC (median):    5.47%
 Time  (mean ± σ):   273.541 ms ±   3.135 ms  ┊ GC (mean ± σ):  4.31% ± 2.28%

   ▃             ▃  █▃  ▃                                        
  ▇█▁▁▁▁▁▁▁▁▁▁▁▁▁█▇▁██▁▁█▁▁▁▇▁▇▁▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▇▁▁▁▇ ▁
  269 ms           Histogram: frequency by time          280 ms <

 Memory estimate: 152.59 MiB, allocs estimate: 9999999.

d["Julia (built-in, Any)"] = minimum(b.times) / 1e6

269.164236

hand-written#

# Add up the elements of `A` one at a time with a plain sequential loop.
function mysum(A)
    total = zero(eltype(A)) # accumulator starts as a zero of A's element type
    for elem in A
        total = total + elem
    end
    return total
end

mysum (generic function with 1 method)

b = @benchmark mysum($x)

BenchmarkTools.Trial: 350 samples with 1 evaluation.
 Range (min … max):  14.128 ms …  15.609 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     14.204 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   14.293 ms ± 178.717 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▄▄█  ▂  █▃   ▂                                          ▁     
  ██████████▆▆▂█▇▄▄▅▄▂▄▃▁▁▂▁▃▃▂▁▁▁▂▁▁▂▁▁▁▁▁▂▂▂▂▂▂▂▅▆▆▄▄▄▃▆██▇▆ ▄
  14.1 ms         Histogram: frequency by time         14.6 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

d["Julia (hand-written)"] = minimum(b.times) / 1e6

14.128444

hand-written (with `@fastmath`)#

# Same sequential summation as a plain loop, but each accumulation goes through
# `@fastmath`, which swaps `+` for its fast-math counterpart.
function mysum_fastmath(A)
    total = zero(eltype(A)) # accumulator starts as a zero of A's element type
    for elem in A
        @fastmath total += elem
    end
    return total
end

mysum_fastmath (generic function with 1 method)

b = @benchmark mysum_fastmath($x)

BenchmarkTools.Trial: 834 samples with 1 evaluation.
 Range (min … max):  5.868 ms …   6.362 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.955 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   5.987 ms ± 106.014 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

    ▁█▅▆▃█▃▂  ▂   ▁                                            
  ▃▅████████▇▇█▆▇▇█▇█▇▆▇▇▅▄▄▃▅▃▃▄▃▃▂▂▃▂▁▂▂▁▂▁▁▁▂▂▃▃▂▂▁▃▂▄▃▄▃▄ ▄
  5.87 ms         Histogram: frequency by time        6.33 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

d["Julia (hand-written, fastmath)"] = minimum(b.times) / 1e6

5.867671

Summary#

# Print every entry of `d`, ordered by its stored value (smallest first), as a
# dot-padded row: the label left-aligned in 30 columns, the value rounded to two
# digits and right-aligned in 10 columns.
for (label, timing) in sort(collect(d); by=last)
    label_col = rpad(label, 30, ".")
    timing_col = lpad(round(timing, digits=2), 10, ".")
    println(label_col, timing_col)
end

Julia (hand-written, fastmath)......5.87
Julia (built-in)....................5.94
Python (numpy)......................6.49
C (fastmath)........................8.34
C..................................14.02
Julia (hand-written)...............14.13
Python (built-in)..................50.26
Julia (built-in, Any).............269.16
Python (hand-written).............327.98

And of course, our hand-written Julia implementation is type-generic!

mysum_fastmath(rand(ComplexF64, 10))

4.305286014426977 + 5.647487693490065im

Supplement: What about other functions?#

Log#

@which log(1.2)

log(x::Float64) in Base.Math at special/log.jl:267

using BenchmarkTools

# Natural logarithm via the `log` symbol of the system C library; the argument
# is passed as a C double and the result comes back as `Float64`.
function clog(x)
    return @ccall log(x::Float64)::Float64
end
# uses LLVM's log: `ccall` with the `llvmcall` calling convention targets the
# name "llvm.log.f64"; argument and result are both declared as `Float64`
llvmlog(x) = ccall(Symbol("llvm.log.f64"), llvmcall, Float64, (Float64,), x)

@btime log(1.2)
@btime clog(1.2)
@btime llvmlog(1.2);

  1.695 ns (0 allocations: 0 bytes)

  7.391 ns (0 allocations: 0 bytes)

  3.033 ns (0 allocations: 0 bytes)

Exp#

@which exp(1.2)

exp(x::Union{Float16, Float32, Float64}) in Base.Math at special/exp.jl:326

using BenchmarkTools

# Exponential via the `exp` symbol of the system C library; the argument is
# passed as a C double and the result comes back as `Float64`.
function cexp(x)
    return @ccall exp(x::Float64)::Float64
end
# uses LLVM's exp: `ccall` with the `llvmcall` calling convention targets the
# name "llvm.exp.f64"; argument and result are both declared as `Float64`
llvmexp(x) = ccall(Symbol("llvm.exp.f64"), llvmcall, Float64, (Float64,), x)

@btime exp(1.2);
@btime cexp(1.2);
@btime llvmexp(1.2);

  1.695 ns (0 allocations: 0 bytes)

  8.647 ns (0 allocations: 0 bytes)

  3.040 ns (0 allocations: 0 bytes)

Matrix multiplication#

N = 10
C = zeros(N, N);
A = rand(N, N);
B = rand(N, N);

using LinearAlgebra

mul!(C, A, B); # "built-in", calls underlying BLAS/LAPACK

C ≈ A * B

true

using BenchmarkTools

# Overwrite `C` with the matrix product `A * B` using three nested loops.
#
# Throws `DimensionMismatch` when the inner dimensions of `A` and `B` differ or
# when `C` does not have the shape of the product. The check is required because
# the innermost loop indexes `A` and `B` under `@inbounds`, so a shape mismatch
# would otherwise read outside the arrays instead of raising an error.
# Returns `nothing`; the result is stored in `C`.
function mul_naive!(C, A, B)
    axes(A, 2) == axes(B, 1) || throw(DimensionMismatch(
        "inner dimensions differ: A has axes $(axes(A)), B has axes $(axes(B))"))
    (axes(C, 1) == axes(A, 1) && axes(C, 2) == axes(B, 2)) || throw(DimensionMismatch(
        "C has axes $(axes(C)), expected $((axes(A, 1), axes(B, 2)))"))
    for m in axes(A, 1)
        for n in axes(B, 2)
            # accumulate in a local so C is written once per entry
            Cmn = zero(eltype(C))
            for k in axes(A, 2)
                @inbounds Cmn += A[m, k] * B[k, n]
            end
            C[m, n] = Cmn
        end
    end
    return nothing
end

mul_naive! (generic function with 1 method)

mul_naive!(C, A, B)
C ≈ A * B

true

LoopVectorization.jl

using LoopVectorization

# Overwrite `C` with the matrix product `A * B` using a triple loop wrapped in
# LoopVectorization's `@turbo`.
#
# Throws `DimensionMismatch` when the inner dimensions of `A` and `B` differ or
# when `C` does not have the shape of the product. `@turbo` performs no bounds
# checking, so the shapes must be validated before the loop runs; otherwise a
# mismatch can read or write outside the arrays.
function mul_turbo!(C, A, B)
    axes(A, 2) == axes(B, 1) || throw(DimensionMismatch(
        "inner dimensions differ: A has axes $(axes(A)), B has axes $(axes(B))"))
    (axes(C, 1) == axes(A, 1) && axes(C, 2) == axes(B, 2)) || throw(DimensionMismatch(
        "C has axes $(axes(C)), expected $((axes(A, 1), axes(B, 2)))"))
    @turbo for m in axes(A, 1)
        for n in axes(B, 2)
            Cmn = zero(eltype(C))
            for k in axes(A, 2)
                @inbounds Cmn += A[m, k] * B[k, n]
            end
            C[m, n] = Cmn
        end
    end
end

mul_turbo! (generic function with 1 method)

mul_turbo!(C, A, B)
C ≈ A * B

true

c_code = """
#include <stddef.h>
#include <math.h>

void gemm(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
  for (long i = 0; i < M*N; i++){
    C[i] = 0.0;
  }
  for (long n = 0; n < N; n++){
    for (long k = 0; k < K; k++){
      for (long m = 0; m < M; m++){
	C[m + n*M] += A[m + k*M] * B[k + n*K];
      }
    }
  }
  return;
}
""";

# compile to a shared library by piping c_code to gcc:
# (only works if you have gcc installed)
using Libdl
const Clib_gemm = tempname() * "." * Libdl.dlext

open(`gcc -fPIC -O3 -msse3 -xc -shared -o $Clib_gemm -`, "w") do f
    print(f, c_code)
end

# Compute `C = A * B` by calling the C function `gemm` from the shared library at
# `Clib_gemm`. The C routine receives raw pointers plus the sizes M, K, N and
# indexes `C[m + n*M]`, `A[m + k*M]` and `B[k + n*K]` with no bounds checks, so
# the shapes are validated here first.
#
# Throws `DimensionMismatch` when `B` does not have `size(A, 2)` rows or when `C`
# is not `size(A, 1)` by `size(B, 2)`. Returns `nothing`; the result is in `C`.
function c_gemm(C::Array{Float64}, A::Array{Float64}, B::Array{Float64})
    M, K, N = size(A, 1), size(A, 2), size(B, 2)
    size(B, 1) == K || throw(DimensionMismatch(
        "inner dimensions differ: A is $(size(A)), B is $(size(B))"))
    (size(C, 1) == M && size(C, 2) == N) || throw(DimensionMismatch(
        "C is $(size(C)), expected ($M, $N)"))
    return @ccall Clib_gemm.gemm(
        C::Ptr{Float64}, A::Ptr{Float64}, B::Ptr{Float64},
        M::Clong, K::Clong, N::Clong
    )::Cvoid
end

c_gemm (generic function with 1 method)

c_gemm(C, A, B)
C ≈ A * B

true

@btime mul_naive!($C, $A, $B);
@btime mul_turbo!($C, $A, $B);
@btime mul!($C, $A, $B);
@btime c_gemm($C, $A, $B)

  803.176 ns (0 allocations: 0 bytes)

  133.059 ns (0 allocations: 0 bytes)

  236.029 ns (0 allocations: 0 bytes)

  439.843 ns (0 allocations: 0 bytes)

Note for larger N: BLAS is multithreaded for larger N. In this case our mul_turbo! can be slower than mul!.

Julia for HPC
EuXFEL Workshop

Programming Language Interoperability (Interop)

Contents

Programming Language Interoperability (Interop)#

Python#

C#

Mixing Julia, Python, and C#

Julia Microbenchmark: Summation#

Python#

numpy#

hand-written#

built-in#

C#

hand-written#

hand-written (with `-fast-math`)#

Julia#

built-in#

built-in (with `Vector{Any}`)#

hand-written#

hand-written (with `@fastmath`)#

Summary#

Supplement: What about other functions?#

Log#

Exp#

Matrix multiplication#

Julia for HPCEuXFEL Workshop

Programming Language Interoperability (Interop)

Contents

Programming Language Interoperability (Interop)#

Python#

C#

Mixing Julia, Python, and C#

Julia Microbenchmark: Summation#

Python#

numpy#

hand-written#

built-in#

C#

hand-written#

hand-written (with -fast-math)#

Julia#

built-in#

built-in (with Vector{Any})#

hand-written#

hand-written (with @fastmath)#

Summary#

Supplement: What about other functions?#

Log#

Exp#

Matrix multiplication#

Julia for HPC
EuXFEL Workshop

hand-written (with `-fast-math`)#

built-in (with `Vector{Any}`)#

hand-written (with `@fastmath`)#