Performance tips

using DataFrames
using BenchmarkTools
using CategoricalArrays
using PooledArrays
using Random

Access by column number is faster than by name

x = DataFrame(rand(5, 1000), :auto)
@btime $x[!, 500]; ## Faster

  3.085 ns (0 allocations: 0 bytes)

@btime $x.x500;  ## Slower

  11.533 ns (0 allocations: 0 bytes)

When working with data in a `DataFrame`, use barrier functions or type annotations

## Deliberately slow example: the columns are extracted from the data frame and
## looped over in the same function body, so (as `@code_warntype f_bad()` shows)
## Julia does not know the column types inside the loop.
function f_bad() ## this function will be slow
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    col1 = df[!, 1]
    col2 = df[!, 2]
    acc = 0.0
    for row in 1:nrow(df)
        acc += col1[row] * col2[row]
    end
    return acc
end

@btime f_bad();
# if you run @code_warntype f_bad() then you notice
# that Julia does not know column types of `DataFrame`

  109.958 ms (5999021 allocations: 122.06 MiB)

Solution 1 is to use a barrier function for the calculation (this should be possible in almost any code). You will notice far fewer memory allocations and faster performance.

## Barrier kernel: accumulates the products of matching elements of `y` and `z`,
## starting from 0.0. The columns arrive as arguments, so the loop works on
## whatever vector types the caller passes in.
function f_inner(y, z)
    total = 0.0
    for k in eachindex(y, z)
        total += y[k] * z[k]
    end
    return total
end

## Solution 1: build the data here, then hand the two columns over to `f_inner`,
## which performs the actual summation behind a function barrier.
function f_barrier()
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    first_col, second_col = df[!, 1], df[!, 2]
    return f_inner(first_col, second_col)
end

@btime f_barrier();

  4.595 ms (43 allocations: 30.52 MiB)

Alternatively, use a built-in function if possible.

using LinearAlgebra

## Same computation as the barrier version, but delegating the summation to
## `dot` instead of a hand-written loop.
function f_inbuilt()
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    first_col = df[!, 1]
    second_col = df[!, 2]
    return dot(first_col, second_col)
end

@btime f_inbuilt();

  4.003 ms (43 allocations: 30.52 MiB)

Solution 2 is to provide the types of the extracted columns. However, there are cases in which you will not know these types.

## Solution 2: annotate the extracted columns with their concrete type so the
## loop below works on variables declared as `Vector{Float64}`.
function f_typed()
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    col1::Vector{Float64} = df[!, 1]
    col2::Vector{Float64} = df[!, 2]
    acc = 0.0
    for row in 1:nrow(df)
        acc += col1[row] * col2[row]
    end
    return acc
end

@btime f_typed();

  4.593 ms (43 allocations: 30.52 MiB)

In general, for tall and narrow tables it is often useful to use `Tables.rowtable`, `Tables.columntable`, or `Tables.namedtupleiterator` for intermediate processing of data in a type-stable way.

Consider using the delayed `DataFrame` creation technique

Also notice the difference in performance between copying and non-copying data frame creation.

## Direct, non-copying variant: the data frame is created first (with
## `copycols=false`) and its columns are filled afterwards, one cell at a time.
function f1()
    cols = [Vector{Float64}(undef, 10^4) for i in 1:100]
    ## we work with a DataFrame directly
    df = DataFrame(cols, :auto, copycols=false)
    for j in 1:ncol(df)
        col = df[!, j]
        for i in 1:nrow(df)
            col[i] = rand()
        end
    end
    return df
end

## Direct, copying variant: same as `f1`, but the data frame is created without
## `copycols=false` before its columns are filled one cell at a time.
function f1a()
    cols = [Vector{Float64}(undef, 10^4) for i in 1:100]
    ## we work with a DataFrame directly
    df = DataFrame(cols, :auto)
    for j in 1:ncol(df)
        col = df[!, j]
        for i in 1:nrow(df)
            col[i] = rand()
        end
    end
    return df
end

## Delayed, non-copying variant: every column is generated as a plain vector
## first, and the data frame is only assembled (with `copycols=false`) at the
## very end.
function f2()
    x = Vector{Any}(undef, 100)
    ## iterate over the container's own indices instead of `1:length(x)`,
    ## matching the loop used in `f2a`
    for c in eachindex(x)
        d = Vector{Float64}(undef, 10^4)
        for r in eachindex(d)
            d[r] = rand()
        end
        x[c] = d
    end
    ## we delay creation of DataFrame after we have our job done
    return DataFrame(x, :auto, copycols=false)
end

## Delayed, copying variant: the columns are generated as plain vectors first
## and the data frame is built from them at the end, without `copycols=false`.
function f2a()
    cols = Vector{Any}(undef, 100)
    for j in eachindex(cols)
        col = Vector{Float64}(undef, 10^4)
        for i in eachindex(col)
            col[i] = rand()
        end
        cols[j] = col
    end
    ## we delay creation of DataFrame after we have our job done
    return DataFrame(cols, :auto)
end

## The four timings below are printed in this order:
## direct, with `copycols=false`
@btime f1();
## direct, without `copycols=false`
@btime f1a();
## delayed creation, with `copycols=false`
@btime f2();
## delayed creation, without `copycols=false`
@btime f2a();

  25.865 ms (1949727 allocations: 37.40 MiB)
  29.826 ms (1950027 allocations: 45.03 MiB)
  1.199 ms (727 allocations: 7.66 MiB)
  1.676 ms (1027 allocations: 15.29 MiB)

You can add rows to a DataFrame in place and it is fast

## `x`: the large data frame (10^6 rows, 5 columns) that rows are added to
x = DataFrame(rand(10^6, 5), :auto)
## `y`: the values 1.0 to 5.0 as a one-row data frame
y = DataFrame(transpose(1.0:5.0), :auto)
## `z`: the values 1.0 to 5.0 as a plain vector
z = [1.0:5.0;]

@btime vcat($x, $y); ## creates a new DataFrame - slow
@btime append!($x, $y); ## in place - fast

## `append!` above modified `x` in place, so `x` is rebuilt before timing `push!`
x = DataFrame(rand(10^6, 5), :auto) ## reset to the same starting point
@btime push!($x, $z); ## add a single row in place - fast

  2.347 ms (210 allocations: 38.16 MiB)
  1.086 μs (30 allocations: 1.52 KiB)
  425.663 ns (16 allocations: 256 bytes)

Allowing missing as well as categorical slows down computations

using StatsBase

## Prints the element type of `data`, draws 10^6 random values from it, and
## benchmarks `countmap` first on that raw sample and then on its categorical
## version. Returns `nothing`.
function test(data) ## uses countmap function to test performance
    println(eltype(data))
    raw_sample = rand(data, 10^6)
    cat_sample = categorical(raw_sample)
    println(" raw:")
    @btime countmap($raw_sample)
    println(" categorical:")
    @btime countmap($cat_sample)
    return nothing
end

## integers
test(1:10)
## random strings
test([randstring() for i in 1:10])
## the same kinds of input, wrapped in `allowmissing`
test(allowmissing(1:10))
test(allowmissing([randstring() for i in 1:10]))

Int64
 raw:
  1.860 ms (8 allocations: 7.63 MiB)
 categorical:
  9.999 ms (4 allocations: 576 bytes)
String
 raw:
  25.790 ms (4 allocations: 448 bytes)
 categorical:
  30.166 ms (4 allocations: 576 bytes)
Union{Missing, Int64}
 raw:
  7.343 ms (4 allocations: 464 bytes)
 categorical:
  21.793 ms (1000004 allocations: 30.52 MiB)
Union{Missing, String}
 raw:
  18.649 ms (4 allocations: 448 bytes)
 categorical:
  33.867 ms (1000004 allocations: 30.52 MiB)

When aggregating, use a column selector and prefer an integer, categorical, or pooled array grouping variable

df = DataFrame(x=rand('a':'d', 10^7), y=1);

gdf = groupby(df, :x)

traditional syntax, slow

@btime combine(v -> sum(v.y), $gdf)

  16.925 ms (318 allocations: 19.10 MiB)

use column selector

@btime combine($gdf, :y => sum)

  7.016 ms (193 allocations: 9.30 KiB)

transform!(df, :x => categorical => :x);
gdf = groupby(df, :x)

@btime combine($gdf, :y => sum)

  7.095 ms (203 allocations: 9.88 KiB)

transform!(df, :x => PooledArray{Char} => :x)

gdf = groupby(df, :x)

@btime combine($gdf, :y => sum)

  7.005 ms (195 allocations: 9.36 KiB)

Use views instead of materializing a new DataFrame

x = DataFrame(rand(100, 1000), :auto)
@btime $x[1:1, :]

  447.464 μs (3014 allocations: 143.77 KiB)

@btime $x[1, :]

  18.188 ns (0 allocations: 0 bytes)

@btime view($x, 1:1, :)

  18.810 ns (0 allocations: 0 bytes)

@btime $x[1:1, 1:20]

  9.297 μs (69 allocations: 3.08 KiB)

@btime $x[1, 1:20]

  19.926 ns (0 allocations: 0 bytes)

@btime view($x, 1:1, 1:20)

  21.353 ns (0 allocations: 0 bytes)

This notebook was generated using Literate.jl.