Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

using DataFrames
using BenchmarkTools
using CategoricalArrays
using PooledArrays
using Random

Access by column number is faster than by name

x = DataFrame(rand(5, 1000), :auto)
@btime $x[!, 500]; ## Faster
  2.313 ns (0 allocations: 0 bytes)
@btime $x.x500;  ## Slower
  10.366 ns (0 allocations: 0 bytes)

When working with data DataFrame use barrier functions or type annotation

function f_bad() ## this function will be slow
    Random.seed!(1)
    x = DataFrame(rand(1000000, 2), :auto)
    y, z = x[!, 1], x[!, 2]
    p = 0.0
    for i in 1:nrow(x)
        p += y[i] * z[i]
    end
    p
end

@btime f_bad();
# if you run @code_warntype f_bad() then you notice
# that Julia does not know column types of `DataFrame`
  97.710 ms (5999022 allocations: 122.06 MiB)

solution 1 is to use barrier function (it should be possible to use it in almost any code) for the calculation. You will notice much less memopry allocations and faster performance.

function f_inner(y, z)
    p = 0.0
    for i in eachindex(y, z)
        p += y[i] * z[i]
    end
    p
end

function f_barrier()
    Random.seed!(1)
    x = DataFrame(rand(1000000, 2), :auto)
    f_inner(x[!, 1], x[!, 2])
end

@btime f_barrier();
  4.925 ms (44 allocations: 30.52 MiB)

or use inbuilt function if possible

using LinearAlgebra

function f_inbuilt()
    Random.seed!(1)
    x = DataFrame(rand(1000000, 2), :auto)
    dot(x[!, 1], x[!, 2])
end

@btime f_inbuilt();
  4.252 ms (44 allocations: 30.52 MiB)

solution 2 is to provide the types of extracted columns. However, there are cases in which you will not know these types.

function f_typed()
    Random.seed!(1)
    x = DataFrame(rand(1000000, 2), :auto)
    y::Vector{Float64}, z::Vector{Float64} = x[!, 1], x[!, 2]
    p = 0.0
    for i in 1:nrow(x)
        p += y[i] * z[i]
    end
    p
end

@btime f_typed();
  4.970 ms (44 allocations: 30.52 MiB)

In general for tall and narrow tables it is often useful to use Tables.rowtable, Tables.columntable or Tables.namedtupleiterator for intermediate processing of data in a type-stable way.

Consider using delayed DataFrame creation technique

also notice the difference in performance between copying vs non-copying data frame creation

function f1()
    x = DataFrame([Vector{Float64}(undef, 10^4) for i in 1:100], :auto, copycols=false) ## we work with a DataFrame directly
    for c in 1:ncol(x)
        d = x[!, c]
        for r in 1:nrow(x)
            d[r] = rand()
        end
    end
    x
end

function f1a()
    x = DataFrame([Vector{Float64}(undef, 10^4) for i in 1:100], :auto) ## we work with a DataFrame directly
    for c in 1:ncol(x)
        d = x[!, c]
        for r in 1:nrow(x)
            d[r] = rand()
        end
    end
    x
end

function f2()
    x = Vector{Any}(undef, 100)
    for c in 1:length(x)
        d = Vector{Float64}(undef, 10^4)
        for r in eachindex(d)
            d[r] = rand()
        end
        x[c] = d
    end
    DataFrame(x, :auto, copycols=false) ## we delay creation of DataFrame after we have our job done
end

function f2a()
    x = Vector{Any}(undef, 100)
    for c in eachindex(x)
        d = Vector{Float64}(undef, 10^4)
        for r in eachindex(d)
            d[r] = rand()
        end
        x[c] = d
    end
    DataFrame(x, :auto) ## we delay creation of DataFrame after we have our job done
end

@btime f1();
@btime f1a();
@btime f2();
@btime f2a();
  23.581 ms (1949728 allocations: 37.40 MiB)
  24.488 ms (1950028 allocations: 45.03 MiB)
  1.528 ms (728 allocations: 7.66 MiB)
  2.491 ms (1028 allocations: 15.29 MiB)

You can add rows to a DataFrame in place and it is fast

x = DataFrame(rand(10^6, 5), :auto)
y = DataFrame(transpose(1.0:5.0), :auto)
z = [1.0:5.0;]

@btime vcat($x, $y); ## creates a new DataFrame - slow
@btime append!($x, $y); ## in place - fast

x = DataFrame(rand(10^6, 5), :auto) ## reset to the same starting point
@btime push!($x, $z); ## add a single row in place - fast
  5.428 ms (213 allocations: 38.16 MiB)
  1.075 μs (30 allocations: 1.52 KiB)
  361.010 ns (16 allocations: 256 bytes)

Allowing missing as well as categorical slows down computations

using StatsBase

function test(data) ## uses countmap function to test performance
    println(eltype(data))
    x = rand(data, 10^6)
    y = categorical(x)
    println(" raw:")
    @btime countmap($x)
    println(" categorical:")
    @btime countmap($y)
    nothing
end

test(1:10)
test([randstring() for i in 1:10])
test(allowmissing(1:10))
test(allowmissing([randstring() for i in 1:10]))
Int64
 raw:
  3.141 ms (8 allocations: 7.63 MiB)
 categorical:
  9.271 ms (4 allocations: 576 bytes)
String
 raw:
  21.586 ms (4 allocations: 448 bytes)
 categorical:
  25.050 ms (4 allocations: 576 bytes)
Union{Missing, Int64}
 raw:
  7.271 ms (4 allocations: 464 bytes)
 categorical:
  22.368 ms (1000004 allocations: 30.52 MiB)
Union{Missing, String}
 raw:
  19.614 ms (4 allocations: 448 bytes)
 categorical:
  38.038 ms (1000004 allocations: 30.52 MiB)

When aggregating use column selector and prefer integer, categorical, or pooled array grouping variable

df = DataFrame(x=rand('a':'d', 10^7), y=1);
gdf = groupby(df, :x)
Loading...

traditional syntax, slow

@btime combine(v -> sum(v.y), $gdf)
  30.855 ms (333 allocations: 19.08 MiB)
Loading...

use column selector

@btime combine($gdf, :y => sum)
  12.215 ms (199 allocations: 9.41 KiB)
Loading...
transform!(df, :x => categorical => :x);
gdf = groupby(df, :x)
Loading...
@btime combine($gdf, :y => sum)
  12.404 ms (209 allocations: 9.98 KiB)
Loading...
transform!(df, :x => PooledArray{Char} => :x)
Loading...
gdf = groupby(df, :x)
Loading...
@btime combine($gdf, :y => sum)
  12.246 ms (201 allocations: 9.47 KiB)
Loading...

Use views instead of materializing a new DataFrame

x = DataFrame(rand(100, 1000), :auto)
@btime $x[1:1, :]
  423.873 μs (3015 allocations: 143.79 KiB)
Loading...
@btime $x[1, :]
  18.433 ns (0 allocations: 0 bytes)
Loading...
@btime view($x, 1:1, :)
  19.417 ns (0 allocations: 0 bytes)
Loading...
@btime $x[1:1, 1:20]
  8.982 μs (70 allocations: 3.09 KiB)
Loading...
@btime $x[1, 1:20]
  21.644 ns (0 allocations: 0 bytes)
Loading...
@btime view($x, 1:1, 1:20)
  22.732 ns (0 allocations: 0 bytes)
Loading...

This notebook was generated using Literate.jl.