using DataFrames
using BenchmarkTools
using CategoricalArrays
using PooledArrays
using RandomAccess by column number is faster than by name¶
x = DataFrame(rand(5, 1000), :auto)
@btime $x[!, 500]; ## Faster 2.313 ns (0 allocations: 0 bytes)
@btime $x.x500; ## Slower 10.366 ns (0 allocations: 0 bytes)
When working with data DataFrame use barrier functions or type annotation¶
function f_bad() ## this function will be slow
Random.seed!(1)
x = DataFrame(rand(1000000, 2), :auto)
y, z = x[!, 1], x[!, 2]
p = 0.0
for i in 1:nrow(x)
p += y[i] * z[i]
end
p
end
@btime f_bad();
# if you run @code_warntype f_bad() then you notice
# that Julia does not know column types of `DataFrame` 97.710 ms (5999022 allocations: 122.06 MiB)
solution 1 is to use barrier function (it should be possible to use it in almost any code) for the calculation. You will notice much less memopry allocations and faster performance.
function f_inner(y, z)
p = 0.0
for i in eachindex(y, z)
p += y[i] * z[i]
end
p
end
function f_barrier()
Random.seed!(1)
x = DataFrame(rand(1000000, 2), :auto)
f_inner(x[!, 1], x[!, 2])
end
@btime f_barrier(); 4.925 ms (44 allocations: 30.52 MiB)
or use inbuilt function if possible
using LinearAlgebra
function f_inbuilt()
Random.seed!(1)
x = DataFrame(rand(1000000, 2), :auto)
dot(x[!, 1], x[!, 2])
end
@btime f_inbuilt(); 4.252 ms (44 allocations: 30.52 MiB)
solution 2 is to provide the types of extracted columns. However, there are cases in which you will not know these types.
function f_typed()
Random.seed!(1)
x = DataFrame(rand(1000000, 2), :auto)
y::Vector{Float64}, z::Vector{Float64} = x[!, 1], x[!, 2]
p = 0.0
for i in 1:nrow(x)
p += y[i] * z[i]
end
p
end
@btime f_typed(); 4.970 ms (44 allocations: 30.52 MiB)
In general for tall and narrow tables it is often useful to use Tables.rowtable, Tables.columntable or Tables.namedtupleiterator for intermediate processing of data in a type-stable way.
Consider using delayed DataFrame creation technique¶
also notice the difference in performance between copying vs non-copying data frame creation
function f1()
x = DataFrame([Vector{Float64}(undef, 10^4) for i in 1:100], :auto, copycols=false) ## we work with a DataFrame directly
for c in 1:ncol(x)
d = x[!, c]
for r in 1:nrow(x)
d[r] = rand()
end
end
x
end
function f1a()
x = DataFrame([Vector{Float64}(undef, 10^4) for i in 1:100], :auto) ## we work with a DataFrame directly
for c in 1:ncol(x)
d = x[!, c]
for r in 1:nrow(x)
d[r] = rand()
end
end
x
end
function f2()
x = Vector{Any}(undef, 100)
for c in 1:length(x)
d = Vector{Float64}(undef, 10^4)
for r in eachindex(d)
d[r] = rand()
end
x[c] = d
end
DataFrame(x, :auto, copycols=false) ## we delay creation of DataFrame after we have our job done
end
function f2a()
x = Vector{Any}(undef, 100)
for c in eachindex(x)
d = Vector{Float64}(undef, 10^4)
for r in eachindex(d)
d[r] = rand()
end
x[c] = d
end
DataFrame(x, :auto) ## we delay creation of DataFrame after we have our job done
end
@btime f1();
@btime f1a();
@btime f2();
@btime f2a(); 23.581 ms (1949728 allocations: 37.40 MiB)
24.488 ms (1950028 allocations: 45.03 MiB)
1.528 ms (728 allocations: 7.66 MiB)
2.491 ms (1028 allocations: 15.29 MiB)
You can add rows to a DataFrame in place and it is fast¶
x = DataFrame(rand(10^6, 5), :auto)
y = DataFrame(transpose(1.0:5.0), :auto)
z = [1.0:5.0;]
@btime vcat($x, $y); ## creates a new DataFrame - slow
@btime append!($x, $y); ## in place - fast
x = DataFrame(rand(10^6, 5), :auto) ## reset to the same starting point
@btime push!($x, $z); ## add a single row in place - fast 5.428 ms (213 allocations: 38.16 MiB)
1.075 μs (30 allocations: 1.52 KiB)
361.010 ns (16 allocations: 256 bytes)
Allowing missing as well as categorical slows down computations¶
using StatsBase
function test(data) ## uses countmap function to test performance
println(eltype(data))
x = rand(data, 10^6)
y = categorical(x)
println(" raw:")
@btime countmap($x)
println(" categorical:")
@btime countmap($y)
nothing
end
test(1:10)
test([randstring() for i in 1:10])
test(allowmissing(1:10))
test(allowmissing([randstring() for i in 1:10]))Int64
raw:
3.141 ms (8 allocations: 7.63 MiB)
categorical:
9.271 ms (4 allocations: 576 bytes)
String
raw:
21.586 ms (4 allocations: 448 bytes)
categorical:
25.050 ms (4 allocations: 576 bytes)
Union{Missing, Int64}
raw:
7.271 ms (4 allocations: 464 bytes)
categorical:
22.368 ms (1000004 allocations: 30.52 MiB)
Union{Missing, String}
raw:
19.614 ms (4 allocations: 448 bytes)
categorical:
38.038 ms (1000004 allocations: 30.52 MiB)
When aggregating use column selector and prefer integer, categorical, or pooled array grouping variable¶
df = DataFrame(x=rand('a':'d', 10^7), y=1);gdf = groupby(df, :x)Loading...
traditional syntax, slow
@btime combine(v -> sum(v.y), $gdf) 30.855 ms (333 allocations: 19.08 MiB)
Loading...
use column selector
@btime combine($gdf, :y => sum) 12.215 ms (199 allocations: 9.41 KiB)
Loading...
transform!(df, :x => categorical => :x);
gdf = groupby(df, :x)Loading...
@btime combine($gdf, :y => sum) 12.404 ms (209 allocations: 9.98 KiB)
Loading...
transform!(df, :x => PooledArray{Char} => :x)Loading...
gdf = groupby(df, :x)Loading...
@btime combine($gdf, :y => sum) 12.246 ms (201 allocations: 9.47 KiB)
Loading...
Use views instead of materializing a new DataFrame¶
x = DataFrame(rand(100, 1000), :auto)
@btime $x[1:1, :] 423.873 μs (3015 allocations: 143.79 KiB)
Loading...
@btime $x[1, :] 18.433 ns (0 allocations: 0 bytes)
Loading...
@btime view($x, 1:1, :) 19.417 ns (0 allocations: 0 bytes)
Loading...
@btime $x[1:1, 1:20] 8.982 μs (70 allocations: 3.09 KiB)
Loading...
@btime $x[1, 1:20] 21.644 ns (0 allocations: 0 bytes)
Loading...
@btime view($x, 1:1, 1:20) 22.732 ns (0 allocations: 0 bytes)
Loading...
This notebook was generated using Literate.jl.