{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Possible pitfalls"
],
"metadata": {}
},
{
"outputs": [],
"cell_type": "code",
"source": [
"using DataFrames"
],
"metadata": {},
"execution_count": 1
},
{
"cell_type": "markdown",
"source": [
"## Know what is copied when creating a DataFrame"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×5 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼────────────────────────────────────────────────────\n 1 │ 0.587516 0.718981 0.62205 0.791086 0.505925\n 2 │ 0.900673 0.0775141 0.210494 0.15541 0.0177345\n 3 │ 0.436133 0.866412 0.603696 0.360206 0.424278",
"text/html": [
"
1 | 0.587516 | 0.718981 | 0.62205 | 0.791086 | 0.505925 |
2 | 0.900673 | 0.0775141 | 0.210494 | 0.15541 | 0.0177345 |
3 | 0.436133 | 0.866412 | 0.603696 | 0.360206 | 0.424278 |
"
]
},
"metadata": {},
"execution_count": 2
}
],
"cell_type": "code",
"source": [
"x = DataFrame(rand(3, 5), :auto)"
],
"metadata": {},
"execution_count": 2
},
{
"cell_type": "markdown",
"source": [
"x and y are not the same object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "false"
},
"metadata": {},
"execution_count": 3
}
],
"cell_type": "code",
"source": [
"y = copy(x)\n",
"x === y"
],
"metadata": {},
"execution_count": 3
},
{
"cell_type": "markdown",
"source": [
"x and y are not the same object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "false"
},
"metadata": {},
"execution_count": 4
}
],
"cell_type": "code",
"source": [
"y = DataFrame(x)\n",
"x === y"
],
"metadata": {},
"execution_count": 4
},
{
"cell_type": "markdown",
"source": [
"the columns are also not the same"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "false"
},
"metadata": {},
"execution_count": 5
}
],
"cell_type": "code",
"source": [
"any(x[!, i] === y[!, i] for i in ncol(x))"
],
"metadata": {},
"execution_count": 5
},
{
"cell_type": "markdown",
"source": [
"x and y are not the same object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "false"
},
"metadata": {},
"execution_count": 6
}
],
"cell_type": "code",
"source": [
"y = DataFrame(x, copycols=false)\n",
"x === y"
],
"metadata": {},
"execution_count": 6
},
{
"cell_type": "markdown",
"source": [
"But the columns are the same"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "true"
},
"metadata": {},
"execution_count": 7
}
],
"cell_type": "code",
"source": [
"all(x[!, i] === y[!, i] for i in ncol(x))"
],
"metadata": {},
"execution_count": 7
},
{
"cell_type": "markdown",
"source": [
"the same when creating data frames using `kwarg` syntax"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 1\n 2 │ 2 2\n 3 │ 3 3",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 8
}
],
"cell_type": "code",
"source": [
"x = 1:3;\n",
"y = [1, 2, 3];\n",
"df = DataFrame(x=x, y=y)"
],
"metadata": {},
"execution_count": 8
},
{
"cell_type": "markdown",
"source": [
"different object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "false"
},
"metadata": {},
"execution_count": 9
}
],
"cell_type": "code",
"source": [
"y === df.y"
],
"metadata": {},
"execution_count": 9
},
{
"cell_type": "markdown",
"source": [
"range is converted to a vector"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(UnitRange{Int64}, Vector{Int64})"
},
"metadata": {},
"execution_count": 10
}
],
"cell_type": "code",
"source": [
"typeof(x), typeof(df.x)"
],
"metadata": {},
"execution_count": 10
},
{
"cell_type": "markdown",
"source": [
"slicing rows always creates a copy"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "false"
},
"metadata": {},
"execution_count": 11
}
],
"cell_type": "code",
"source": [
"y === df[:, :y]"
],
"metadata": {},
"execution_count": 11
},
{
"cell_type": "markdown",
"source": [
"you can avoid copying by using copycols=false keyword argument in functions."
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 1\n 2 │ 2 2\n 3 │ 3 3",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 12
}
],
"cell_type": "code",
"source": [
"df = DataFrame(x=x, y=y, copycols=false)"
],
"metadata": {},
"execution_count": 12
},
{
"cell_type": "markdown",
"source": [
"now it is the same"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "true"
},
"metadata": {},
"execution_count": 13
}
],
"cell_type": "code",
"source": [
"y === df.y"
],
"metadata": {},
"execution_count": 13
},
{
"cell_type": "markdown",
"source": [
"not the same object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "false"
},
"metadata": {},
"execution_count": 14
}
],
"cell_type": "code",
"source": [
"select(df, :y)[!, 1] === y"
],
"metadata": {},
"execution_count": 14
},
{
"cell_type": "markdown",
"source": [
"the same object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "true"
},
"metadata": {},
"execution_count": 15
}
],
"cell_type": "code",
"source": [
"select(df, :y, copycols=false)[!, 1] === y"
],
"metadata": {},
"execution_count": 15
},
{
"cell_type": "markdown",
"source": [
"## Do not modify the parent of `GroupedDataFrame` or view"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 2 groups based on key: id\nFirst Group (3 rows): id = 2\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 2 1\n 2 │ 2 3\n 3 │ 1 5\n⋮\nLast Group (3 rows): id = 2\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 2 2\n 2 │ 2 4\n 3 │ 2 6",
"text/html": [
"GroupedDataFrame with 2 groups based on key: id
First Group (3 rows): id = 2
⋮
Last Group (3 rows): id = 2
"
]
},
"metadata": {},
"execution_count": 16
}
],
"cell_type": "code",
"source": [
"x = DataFrame(id=repeat([1, 2], outer=3), x=1:6)\n",
"g = groupby(x, :id)\n",
"\n",
"x[1:3, 1] = [2, 2, 2]\n",
"g ## well - it is wrong now, g is only a view"
],
"metadata": {},
"execution_count": 16
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×2 SubDataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 5\n 2 │ 2 6",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 17
}
],
"cell_type": "code",
"source": [
"s = view(x, 5:6, :)"
],
"metadata": {},
"execution_count": 17
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 2 1\n 2 │ 2 2",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 18
}
],
"cell_type": "code",
"source": [
"delete!(x, 3:6)"
],
"metadata": {},
"execution_count": 18
},
{
"cell_type": "markdown",
"source": [
"This is an error\n",
"\n",
"```julia\n",
"s ## Will return BoundsError\n",
"```"
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"## Single column selection for `DataFrame` creates aliases with ! and `getproperty` syntax and copies with :"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "\u001b[1m3×5 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m d \u001b[0m\u001b[1m e \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────────────────────────\n 1 │ 1 1 1 1 1\n 2 │ 2 2 2 2 2\n 3 │ 3 3 3 3 3",
"text/html": [
""
]
},
"metadata": {}
}
],
"cell_type": "code",
"source": [
"x = DataFrame(a=1:3)\n",
"x.b = x[!, 1] ## alias\n",
"x.c = x[:, 1] ## copy\n",
"x.d = x[!, 1][:] ## copy\n",
"x.e = copy(x[!, 1]) ## explicit copy\n",
"display(x)"
],
"metadata": {},
"execution_count": 19
},
{
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "\u001b[1m3×5 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m d \u001b[0m\u001b[1m e \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────────────────────────\n 1 │ 100 100 1 1 1\n 2 │ 2 2 2 2 2\n 3 │ 3 3 3 3 3",
"text/html": [
""
]
},
"metadata": {}
}
],
"cell_type": "code",
"source": [
"x[1, 1] = 100\n",
"display(x)"
],
"metadata": {},
"execution_count": 20
},
{
"cell_type": "markdown",
"source": [
"## When iterating rows of a data frame\n",
"\n",
"- use `eachrow` to avoid compilation cost (wide tables),\n",
"- but `Tables.namedtupleiterator` for fast execution (tall tables)\n",
"\n",
"this table is wide"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×900 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m x6 \u001b[0m\u001b[1m x7 \u001b[0m\u001b[1m x8 \u001b[0m\u001b[1m x9 \u001b[0m\u001b[1m x10 \u001b[0m\u001b[1m x\u001b[0m ⋯\n │\u001b[90m Bool \u001b[0m\u001b[90m Char \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Char \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m C\u001b[0m ⋯\n─────┼──────────────────────────────────────────────────────────────────────────\n 1 │ false a false 1 1 1 1.0 a 1 1 a ⋯\n 2 │ true b true 2 2 2 2.0 b 2 2 b\n\u001b[36m 890 columns omitted\u001b[0m",
"text/html": [
"2×900 DataFrame
800 columns omitted
1 | false | a | false | 1 | 1 | 1 | 1.0 | a | 1 | 1 | a | 1.0 | 1.0 | 1 | a | 1 | a | 1 | 1 | 1.0 | a | 1.0 | false | a | 1 | false | 1 | 1.0 | 1.0 | a | 1.0 | false | 1 | false | a | 1 | 1 | 1 | a | 1 | 1.0 | 1 | false | 1 | a | 1.0 | false | false | false | 1.0 | a | 1.0 | a | a | a | false | 1.0 | a | a | a | 1.0 | a | 1 | false | false | a | false | 1.0 | 1.0 | 1 | 1 | a | 1.0 | a | false | 1 | 1.0 | 1.0 | 1.0 | 1 | 1 | a | 1 | false | 1 | a | a | 1.0 | 1 | false | a | 1.0 | 1 | 1 | a | false | 1.0 | 1.0 | 1 | 1.0 | ⋯ |
2 | true | b | true | 2 | 2 | 2 | 2.0 | b | 2 | 2 | b | 2.0 | 2.0 | 2 | b | 2 | b | 2 | 2 | 2.0 | b | 2.0 | true | b | 2 | true | 2 | 2.0 | 2.0 | b | 2.0 | true | 2 | true | b | 2 | 2 | 2 | b | 2 | 2.0 | 2 | true | 2 | b | 2.0 | true | true | true | 2.0 | b | 2.0 | b | b | b | true | 2.0 | b | b | b | 2.0 | b | 2 | true | true | b | true | 2.0 | 2.0 | 2 | 2 | b | 2.0 | b | true | 2 | 2.0 | 2.0 | 2.0 | 2 | 2 | b | 2 | true | 2 | b | b | 2.0 | 2 | true | b | 2.0 | 2 | 2 | b | true | 2.0 | 2.0 | 2 | 2.0 | ⋯ |
"
]
},
"metadata": {},
"execution_count": 21
}
],
"cell_type": "code",
"source": [
"df1 = DataFrame([rand([1:2, 'a':'b', false:true, 1.0:2.0]) for i in 1:900], :auto)"
],
"metadata": {},
"execution_count": 21
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.041341 seconds (52.71 k allocations: 3.614 MiB, 99.91% compilation time)\n"
]
}
],
"cell_type": "code",
"source": [
"@time collect(eachrow(df1));"
],
"metadata": {},
"execution_count": 22
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 7.539398 seconds (885.91 k allocations: 70.446 MiB, 99.68% compilation time)\n"
]
}
],
"cell_type": "code",
"source": [
"@time collect(Tables.namedtupleiterator(df1));"
],
"metadata": {},
"execution_count": 23
},
{
"cell_type": "markdown",
"source": [
"as you can see the time to compile `Tables.namedtupleiterator` is very large in this case, and it would get much worse if the table was wider (that is why we include this tip in pitfalls notebook)"
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"the table below is tall"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m1000000×10 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m x6 \u001b[0m\u001b[1m \u001b[0m ⋯\n │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m \u001b[0m ⋯\n─────────┼──────────────────────────────────────────────────────────────────────\n 1 │ 0.791452 0.297286 0.049207 0.88973 0.309781 0.124727 ⋯\n 2 │ 0.220364 0.701363 0.37604 0.853463 0.292551 0.160833\n 3 │ 0.375985 0.0946538 0.401264 0.259818 0.426253 0.992351\n 4 │ 0.190554 0.799914 0.814485 0.664285 0.243209 0.538027\n 5 │ 0.794052 0.707524 0.982855 0.679856 0.33846 0.377405 ⋯\n 6 │ 0.388176 0.641699 0.793341 0.455715 0.24483 0.519513\n 7 │ 0.432196 0.0478298 0.691303 0.952995 0.352697 0.359571\n 8 │ 0.796189 0.0284523 0.102914 0.570933 0.207063 0.212477\n ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱\n 999994 │ 0.165754 0.677366 0.134086 0.644506 0.594977 0.0533597 ⋯\n 999995 │ 0.168486 0.869069 0.0727793 0.0418242 0.226054 0.0179684\n 999996 │ 0.0873745 0.77549 0.224858 0.176993 0.387384 0.100599\n 999997 │ 0.792855 0.471462 0.23574 0.18291 0.235345 0.801913\n 999998 │ 0.82707 0.813067 0.0496293 0.81687 0.0583434 0.825218 ⋯\n 999999 │ 0.805874 0.665029 0.376935 0.849447 0.672471 0.246654\n 1000000 │ 0.357941 0.279164 0.286021 0.229009 0.569281 0.348493\n\u001b[36m 4 columns and 999985 rows omitted\u001b[0m",
"text/html": [
"1000000×10 DataFrame
999975 rows omitted
1 | 0.791452 | 0.297286 | 0.049207 | 0.88973 | 0.309781 | 0.124727 | 0.526994 | 0.376758 | 0.982059 | 0.201495 |
2 | 0.220364 | 0.701363 | 0.37604 | 0.853463 | 0.292551 | 0.160833 | 0.403648 | 0.806145 | 0.686896 | 0.319294 |
3 | 0.375985 | 0.0946538 | 0.401264 | 0.259818 | 0.426253 | 0.992351 | 0.351296 | 0.897025 | 0.642545 | 0.36698 |
4 | 0.190554 | 0.799914 | 0.814485 | 0.664285 | 0.243209 | 0.538027 | 0.281022 | 0.716538 | 0.735261 | 0.0711319 |
5 | 0.794052 | 0.707524 | 0.982855 | 0.679856 | 0.33846 | 0.377405 | 0.393751 | 0.78178 | 0.44972 | 0.205827 |
6 | 0.388176 | 0.641699 | 0.793341 | 0.455715 | 0.24483 | 0.519513 | 0.697198 | 0.371501 | 0.707692 | 0.252749 |
7 | 0.432196 | 0.0478298 | 0.691303 | 0.952995 | 0.352697 | 0.359571 | 0.597546 | 0.151098 | 0.413731 | 0.00542862 |
8 | 0.796189 | 0.0284523 | 0.102914 | 0.570933 | 0.207063 | 0.212477 | 0.598974 | 0.391049 | 0.290673 | 0.416863 |
9 | 0.514341 | 0.574274 | 0.00955344 | 0.0426117 | 0.874077 | 0.198039 | 0.504121 | 0.848645 | 0.607743 | 0.464585 |
10 | 0.678113 | 0.601279 | 0.30789 | 0.037617 | 0.198317 | 0.0906097 | 0.0780964 | 0.339857 | 0.0657225 | 0.0773766 |
11 | 0.216809 | 0.91802 | 0.663682 | 0.813137 | 0.988659 | 0.0123706 | 0.322914 | 0.79604 | 0.382519 | 0.227306 |
12 | 0.952948 | 0.797773 | 0.919703 | 0.983742 | 0.976059 | 0.818607 | 0.29084 | 0.657554 | 0.921209 | 0.362167 |
13 | 0.370819 | 0.777084 | 0.343126 | 0.457476 | 0.411953 | 0.506055 | 0.829612 | 0.894446 | 0.113823 | 0.804729 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
999989 | 0.59812 | 0.341671 | 0.756817 | 0.129677 | 0.558078 | 0.119235 | 0.662959 | 0.619881 | 0.902543 | 0.332156 |
999990 | 0.795765 | 0.528571 | 0.711405 | 0.0577124 | 0.930984 | 0.551859 | 0.703604 | 0.269038 | 0.557876 | 0.433196 |
999991 | 0.579624 | 0.233688 | 0.215393 | 0.304601 | 0.851172 | 0.204315 | 0.0941204 | 0.294733 | 0.745845 | 0.504573 |
999992 | 0.495098 | 0.165949 | 0.462764 | 0.83711 | 0.11577 | 0.769074 | 0.10481 | 0.630765 | 0.154507 | 0.641631 |
999993 | 0.695556 | 0.318069 | 0.398515 | 0.740937 | 0.858254 | 0.235751 | 0.184672 | 0.0391809 | 0.419516 | 0.183319 |
999994 | 0.165754 | 0.677366 | 0.134086 | 0.644506 | 0.594977 | 0.0533597 | 0.616334 | 0.391897 | 0.161337 | 0.967639 |
999995 | 0.168486 | 0.869069 | 0.0727793 | 0.0418242 | 0.226054 | 0.0179684 | 0.614555 | 0.911016 | 0.256836 | 0.658037 |
999996 | 0.0873745 | 0.77549 | 0.224858 | 0.176993 | 0.387384 | 0.100599 | 0.333415 | 0.283454 | 0.292832 | 0.447891 |
999997 | 0.792855 | 0.471462 | 0.23574 | 0.18291 | 0.235345 | 0.801913 | 0.563771 | 0.558161 | 0.508129 | 0.595255 |
999998 | 0.82707 | 0.813067 | 0.0496293 | 0.81687 | 0.0583434 | 0.825218 | 0.63372 | 0.902962 | 0.823672 | 0.417822 |
999999 | 0.805874 | 0.665029 | 0.376935 | 0.849447 | 0.672471 | 0.246654 | 0.568449 | 0.506283 | 0.0738984 | 0.650809 |
1000000 | 0.357941 | 0.279164 | 0.286021 | 0.229009 | 0.569281 | 0.348493 | 0.592393 | 0.922544 | 0.825604 | 0.48944 |
"
]
},
"metadata": {},
"execution_count": 24
}
],
"cell_type": "code",
"source": [
"df2 = DataFrame(rand(10^6, 10), :auto)"
],
"metadata": {},
"execution_count": 24
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2.127519 seconds (60.08 M allocations: 1.056 GiB, 6.17% gc time, 3.69% compilation time)\n"
]
}
],
"cell_type": "code",
"source": [
"@time map(sum, eachrow(df2));"
],
"metadata": {},
"execution_count": 25
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2.119773 seconds (59.99 M allocations: 1.050 GiB, 6.50% gc time)\n"
]
}
],
"cell_type": "code",
"source": [
"@time map(sum, eachrow(df2));"
],
"metadata": {},
"execution_count": 26
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.195608 seconds (200.06 k allocations: 21.156 MiB, 24.98% gc time, 96.98% compilation time)\n"
]
}
],
"cell_type": "code",
"source": [
"@time map(sum, Tables.namedtupleiterator(df2));"
],
"metadata": {},
"execution_count": 27
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.005788 seconds (22 allocations: 7.631 MiB)\n"
]
}
],
"cell_type": "code",
"source": [
"@time map(sum, Tables.namedtupleiterator(df2));"
],
"metadata": {},
"execution_count": 28
},
{
"cell_type": "markdown",
"source": [
"as you can see - this time it is much faster to iterate a type stable container\n",
"still you might want to use the `select` syntax, which is optimized for such reductions:"
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"this includes compilation time"
],
"metadata": {}
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.392389 seconds (509.34 k allocations: 41.875 MiB, 98.52% compilation time: 93% of which was recompilation)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": "1000000-element Vector{Float64}:\n 4.549489608990553\n 4.820598260856793\n 4.808170226496026\n 5.054427780801031\n 5.711228368654866\n 5.0724123056568216\n 4.00439573440345\n 3.61558727488398\n 4.637989455369663\n 2.474878422732406\n ⋮\n 4.377478339806004\n 4.073770679842652\n 4.40725544523935\n 3.836624901881449\n 3.110289532802179\n 4.945542466292135\n 6.168372468024979\n 5.415848010904047\n 4.899889662604407"
},
"metadata": {},
"execution_count": 29
}
],
"cell_type": "code",
"source": [
"@time select(df2, AsTable(:) => ByRow(sum) => \"sum\").sum"
],
"metadata": {},
"execution_count": 29
},
{
"cell_type": "markdown",
"source": [
"Do it again"
],
"metadata": {}
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.004722 seconds (125 allocations: 7.635 MiB)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": "1000000-element Vector{Float64}:\n 4.549489608990553\n 4.820598260856793\n 4.808170226496026\n 5.054427780801031\n 5.711228368654866\n 5.0724123056568216\n 4.00439573440345\n 3.61558727488398\n 4.637989455369663\n 2.474878422732406\n ⋮\n 4.377478339806004\n 4.073770679842652\n 4.40725544523935\n 3.836624901881449\n 3.110289532802179\n 4.945542466292135\n 6.168372468024979\n 5.415848010904047\n 4.899889662604407"
},
"metadata": {},
"execution_count": 30
}
],
"cell_type": "code",
"source": [
"@time select(df2, AsTable(:) => ByRow(sum) => \"sum\").sum"
],
"metadata": {},
"execution_count": 30
},
{
"cell_type": "markdown",
"source": [
"---\n",
"\n",
"*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
],
"metadata": {}
}
],
"nbformat_minor": 3,
"metadata": {
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.10.5"
},
"kernelspec": {
"name": "julia-1.10",
"display_name": "Julia 1.10.5",
"language": "julia"
}
},
"nbformat": 4
}