{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Transformation to DataFrames\n",
"Split-apply-combine"
],
"metadata": {}
},
{
"outputs": [],
"cell_type": "code",
"source": [
"using DataFrames"
],
"metadata": {},
"execution_count": 1
},
{
"cell_type": "markdown",
"source": [
"## Grouping a data frame\n",
"`groupby`"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m8×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 2 2 0.369038\n 3 │ 3 1 0.814633\n 4 │ 4 2 0.377291\n 5 │ 1 1 0.0800426\n 6 │ 2 2 0.796786\n 7 │ 3 1 0.21076\n 8 │ 4 2 0.75457",
"text/html": [
"
1 | 1 | 1 | 0.325215 |
2 | 2 | 2 | 0.369038 |
3 | 3 | 1 | 0.814633 |
4 | 4 | 2 | 0.377291 |
5 | 1 | 1 | 0.0800426 |
6 | 2 | 2 | 0.796786 |
7 | 3 | 1 | 0.21076 |
8 | 4 | 2 | 0.75457 |
"
]
},
"metadata": {},
"execution_count": 2
}
],
"cell_type": "code",
"source": [
"x = DataFrame(id=[1, 2, 3, 4, 1, 2, 3, 4], id2=[1, 2, 1, 2, 1, 2, 1, 2], v=rand(8))"
],
"metadata": {},
"execution_count": 2
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 4 groups based on key: id\nFirst Group (2 rows): id = 1\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426\n⋮\nLast Group (2 rows): id = 4\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼────────────────────────\n 1 │ 4 2 0.377291\n 2 │ 4 2 0.75457",
"text/html": [
"GroupedDataFrame with 4 groups based on key: id
First Group (2 rows): id = 1
⋮
Last Group (2 rows): id = 4
"
]
},
"metadata": {},
"execution_count": 3
}
],
"cell_type": "code",
"source": [
"groupby(x, :id)"
],
"metadata": {},
"execution_count": 3
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 1 group based on key: \nFirst Group (8 rows): \n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 2 2 0.369038\n 3 │ 3 1 0.814633\n 4 │ 4 2 0.377291\n 5 │ 1 1 0.0800426\n 6 │ 2 2 0.796786\n 7 │ 3 1 0.21076\n 8 │ 4 2 0.75457",
"text/html": [
"GroupedDataFrame with 1 group based on key:
1 | 1 | 1 | 0.325215 |
2 | 2 | 2 | 0.369038 |
3 | 3 | 1 | 0.814633 |
4 | 4 | 2 | 0.377291 |
5 | 1 | 1 | 0.0800426 |
6 | 2 | 2 | 0.796786 |
7 | 3 | 1 | 0.21076 |
8 | 4 | 2 | 0.75457 |
"
]
},
"metadata": {},
"execution_count": 4
}
],
"cell_type": "code",
"source": [
"groupby(x, [])"
],
"metadata": {},
"execution_count": 4
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 4 groups based on keys: id, id2\nFirst Group (2 rows): id = 1, id2 = 1\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426\n⋮\nLast Group (2 rows): id = 4, id2 = 2\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼────────────────────────\n 1 │ 4 2 0.377291\n 2 │ 4 2 0.75457",
"text/html": [
"GroupedDataFrame with 4 groups based on keys: id, id2
First Group (2 rows): id = 1, id2 = 1
⋮
Last Group (2 rows): id = 4, id2 = 2
"
]
},
"metadata": {},
"execution_count": 5
}
],
"cell_type": "code",
"source": [
"gx2 = groupby(x, [:id, :id2])"
],
"metadata": {},
"execution_count": 5
},
{
"cell_type": "markdown",
"source": [
"get the parent DataFrame"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m8×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 2 2 0.369038\n 3 │ 3 1 0.814633\n 4 │ 4 2 0.377291\n 5 │ 1 1 0.0800426\n 6 │ 2 2 0.796786\n 7 │ 3 1 0.21076\n 8 │ 4 2 0.75457",
"text/html": [
"1 | 1 | 1 | 0.325215 |
2 | 2 | 2 | 0.369038 |
3 | 3 | 1 | 0.814633 |
4 | 4 | 2 | 0.377291 |
5 | 1 | 1 | 0.0800426 |
6 | 2 | 2 | 0.796786 |
7 | 3 | 1 | 0.21076 |
8 | 4 | 2 | 0.75457 |
"
]
},
"metadata": {},
"execution_count": 6
}
],
"cell_type": "code",
"source": [
"parent(gx2)"
],
"metadata": {},
"execution_count": 6
},
{
"cell_type": "markdown",
"source": [
"back to the DataFrame, but in a different order of rows than the original"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m8×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426\n 3 │ 2 2 0.369038\n 4 │ 2 2 0.796786\n 5 │ 3 1 0.814633\n 6 │ 3 1 0.21076\n 7 │ 4 2 0.377291\n 8 │ 4 2 0.75457",
"text/html": [
"1 | 1 | 1 | 0.325215 |
2 | 1 | 1 | 0.0800426 |
3 | 2 | 2 | 0.369038 |
4 | 2 | 2 | 0.796786 |
5 | 3 | 1 | 0.814633 |
6 | 3 | 1 | 0.21076 |
7 | 4 | 2 | 0.377291 |
8 | 4 | 2 | 0.75457 |
"
]
},
"metadata": {},
"execution_count": 7
}
],
"cell_type": "code",
"source": [
"vcat(gx2...)"
],
"metadata": {},
"execution_count": 7
},
{
"cell_type": "markdown",
"source": [
"the same as above"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m8×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426\n 3 │ 2 2 0.369038\n 4 │ 2 2 0.796786\n 5 │ 3 1 0.814633\n 6 │ 3 1 0.21076\n 7 │ 4 2 0.377291\n 8 │ 4 2 0.75457",
"text/html": [
"1 | 1 | 1 | 0.325215 |
2 | 1 | 1 | 0.0800426 |
3 | 2 | 2 | 0.369038 |
4 | 2 | 2 | 0.796786 |
5 | 3 | 1 | 0.814633 |
6 | 3 | 1 | 0.21076 |
7 | 4 | 2 | 0.377291 |
8 | 4 | 2 | 0.75457 |
"
]
},
"metadata": {},
"execution_count": 8
}
],
"cell_type": "code",
"source": [
"DataFrame(gx2)"
],
"metadata": {},
"execution_count": 8
},
{
"cell_type": "markdown",
"source": [
"drop grouping columns when creating a data frame"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m8×1 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m v \u001b[0m\n │\u001b[90m Float64 \u001b[0m\n─────┼───────────\n 1 │ 0.325215\n 2 │ 0.0800426\n 3 │ 0.369038\n 4 │ 0.796786\n 5 │ 0.814633\n 6 │ 0.21076\n 7 │ 0.377291\n 8 │ 0.75457",
"text/html": [
"1 | 0.325215 |
2 | 0.0800426 |
3 | 0.369038 |
4 | 0.796786 |
5 | 0.814633 |
6 | 0.21076 |
7 | 0.377291 |
8 | 0.75457 |
"
]
},
"metadata": {},
"execution_count": 9
}
],
"cell_type": "code",
"source": [
"DataFrame(gx2, keepkeys=false)"
],
"metadata": {},
"execution_count": 9
},
{
"cell_type": "markdown",
"source": [
"vector of names of grouping variables"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "2-element Vector{Symbol}:\n :id\n :id2"
},
"metadata": {},
"execution_count": 10
}
],
"cell_type": "code",
"source": [
"groupcols(gx2)"
],
"metadata": {},
"execution_count": 10
},
{
"cell_type": "markdown",
"source": [
"and non-grouping variables"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "1-element Vector{Symbol}:\n :v"
},
"metadata": {},
"execution_count": 11
}
],
"cell_type": "code",
"source": [
"valuecols(gx2)"
],
"metadata": {},
"execution_count": 11
},
{
"cell_type": "markdown",
"source": [
"group indices in parent(gx2)"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "8-element Vector{Union{Missing, Int64}}:\n 1\n 2\n 3\n 4\n 1\n 2\n 3\n 4"
},
"metadata": {},
"execution_count": 12
}
],
"cell_type": "code",
"source": [
"groupindices(gx2)"
],
"metadata": {},
"execution_count": 12
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "4-element DataFrames.GroupKeys{DataFrames.GroupedDataFrame{DataFrames.DataFrame}}:\n GroupKey: (id = 1, id2 = 1)\n GroupKey: (id = 2, id2 = 2)\n GroupKey: (id = 3, id2 = 1)\n GroupKey: (id = 4, id2 = 2)"
},
"metadata": {},
"execution_count": 13
}
],
"cell_type": "code",
"source": [
"kgx2 = keys(gx2)"
],
"metadata": {},
"execution_count": 13
},
{
"cell_type": "markdown",
"source": [
"You can index into a `GroupedDataFrame` like to a vector or to a dictionary. The second form accepts `GroupKey`, `NamedTuple` or a `Tuple`."
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 4 groups based on keys: id, id2\nFirst Group (2 rows): id = 1, id2 = 1\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426\n⋮\nLast Group (2 rows): id = 4, id2 = 2\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼────────────────────────\n 1 │ 4 2 0.377291\n 2 │ 4 2 0.75457",
"text/html": [
"GroupedDataFrame with 4 groups based on keys: id, id2
First Group (2 rows): id = 1, id2 = 1
⋮
Last Group (2 rows): id = 4, id2 = 2
"
]
},
"metadata": {},
"execution_count": 14
}
],
"cell_type": "code",
"source": [
"gx2"
],
"metadata": {},
"execution_count": 14
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupKey: (id = 1, id2 = 1)"
},
"metadata": {},
"execution_count": 15
}
],
"cell_type": "code",
"source": [
"k = keys(gx2)[1]"
],
"metadata": {},
"execution_count": 15
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(id = 1, id2 = 1)"
},
"metadata": {},
"execution_count": 16
}
],
"cell_type": "code",
"source": [
"ntk = NamedTuple(k)"
],
"metadata": {},
"execution_count": 16
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(1, 1)"
},
"metadata": {},
"execution_count": 17
}
],
"cell_type": "code",
"source": [
"tk = Tuple(k)"
],
"metadata": {},
"execution_count": 17
},
{
"cell_type": "markdown",
"source": [
"the operations below produce the same result and are proformant"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(\u001b[1m2×3 SubDataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426, \u001b[1m2×3 SubDataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426, \u001b[1m2×3 SubDataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426, \u001b[1m2×3 SubDataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 0.325215\n 2 │ 1 1 0.0800426)"
},
"metadata": {},
"execution_count": 18
}
],
"cell_type": "code",
"source": [
"gx2[1], gx2[k], gx2[ntk], gx2[tk]"
],
"metadata": {},
"execution_count": 18
},
{
"cell_type": "markdown",
"source": [
"handling missing values"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m5×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────\n 1 │\u001b[90m missing \u001b[0m 1\n 2 │ 5 2\n 3 │ 1 3\n 4 │ 3 4\n 5 │\u001b[90m missing \u001b[0m 5",
"text/html": [
"1 | missing | 1 |
2 | 5 | 2 |
3 | 1 | 3 |
4 | 3 | 4 |
5 | missing | 5 |
"
]
},
"metadata": {},
"execution_count": 19
}
],
"cell_type": "code",
"source": [
"x = DataFrame(id=[missing, 5, 1, 3, missing], x=1:5)"
],
"metadata": {},
"execution_count": 19
},
{
"cell_type": "markdown",
"source": [
"by default groups include missing values and their order is not guaranteed"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 4 groups based on key: id\nFirst Group (1 row): id = 1\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────\n 1 │ 1 3\n⋮\nLast Group (2 rows): id = missing\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────\n 1 │\u001b[90m missing \u001b[0m 1\n 2 │\u001b[90m missing \u001b[0m 5",
"text/html": [
"GroupedDataFrame with 4 groups based on key: id
First Group (1 row): id = 1
⋮
Last Group (2 rows): id = missing
"
]
},
"metadata": {},
"execution_count": 20
}
],
"cell_type": "code",
"source": [
"groupby(x, :id)"
],
"metadata": {},
"execution_count": 20
},
{
"cell_type": "markdown",
"source": [
"but we can change it; now they are sorted"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 3 groups based on key: id\nFirst Group (1 row): id = 1\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────\n 1 │ 1 3\n⋮\nLast Group (1 row): id = 5\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────\n 1 │ 5 2",
"text/html": [
"GroupedDataFrame with 3 groups based on key: id
First Group (1 row): id = 1
⋮
Last Group (1 row): id = 5
"
]
},
"metadata": {},
"execution_count": 21
}
],
"cell_type": "code",
"source": [
"groupby(x, :id, sort=true, skipmissing=true)"
],
"metadata": {},
"execution_count": 21
},
{
"cell_type": "markdown",
"source": [
"and now they are in the order they appear in the source data frame"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "GroupedDataFrame with 4 groups based on key: id\nFirst Group (2 rows): id = missing\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────\n 1 │\u001b[90m missing \u001b[0m 1\n 2 │\u001b[90m missing \u001b[0m 5\n⋮\nLast Group (1 row): id = 3\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────\n 1 │ 3 4",
"text/html": [
"GroupedDataFrame with 4 groups based on key: id
First Group (2 rows): id = missing
⋮
Last Group (1 row): id = 3
"
]
},
"metadata": {},
"execution_count": 22
}
],
"cell_type": "code",
"source": [
"groupby(x, :id, sort=false)"
],
"metadata": {},
"execution_count": 22
},
{
"cell_type": "markdown",
"source": [
"## Performing transformations\n",
"by group using combine, select, select!, transform, and transform!"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m100×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v \u001b[0m\n │\u001b[90m Char \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼─────────────────\n 1 │ d 0.911682\n 2 │ a 0.336318\n 3 │ c 0.569249\n 4 │ a 0.860833\n 5 │ d 0.990385\n 6 │ d 0.671075\n 7 │ b 0.456261\n 8 │ a 0.663899\n ⋮ │ ⋮ ⋮\n 94 │ d 0.6734\n 95 │ c 0.581931\n 96 │ d 0.732172\n 97 │ a 0.647178\n 98 │ a 0.746204\n 99 │ a 0.0444478\n 100 │ c 0.206397\n\u001b[36m 85 rows omitted\u001b[0m",
"text/html": [
"100×2 DataFrame
75 rows omitted
1 | d | 0.911682 |
2 | a | 0.336318 |
3 | c | 0.569249 |
4 | a | 0.860833 |
5 | d | 0.990385 |
6 | d | 0.671075 |
7 | b | 0.456261 |
8 | a | 0.663899 |
9 | a | 0.673223 |
10 | c | 0.822798 |
11 | b | 0.948415 |
12 | d | 0.462216 |
13 | b | 0.772046 |
⋮ | ⋮ | ⋮ |
89 | b | 0.736596 |
90 | c | 0.010389 |
91 | d | 0.958571 |
92 | b | 0.132037 |
93 | c | 0.132203 |
94 | d | 0.6734 |
95 | c | 0.581931 |
96 | d | 0.732172 |
97 | a | 0.647178 |
98 | a | 0.746204 |
99 | a | 0.0444478 |
100 | c | 0.206397 |
"
]
},
"metadata": {},
"execution_count": 23
}
],
"cell_type": "code",
"source": [
"using Statistics\n",
"using Chain\n",
"\n",
"x = DataFrame(id=rand('a':'d', 100), v=rand(100))"
],
"metadata": {},
"execution_count": 23
},
{
"cell_type": "markdown",
"source": [
"apply a function to each group of a data frame combine keeps as many rows as are returned from the function"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m4×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v_mean \u001b[0m\n │\u001b[90m Char \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼────────────────\n 1 │ d 0.579567\n 2 │ a 0.50468\n 3 │ c 0.512914\n 4 │ b 0.584901",
"text/html": [
"1 | d | 0.579567 |
2 | a | 0.50468 |
3 | c | 0.512914 |
4 | b | 0.584901 |
"
]
},
"metadata": {},
"execution_count": 24
}
],
"cell_type": "code",
"source": [
"@chain x begin\n",
" groupby(:id)\n",
" combine(:v => mean)\n",
"end"
],
"metadata": {},
"execution_count": 24
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "Base.OneTo(100)"
},
"metadata": {},
"execution_count": 25
}
],
"cell_type": "code",
"source": [
"x.id2 = axes(x, 1)"
],
"metadata": {},
"execution_count": 25
},
{
"cell_type": "markdown",
"source": [
"Select and transform keep as many rows as are in the source data frame and in correct order.\n",
"Additionally, transform keeps all columns from the source."
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m100×4 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v_mean \u001b[0m\n │\u001b[90m Char \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼──────────────────────────────────\n 1 │ d 0.911682 1 0.579567\n 2 │ a 0.336318 2 0.50468\n 3 │ c 0.569249 3 0.512914\n 4 │ a 0.860833 4 0.50468\n 5 │ d 0.990385 5 0.579567\n 6 │ d 0.671075 6 0.579567\n 7 │ b 0.456261 7 0.584901\n 8 │ a 0.663899 8 0.50468\n ⋮ │ ⋮ ⋮ ⋮ ⋮\n 94 │ d 0.6734 94 0.579567\n 95 │ c 0.581931 95 0.512914\n 96 │ d 0.732172 96 0.579567\n 97 │ a 0.647178 97 0.50468\n 98 │ a 0.746204 98 0.50468\n 99 │ a 0.0444478 99 0.50468\n 100 │ c 0.206397 100 0.512914\n\u001b[36m 85 rows omitted\u001b[0m",
"text/html": [
"100×4 DataFrame
75 rows omitted
1 | d | 0.911682 | 1 | 0.579567 |
2 | a | 0.336318 | 2 | 0.50468 |
3 | c | 0.569249 | 3 | 0.512914 |
4 | a | 0.860833 | 4 | 0.50468 |
5 | d | 0.990385 | 5 | 0.579567 |
6 | d | 0.671075 | 6 | 0.579567 |
7 | b | 0.456261 | 7 | 0.584901 |
8 | a | 0.663899 | 8 | 0.50468 |
9 | a | 0.673223 | 9 | 0.50468 |
10 | c | 0.822798 | 10 | 0.512914 |
11 | b | 0.948415 | 11 | 0.584901 |
12 | d | 0.462216 | 12 | 0.579567 |
13 | b | 0.772046 | 13 | 0.584901 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
89 | b | 0.736596 | 89 | 0.584901 |
90 | c | 0.010389 | 90 | 0.512914 |
91 | d | 0.958571 | 91 | 0.579567 |
92 | b | 0.132037 | 92 | 0.584901 |
93 | c | 0.132203 | 93 | 0.512914 |
94 | d | 0.6734 | 94 | 0.579567 |
95 | c | 0.581931 | 95 | 0.512914 |
96 | d | 0.732172 | 96 | 0.579567 |
97 | a | 0.647178 | 97 | 0.50468 |
98 | a | 0.746204 | 98 | 0.50468 |
99 | a | 0.0444478 | 99 | 0.50468 |
100 | c | 0.206397 | 100 | 0.512914 |
"
]
},
"metadata": {},
"execution_count": 26
}
],
"cell_type": "code",
"source": [
"@chain x begin\n",
" groupby(:id)\n",
" transform(:v => mean)\n",
"end"
],
"metadata": {},
"execution_count": 26
},
{
"cell_type": "markdown",
"source": [
"note that combine reorders rows by group of GroupedDataFrame"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m100×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m v_mean \u001b[0m\n │\u001b[90m Char \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼───────────────────────\n 1 │ d 1 0.579567\n 2 │ d 5 0.579567\n 3 │ d 6 0.579567\n 4 │ d 12 0.579567\n 5 │ d 16 0.579567\n 6 │ d 22 0.579567\n 7 │ d 28 0.579567\n 8 │ d 30 0.579567\n ⋮ │ ⋮ ⋮ ⋮\n 94 │ b 71 0.584901\n 95 │ b 74 0.584901\n 96 │ b 77 0.584901\n 97 │ b 84 0.584901\n 98 │ b 88 0.584901\n 99 │ b 89 0.584901\n 100 │ b 92 0.584901\n\u001b[36m 85 rows omitted\u001b[0m",
"text/html": [
"100×3 DataFrame
75 rows omitted
1 | d | 1 | 0.579567 |
2 | d | 5 | 0.579567 |
3 | d | 6 | 0.579567 |
4 | d | 12 | 0.579567 |
5 | d | 16 | 0.579567 |
6 | d | 22 | 0.579567 |
7 | d | 28 | 0.579567 |
8 | d | 30 | 0.579567 |
9 | d | 35 | 0.579567 |
10 | d | 37 | 0.579567 |
11 | d | 38 | 0.579567 |
12 | d | 39 | 0.579567 |
13 | d | 40 | 0.579567 |
⋮ | ⋮ | ⋮ | ⋮ |
89 | b | 58 | 0.584901 |
90 | b | 59 | 0.584901 |
91 | b | 64 | 0.584901 |
92 | b | 65 | 0.584901 |
93 | b | 69 | 0.584901 |
94 | b | 71 | 0.584901 |
95 | b | 74 | 0.584901 |
96 | b | 77 | 0.584901 |
97 | b | 84 | 0.584901 |
98 | b | 88 | 0.584901 |
99 | b | 89 | 0.584901 |
100 | b | 92 | 0.584901 |
"
]
},
"metadata": {},
"execution_count": 27
}
],
"cell_type": "code",
"source": [
"@chain x begin\n",
" groupby(:id)\n",
" combine(:id2, :v => mean)\n",
"end"
],
"metadata": {},
"execution_count": 27
},
{
"cell_type": "markdown",
"source": [
"we give a custom name for the result column"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m4×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m res \u001b[0m\n │\u001b[90m Char \u001b[0m\u001b[90m Float64 \u001b[0m\n─────┼────────────────\n 1 │ d 0.579567\n 2 │ a 0.50468\n 3 │ c 0.512914\n 4 │ b 0.584901",
"text/html": [
"1 | d | 0.579567 |
2 | a | 0.50468 |
3 | c | 0.512914 |
4 | b | 0.584901 |
"
]
},
"metadata": {},
"execution_count": 28
}
],
"cell_type": "code",
"source": [
"@chain x begin\n",
" groupby(:id)\n",
" combine(:v => mean => :res)\n",
"end"
],
"metadata": {},
"execution_count": 28
},
{
"cell_type": "markdown",
"source": [
"you can have multiple operations"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m4×4 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m res1 \u001b[0m\u001b[1m res2 \u001b[0m\u001b[1m n \u001b[0m\n │\u001b[90m Char \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────────────────\n 1 │ d 0.579567 15.6483 27\n 2 │ a 0.50468 9.08424 18\n 3 │ c 0.512914 13.8487 27\n 4 │ b 0.584901 16.3772 28",
"text/html": [
"1 | d | 0.579567 | 15.6483 | 27 |
2 | a | 0.50468 | 9.08424 | 18 |
3 | c | 0.512914 | 13.8487 | 27 |
4 | b | 0.584901 | 16.3772 | 28 |
"
]
},
"metadata": {},
"execution_count": 29
}
],
"cell_type": "code",
"source": [
"@chain x begin\n",
" groupby(:id)\n",
" combine(:v => mean => :res1, :v => sum => :res2, nrow => :n)\n",
"end"
],
"metadata": {},
"execution_count": 29
},
{
"cell_type": "markdown",
"source": [
"Additional notes:\n",
"\n",
"+ `select!` and `transform!` perform operations in-place\n",
"+ The general syntax for transformation is `source_columns => function => target_column`\n",
"+ if you pass multiple columns to a function they are treated as positional arguments\n",
"+ `ByRow` and `AsTable` work exactly like discussed for operations on data frames in 05_columns.ipynb\n",
"+ you can automatically groupby again the result of `combine`, `select` etc. by passing `ungroup=false` keyword argument to them\n",
"+ similarly `keepkeys` keyword argument allows you to drop grouping columns from the resulting data frame\n",
"\n",
"It is also allowed to pass a function to all these functions (also - as a special case, as a first argument). In this case the return value can be a table. In particular it allows for an easy dropping of groups if you return an empty table from the function.\n",
"\n",
"If you pass a function you can use a `do` block syntax. In case of passing a function it gets a `SubDataFrame` as its argument.\n",
"\n",
"Here is an example:"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m n \u001b[0m\n │\u001b[90m Char \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────\n 1 │ d 27\n 2 │ c 27\n 3 │ b 28",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 30
}
],
"cell_type": "code",
"source": [
"combine(groupby(x, :id)) do sdf\n",
" n = nrow(sdf)\n",
" n < 25 ? DataFrame() : DataFrame(n=n) ## drop groups with low number of rows\n",
"end"
],
"metadata": {},
"execution_count": 30
},
{
"cell_type": "markdown",
"source": [
"You can also produce multiple columns in a single operation:"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m4×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m val \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 1\n 2 │ 1 2\n 3 │ 2 3\n 4 │ 2 4",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 31
}
],
"cell_type": "code",
"source": [
"df = DataFrame(id=[1, 1, 2, 2], val=[1, 2, 3, 4])"
],
"metadata": {},
"execution_count": 31
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────\n 1 │ 1 1 2\n 2 │ 2 3 4",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 32
}
],
"cell_type": "code",
"source": [
"@chain df begin\n",
" groupby(:id)\n",
" combine(:val => (x -> [x]) => AsTable)\n",
"end"
],
"metadata": {},
"execution_count": 32
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m c1 \u001b[0m\u001b[1m c2 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────\n 1 │ 1 1 2\n 2 │ 2 3 4",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 33
}
],
"cell_type": "code",
"source": [
"@chain df begin\n",
" groupby(:id)\n",
" combine(:val => (x -> [x]) => [:c1, :c2])\n",
"end"
],
"metadata": {},
"execution_count": 33
},
{
"cell_type": "markdown",
"source": [
"It is easy to unnest the column into multiple columns,"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m p \u001b[0m\u001b[1m q \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 2\n 2 │ 3 4",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 34
}
],
"cell_type": "code",
"source": [
"df = DataFrame(a=[(p=1, q=2), (p=3, q=4)])\n",
"select(df, :a => AsTable)"
],
"metadata": {},
"execution_count": 34
},
{
"cell_type": "markdown",
"source": [
"automatic column names generated"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 2\n 2 │ 3 4",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 35
}
],
"cell_type": "code",
"source": [
"df = DataFrame(a=[[1, 2], [3, 4]])\n",
"select(df, :a => AsTable)"
],
"metadata": {},
"execution_count": 35
},
{
"cell_type": "markdown",
"source": [
"custom column names generated"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m C1 \u001b[0m\u001b[1m C2 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 2\n 2 │ 3 4",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 36
}
],
"cell_type": "code",
"source": [
"select(df, :a => [:C1, :C2])"
],
"metadata": {},
"execution_count": 36
},
{
"cell_type": "markdown",
"source": [
"Finally, observe that one can conveniently apply multiple transformations using broadcasting:"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m100×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────\n 1 │ 1 1 101\n 2 │ 2 2 102\n 3 │ 3 3 103\n 4 │ 4 4 104\n 5 │ 5 5 105\n 6 │ 6 6 106\n 7 │ 7 7 107\n 8 │ 8 8 108\n ⋮ │ ⋮ ⋮ ⋮\n 94 │ 4 94 194\n 95 │ 5 95 195\n 96 │ 6 96 196\n 97 │ 7 97 197\n 98 │ 8 98 198\n 99 │ 9 99 199\n 100 │ 10 100 200\n\u001b[36m 85 rows omitted\u001b[0m",
"text/html": [
"100×3 DataFrame
75 rows omitted
1 | 1 | 1 | 101 |
2 | 2 | 2 | 102 |
3 | 3 | 3 | 103 |
4 | 4 | 4 | 104 |
5 | 5 | 5 | 105 |
6 | 6 | 6 | 106 |
7 | 7 | 7 | 107 |
8 | 8 | 8 | 108 |
9 | 9 | 9 | 109 |
10 | 10 | 10 | 110 |
11 | 1 | 11 | 111 |
12 | 2 | 12 | 112 |
13 | 3 | 13 | 113 |
⋮ | ⋮ | ⋮ | ⋮ |
89 | 9 | 89 | 189 |
90 | 10 | 90 | 190 |
91 | 1 | 91 | 191 |
92 | 2 | 92 | 192 |
93 | 3 | 93 | 193 |
94 | 4 | 94 | 194 |
95 | 5 | 95 | 195 |
96 | 6 | 96 | 196 |
97 | 7 | 97 | 197 |
98 | 8 | 98 | 198 |
99 | 9 | 99 | 199 |
100 | 10 | 100 | 200 |
"
]
},
"metadata": {},
"execution_count": 37
}
],
"cell_type": "code",
"source": [
"df = DataFrame(id=repeat(1:10, 10), x1=1:100, x2=101:200)"
],
"metadata": {},
"execution_count": 37
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m10×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x1_minimum \u001b[0m\u001b[1m x2_minimum \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────────────────────\n 1 │ 1 1 101\n 2 │ 2 2 102\n 3 │ 3 3 103\n 4 │ 4 4 104\n 5 │ 5 5 105\n 6 │ 6 6 106\n 7 │ 7 7 107\n 8 │ 8 8 108\n 9 │ 9 9 109\n 10 │ 10 10 110",
"text/html": [
"1 | 1 | 1 | 101 |
2 | 2 | 2 | 102 |
3 | 3 | 3 | 103 |
4 | 4 | 4 | 104 |
5 | 5 | 5 | 105 |
6 | 6 | 6 | 106 |
7 | 7 | 7 | 107 |
8 | 8 | 8 | 108 |
9 | 9 | 9 | 109 |
10 | 10 | 10 | 110 |
"
]
},
"metadata": {},
"execution_count": 38
}
],
"cell_type": "code",
"source": [
"@chain df begin\n",
" groupby(:id)\n",
" combine([:x1, :x2] .=> minimum)\n",
"end"
],
"metadata": {},
"execution_count": 38
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m10×5 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x1_minimum \u001b[0m\u001b[1m x2_minimum \u001b[0m\u001b[1m x1_maximum \u001b[0m\u001b[1m x2_maximum \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────────────────────────────────────────────\n 1 │ 1 1 101 91 191\n 2 │ 2 2 102 92 192\n 3 │ 3 3 103 93 193\n 4 │ 4 4 104 94 194\n 5 │ 5 5 105 95 195\n 6 │ 6 6 106 96 196\n 7 │ 7 7 107 97 197\n 8 │ 8 8 108 98 198\n 9 │ 9 9 109 99 199\n 10 │ 10 10 110 100 200",
"text/html": [
"1 | 1 | 1 | 101 | 91 | 191 |
2 | 2 | 2 | 102 | 92 | 192 |
3 | 3 | 3 | 103 | 93 | 193 |
4 | 4 | 4 | 104 | 94 | 194 |
5 | 5 | 5 | 105 | 95 | 195 |
6 | 6 | 6 | 106 | 96 | 196 |
7 | 7 | 7 | 107 | 97 | 197 |
8 | 8 | 8 | 108 | 98 | 198 |
9 | 9 | 9 | 109 | 99 | 199 |
10 | 10 | 10 | 110 | 100 | 200 |
"
]
},
"metadata": {},
"execution_count": 39
}
],
"cell_type": "code",
"source": [
"@chain df begin\n",
" groupby(:id)\n",
" combine([:x1, :x2] .=> [minimum maximum])\n",
"end"
],
"metadata": {},
"execution_count": 39
},
{
"cell_type": "markdown",
"source": [
"## Aggregation of a data frame using mapcols"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m10×10 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m x6 \u001b[0m\u001b[1m x7 \u001b[0m ⋯\n │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float6\u001b[0m ⋯\n─────┼──────────────────────────────────────────────────────────────────────────\n 1 │ 0.0497713 0.174038 0.335658 0.98617 0.818046 0.315802 0.1760 ⋯\n 2 │ 0.469067 0.532094 0.522563 0.698294 0.684567 0.713736 0.2363\n 3 │ 0.960727 0.792708 0.219594 0.991814 0.853047 0.661315 0.2278\n 4 │ 0.954378 0.464174 0.469449 0.207181 0.98027 0.811747 0.9080\n 5 │ 0.490458 0.356026 0.220018 0.672967 0.0462723 0.578973 0.8383 ⋯\n 6 │ 0.226693 0.975931 0.82475 0.132941 0.486135 0.189352 0.3053\n 7 │ 0.520552 0.0662263 0.154305 0.440129 0.508249 0.531972 0.6040\n 8 │ 0.881169 0.782526 0.13947 0.571747 0.240582 0.556918 0.9815\n 9 │ 0.615944 0.740941 0.155328 0.450876 0.731792 0.414014 0.9955 ⋯\n 10 │ 0.171597 0.927998 0.945204 0.761721 0.000796149 0.700339 0.8242\n\u001b[36m 4 columns omitted\u001b[0m",
"text/html": [
"1 | 0.0497713 | 0.174038 | 0.335658 | 0.98617 | 0.818046 | 0.315802 | 0.176087 | 0.88693 | 0.331478 | 0.565418 |
2 | 0.469067 | 0.532094 | 0.522563 | 0.698294 | 0.684567 | 0.713736 | 0.236339 | 0.436262 | 0.438289 | 0.501549 |
3 | 0.960727 | 0.792708 | 0.219594 | 0.991814 | 0.853047 | 0.661315 | 0.227833 | 0.796672 | 0.830572 | 0.0412893 |
4 | 0.954378 | 0.464174 | 0.469449 | 0.207181 | 0.98027 | 0.811747 | 0.908094 | 0.187583 | 0.127347 | 0.11277 |
5 | 0.490458 | 0.356026 | 0.220018 | 0.672967 | 0.0462723 | 0.578973 | 0.838352 | 0.145225 | 0.718935 | 0.254138 |
6 | 0.226693 | 0.975931 | 0.82475 | 0.132941 | 0.486135 | 0.189352 | 0.305374 | 0.478535 | 0.457187 | 0.0475593 |
7 | 0.520552 | 0.0662263 | 0.154305 | 0.440129 | 0.508249 | 0.531972 | 0.604084 | 0.38234 | 0.665801 | 0.563294 |
8 | 0.881169 | 0.782526 | 0.13947 | 0.571747 | 0.240582 | 0.556918 | 0.981525 | 0.813337 | 0.07942 | 0.616259 |
9 | 0.615944 | 0.740941 | 0.155328 | 0.450876 | 0.731792 | 0.414014 | 0.995569 | 0.825418 | 0.808066 | 0.795116 |
10 | 0.171597 | 0.927998 | 0.945204 | 0.761721 | 0.000796149 | 0.700339 | 0.824226 | 0.786892 | 0.33651 | 0.839618 |
"
]
},
"metadata": {},
"execution_count": 40
}
],
"cell_type": "code",
"source": [
"x = DataFrame(rand(10, 10), :auto)"
],
"metadata": {},
"execution_count": 40
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m1×10 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m x6 \u001b[0m\u001b[1m x7 \u001b[0m\u001b[1m x\u001b[0m ⋯\n │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m F\u001b[0m ⋯\n─────┼──────────────────────────────────────────────────────────────────────────\n 1 │ 0.534036 0.581266 0.398634 0.591384 0.534976 0.547417 0.609748 0 ⋯\n\u001b[36m 3 columns omitted\u001b[0m",
"text/html": [
"1 | 0.534036 | 0.581266 | 0.398634 | 0.591384 | 0.534976 | 0.547417 | 0.609748 | 0.573919 | 0.479361 | 0.433701 |
"
]
},
"metadata": {},
"execution_count": 41
}
],
"cell_type": "code",
"source": [
"mapcols(mean, x)"
],
"metadata": {},
"execution_count": 41
},
{
"cell_type": "markdown",
"source": [
"## Mapping rows and columns using eachcol and eachrow\n",
"map a function over each column and return a vector"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "10-element Vector{Float64}:\n 0.5340356166118051\n 0.581266221422634\n 0.39863395825014675\n 0.5913839384886548\n 0.5349757146385773\n 0.5474167357793596\n 0.6097484043642387\n 0.5739193910908156\n 0.47936052129134554\n 0.4337009355738717"
},
"metadata": {},
"execution_count": 42
}
],
"cell_type": "code",
"source": [
"map(mean, eachcol(x))"
],
"metadata": {},
"execution_count": 42
},
{
"cell_type": "markdown",
"source": [
"an iteration returns a Pair with column name and values"
],
"metadata": {}
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x1: 0.5340356166118051\n",
"x2: 0.581266221422634\n",
"x3: 0.39863395825014675\n",
"x4: 0.5913839384886548\n",
"x5: 0.5349757146385773\n",
"x6: 0.5474167357793596\n",
"x7: 0.6097484043642387\n",
"x8: 0.5739193910908156\n",
"x9: 0.47936052129134554\n",
"x10: 0.4337009355738717\n"
]
}
],
"cell_type": "code",
"source": [
"foreach(c -> println(c[1], \": \", mean(c[2])), pairs(eachcol(x)))"
],
"metadata": {},
"execution_count": 43
},
{
"cell_type": "markdown",
"source": [
"now the returned value is DataFrameRow which works as a NamedTuple but is a view to a parent DataFrame"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "10-element Vector{Float64}:\n 0.28598028361408456\n 0.8815489568706143\n 1.211955077276455\n 2.056076699409006\n 1.3775932036047553\n 0.23228344373783585\n 7.860203109455978\n 1.1260573845407145\n 0.8312995463574625\n 0.18491066455919594"
},
"metadata": {},
"execution_count": 44
}
],
"cell_type": "code",
"source": [
"map(r -> r.x1 / r.x2, eachrow(x))"
],
"metadata": {},
"execution_count": 44
},
{
"cell_type": "markdown",
"source": [
"it prints like a data frame, only the caption is different so that you know the type of the object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "10-element Vector{Float64}:\n 0.04977131658967582\n 0.4690673288817153\n 0.9607270015546927\n 0.9543781600816001\n 0.49045845644444097\n 0.22669264058140504\n 0.5205522834249062\n 0.8811686488976916\n 0.6159435670365279\n 0.17159676262539625"
},
"metadata": {},
"execution_count": 45
}
],
"cell_type": "code",
"source": [
"er = eachrow(x)\n",
"er.x1 ## you can access columns of a parent data frame directly"
],
"metadata": {},
"execution_count": 45
},
{
"cell_type": "markdown",
"source": [
"it prints like a data frame, only the caption is different so that you know the type of the object"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m10×10 DataFrameColumns\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m x6 \u001b[0m\u001b[1m x7 \u001b[0m ⋯\n │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float6\u001b[0m ⋯\n─────┼──────────────────────────────────────────────────────────────────────────\n 1 │ 0.0497713 0.174038 0.335658 0.98617 0.818046 0.315802 0.1760 ⋯\n 2 │ 0.469067 0.532094 0.522563 0.698294 0.684567 0.713736 0.2363\n 3 │ 0.960727 0.792708 0.219594 0.991814 0.853047 0.661315 0.2278\n 4 │ 0.954378 0.464174 0.469449 0.207181 0.98027 0.811747 0.9080\n 5 │ 0.490458 0.356026 0.220018 0.672967 0.0462723 0.578973 0.8383 ⋯\n 6 │ 0.226693 0.975931 0.82475 0.132941 0.486135 0.189352 0.3053\n 7 │ 0.520552 0.0662263 0.154305 0.440129 0.508249 0.531972 0.6040\n 8 │ 0.881169 0.782526 0.13947 0.571747 0.240582 0.556918 0.9815\n 9 │ 0.615944 0.740941 0.155328 0.450876 0.731792 0.414014 0.9955 ⋯\n 10 │ 0.171597 0.927998 0.945204 0.761721 0.000796149 0.700339 0.8242\n\u001b[36m 4 columns omitted\u001b[0m",
"text/html": [
"1 | 0.0497713 | 0.174038 | 0.335658 | 0.98617 | 0.818046 | 0.315802 | 0.176087 | 0.88693 | 0.331478 | 0.565418 |
2 | 0.469067 | 0.532094 | 0.522563 | 0.698294 | 0.684567 | 0.713736 | 0.236339 | 0.436262 | 0.438289 | 0.501549 |
3 | 0.960727 | 0.792708 | 0.219594 | 0.991814 | 0.853047 | 0.661315 | 0.227833 | 0.796672 | 0.830572 | 0.0412893 |
4 | 0.954378 | 0.464174 | 0.469449 | 0.207181 | 0.98027 | 0.811747 | 0.908094 | 0.187583 | 0.127347 | 0.11277 |
5 | 0.490458 | 0.356026 | 0.220018 | 0.672967 | 0.0462723 | 0.578973 | 0.838352 | 0.145225 | 0.718935 | 0.254138 |
6 | 0.226693 | 0.975931 | 0.82475 | 0.132941 | 0.486135 | 0.189352 | 0.305374 | 0.478535 | 0.457187 | 0.0475593 |
7 | 0.520552 | 0.0662263 | 0.154305 | 0.440129 | 0.508249 | 0.531972 | 0.604084 | 0.38234 | 0.665801 | 0.563294 |
8 | 0.881169 | 0.782526 | 0.13947 | 0.571747 | 0.240582 | 0.556918 | 0.981525 | 0.813337 | 0.07942 | 0.616259 |
9 | 0.615944 | 0.740941 | 0.155328 | 0.450876 | 0.731792 | 0.414014 | 0.995569 | 0.825418 | 0.808066 | 0.795116 |
10 | 0.171597 | 0.927998 | 0.945204 | 0.761721 | 0.000796149 | 0.700339 | 0.824226 | 0.786892 | 0.33651 | 0.839618 |
"
]
},
"metadata": {},
"execution_count": 46
}
],
"cell_type": "code",
"source": [
"ec = eachcol(x)"
],
"metadata": {},
"execution_count": 46
},
{
"cell_type": "markdown",
"source": [
"you can access columns of a parent data frame directly"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "10-element Vector{Float64}:\n 0.04977131658967582\n 0.4690673288817153\n 0.9607270015546927\n 0.9543781600816001\n 0.49045845644444097\n 0.22669264058140504\n 0.5205522834249062\n 0.8811686488976916\n 0.6159435670365279\n 0.17159676262539625"
},
"metadata": {},
"execution_count": 47
}
],
"cell_type": "code",
"source": [
"ec.x1"
],
"metadata": {},
"execution_count": 47
},
{
"cell_type": "markdown",
"source": [
"## Transposing\n",
"you can transpose a data frame using `permutedims`:"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×4 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────────────────\n 1 │ 1 4 7 10\n 2 │ 2 5 8 11\n 3 │ 3 6 9 12",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 48
}
],
"cell_type": "code",
"source": [
"df = DataFrame(reshape(1:12, 3, 4), :auto)"
],
"metadata": {},
"execution_count": 48
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "3-element Vector{String}:\n \"a\"\n \"b\"\n \"c\""
},
"metadata": {},
"execution_count": 49
}
],
"cell_type": "code",
"source": [
"df.names = [\"a\", \"b\", \"c\"]"
],
"metadata": {},
"execution_count": 49
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m4×4 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m names \u001b[0m\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\n │\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────────────\n 1 │ x1 1 2 3\n 2 │ x2 4 5 6\n 3 │ x3 7 8 9\n 4 │ x4 10 11 12",
"text/html": [
"1 | x1 | 1 | 2 | 3 |
2 | x2 | 4 | 5 | 6 |
3 | x3 | 7 | 8 | 9 |
4 | x4 | 10 | 11 | 12 |
"
]
},
"metadata": {},
"execution_count": 50
}
],
"cell_type": "code",
"source": [
"permutedims(df, :names)"
],
"metadata": {},
"execution_count": 50
},
{
"cell_type": "markdown",
"source": [
"---\n",
"\n",
"*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
],
"metadata": {}
}
],
"nbformat_minor": 3,
"metadata": {
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.10.5"
},
"kernelspec": {
"name": "julia-1.10",
"display_name": "Julia 1.10.5",
"language": "julia"
}
},
"nbformat": 4
}