{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Joining DataFrames"
],
"metadata": {}
},
{
"outputs": [],
"cell_type": "code",
"source": [
"using DataFrames"
],
"metadata": {},
"execution_count": 1
},
{
"cell_type": "markdown",
"source": [
"## Preparing DataFrames for a join"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m5×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m age \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────\n 1 │ 1 21\n 2 │ 2 22\n 3 │ 5 23\n 4 │ 6 24\n 5 │\u001b[90m missing \u001b[0m 99",
"text/html": [
"
1 | 1 | 21 |
2 | 2 | 22 |
3 | 5 | 23 |
4 | 6 | 24 |
5 | missing | 99 |
"
]
},
"metadata": {},
"execution_count": 2
}
],
"cell_type": "code",
"source": [
"x = DataFrame(ID=[1, 2, 3, 4, missing], name=[\"Alice\", \"Bob\", \"Conor\", \"Dave\", \"Zed\"])\n",
"y = DataFrame(id=[1, 2, 5, 6, missing], age=[21, 22, 23, 24, 99])"
],
"metadata": {},
"execution_count": 2
},
{
"cell_type": "markdown",
"source": [
"Rules for the `on` keyword argument:\n",
"\n",
"+ a single `Symbol` or string if joining on one column with the same name, e.g. `on=:id`\n",
"+ a `Pair` of `Symbol`s or string if joining on one column with different names, e.g. `on=:id=>:id2`\n",
"+ a vector of `Symbol`s or strings if joining on multiple columns with the same name, e.g. `on=[:id1, :id2]`\n",
"+ a vector of `Pair`s of `Symbol`s or strings if joining on multiple columns with different names, e.g. `on=[:a1=>:a2, :b1=>:b2]`\n",
"+ a vector containing a combination of `Symbol`s or strings or `Pair` of `Symbol`s or strings, e.g. `on=[:a1=>:a2, :b1]`"
],
"metadata": {}
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ArgumentError(\"Missing values in key columns are not allowed when matchmissing == :error. `missing` found in column :ID in left data frame.\")"
]
}
],
"cell_type": "code",
"source": [
"try\n",
" innerjoin(x, y, on=:ID => :id) ## missing is not allowed to join-on by default\n",
"catch e\n",
" show(e)\n",
"end"
],
"metadata": {},
"execution_count": 3
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m ID \u001b[0m\u001b[1m name \u001b[0m\u001b[1m age \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────────────\n 1 │ 1 Alice 21\n 2 │ 2 Bob 22\n 3 │\u001b[90m missing \u001b[0m Zed 99",
"text/html": [
"1 | 1 | Alice | 21 |
2 | 2 | Bob | 22 |
3 | missing | Zed | 99 |
"
]
},
"metadata": {},
"execution_count": 4
}
],
"cell_type": "code",
"source": [
"innerjoin(x, y, on=:ID => :id, matchmissing=:equal)"
],
"metadata": {},
"execution_count": 4
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m5×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m ID \u001b[0m\u001b[1m name \u001b[0m\u001b[1m age \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64? \u001b[0m\n─────┼──────────────────────────\n 1 │ 1 Alice 21\n 2 │ 2 Bob 22\n 3 │\u001b[90m missing \u001b[0m Zed 99\n 4 │ 3 Conor \u001b[90m missing \u001b[0m\n 5 │ 4 Dave \u001b[90m missing \u001b[0m",
"text/html": [
"1 | 1 | Alice | 21 |
2 | 2 | Bob | 22 |
3 | missing | Zed | 99 |
4 | 3 | Conor | missing |
5 | 4 | Dave | missing |
"
]
},
"metadata": {},
"execution_count": 5
}
],
"cell_type": "code",
"source": [
"leftjoin(x, y, on=\"ID\" => \"id\", matchmissing=:equal)"
],
"metadata": {},
"execution_count": 5
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m5×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m ID \u001b[0m\u001b[1m name \u001b[0m\u001b[1m age \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m String? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 Alice 21\n 2 │ 2 Bob 22\n 3 │\u001b[90m missing \u001b[0m Zed 99\n 4 │ 5 \u001b[90m missing \u001b[0m 23\n 5 │ 6 \u001b[90m missing \u001b[0m 24",
"text/html": [
"1 | 1 | Alice | 21 |
2 | 2 | Bob | 22 |
3 | missing | Zed | 99 |
4 | 5 | missing | 23 |
5 | 6 | missing | 24 |
"
]
},
"metadata": {},
"execution_count": 6
}
],
"cell_type": "code",
"source": [
"rightjoin(x, y, on=:ID => :id, matchmissing=:equal)"
],
"metadata": {},
"execution_count": 6
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m7×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m ID \u001b[0m\u001b[1m name \u001b[0m\u001b[1m age \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m String? \u001b[0m\u001b[90m Int64? \u001b[0m\n─────┼───────────────────────────\n 1 │ 1 Alice 21\n 2 │ 2 Bob 22\n 3 │\u001b[90m missing \u001b[0m Zed 99\n 4 │ 3 Conor \u001b[90m missing \u001b[0m\n 5 │ 4 Dave \u001b[90m missing \u001b[0m\n 6 │ 5 \u001b[90m missing \u001b[0m 23\n 7 │ 6 \u001b[90m missing \u001b[0m 24",
"text/html": [
"1 | 1 | Alice | 21 |
2 | 2 | Bob | 22 |
3 | missing | Zed | 99 |
4 | 3 | Conor | missing |
5 | 4 | Dave | missing |
6 | 5 | missing | 23 |
7 | 6 | missing | 24 |
"
]
},
"metadata": {},
"execution_count": 7
}
],
"cell_type": "code",
"source": [
"outerjoin(x, y, on=:ID => :id, matchmissing=:equal)"
],
"metadata": {},
"execution_count": 7
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m ID \u001b[0m\u001b[1m name \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m String \u001b[0m\n─────┼─────────────────\n 1 │ 1 Alice\n 2 │ 2 Bob\n 3 │\u001b[90m missing \u001b[0m Zed",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 8
}
],
"cell_type": "code",
"source": [
"semijoin(x, y, on=:ID => :id, matchmissing=:equal)"
],
"metadata": {},
"execution_count": 8
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m ID \u001b[0m\u001b[1m name \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m String \u001b[0m\n─────┼────────────────\n 1 │ 3 Conor\n 2 │ 4 Dave",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 9
}
],
"cell_type": "code",
"source": [
"antijoin(x, y, on=:ID => :id, matchmissing=:equal)"
],
"metadata": {},
"execution_count": 9
},
{
"cell_type": "markdown",
"source": [
"## Cross join\n",
"(here no `on` argument)"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m6×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m x \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\n─────┼───────────────\n 1 │ 1 a\n 2 │ 1 b\n 3 │ 1 c\n 4 │ 2 a\n 5 │ 2 b\n 6 │ 2 c",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 10
}
],
"cell_type": "code",
"source": [
"crossjoin(DataFrame(x=[1, 2]), DataFrame(y=[\"a\", \"b\", \"c\"]))"
],
"metadata": {},
"execution_count": 10
},
{
"cell_type": "markdown",
"source": [
"## Complex cases of joins"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m6×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id1 \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m age \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 11 21\n 2 │ 1 1 22\n 3 │ 3 31 23\n 4 │ 3 3 24\n 5 │\u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m 99\n 6 │\u001b[90m missing \u001b[0m 999 100",
"text/html": [
"1 | 1 | 11 | 21 |
2 | 1 | 1 | 22 |
3 | 3 | 31 | 23 |
4 | 3 | 3 | 24 |
5 | missing | missing | 99 |
6 | missing | 999 | 100 |
"
]
},
"metadata": {},
"execution_count": 11
}
],
"cell_type": "code",
"source": [
"x = DataFrame(id1=[1, 1, 2, 2, missing, missing],\n",
" id2=[1, 11, 2, 21, missing, 99],\n",
" name=[\"Alice\", \"Bob\", \"Conor\", \"Dave\", \"Zed\", \"Zoe\"]\n",
")\n",
"y = DataFrame(id1=[1, 1, 3, 3, missing, missing],\n",
" id2=[11, 1, 31, 3, missing, 999],\n",
" age=[21, 22, 23, 24, 99, 100]\n",
")"
],
"metadata": {},
"execution_count": 11
},
{
"cell_type": "markdown",
"source": [
"joining on two columns"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×4 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id1 \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m name \u001b[0m\u001b[1m age \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64? \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼─────────────────────────────────\n 1 │ 1 11 Bob 21\n 2 │ 1 1 Alice 22\n 3 │\u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m Zed 99",
"text/html": [
"1 | 1 | 11 | Bob | 21 |
2 | 1 | 1 | Alice | 22 |
3 | missing | missing | Zed | 99 |
"
]
},
"metadata": {},
"execution_count": 12
}
],
"cell_type": "code",
"source": [
"innerjoin(x, y, on=[:id1, :id2], matchmissing=:equal)"
],
"metadata": {},
"execution_count": 12
},
{
"cell_type": "markdown",
"source": [
"with duplicates all combinations are produced"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m12×6 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id1 \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m name \u001b[0m\u001b[1m id2_1 \u001b[0m\u001b[1m age \u001b[0m\u001b[1m source \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64? \u001b[0m\u001b[90m String? \u001b[0m\u001b[90m Int64? \u001b[0m\u001b[90m Int64? \u001b[0m\u001b[90m String \u001b[0m\n─────┼─────────────────────────────────────────────────────────\n 1 │ 1 1 Alice 11 21 both\n 2 │ 1 11 Bob 11 21 both\n 3 │ 1 1 Alice 1 22 both\n 4 │ 1 11 Bob 1 22 both\n 5 │\u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m Zed \u001b[90m missing \u001b[0m 99 both\n 6 │\u001b[90m missing \u001b[0m 99 Zoe \u001b[90m missing \u001b[0m 99 both\n 7 │\u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m Zed 999 100 both\n 8 │\u001b[90m missing \u001b[0m 99 Zoe 999 100 both\n 9 │ 2 2 Conor \u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m left_only\n 10 │ 2 21 Dave \u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m left_only\n 11 │ 3 \u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m 31 23 right_only\n 12 │ 3 \u001b[90m missing \u001b[0m\u001b[90m missing \u001b[0m 3 24 right_only",
"text/html": [
"1 | 1 | 1 | Alice | 11 | 21 | both |
2 | 1 | 11 | Bob | 11 | 21 | both |
3 | 1 | 1 | Alice | 1 | 22 | both |
4 | 1 | 11 | Bob | 1 | 22 | both |
5 | missing | missing | Zed | missing | 99 | both |
6 | missing | 99 | Zoe | missing | 99 | both |
7 | missing | missing | Zed | 999 | 100 | both |
8 | missing | 99 | Zoe | 999 | 100 | both |
9 | 2 | 2 | Conor | missing | missing | left_only |
10 | 2 | 21 | Dave | missing | missing | left_only |
11 | 3 | missing | missing | 31 | 23 | right_only |
12 | 3 | missing | missing | 3 | 24 | right_only |
"
]
},
"metadata": {},
"execution_count": 13
}
],
"cell_type": "code",
"source": [
"outerjoin(x, y, on=:id1, makeunique=true, indicator=:source, matchmissing=:equal)"
],
"metadata": {},
"execution_count": 13
},
{
"cell_type": "markdown",
"source": [
"you can force validation of uniqueness of key on which you join"
],
"metadata": {}
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ArgumentError(\"Merge key(s) are not unique in both df1 and df2. df1 contains 3 duplicate keys: @NamedTuple{id1::Union{Missing, Int64}}((1,)), ..., @NamedTuple{id1::Union{Missing, Int64}}((missing,)). df2 contains 3 duplicate keys: @NamedTuple{id1::Union{Missing, Int64}}((1,)), ..., @NamedTuple{id1::Union{Missing, Int64}}((missing,)).\")"
]
}
],
"cell_type": "code",
"source": [
"try\n",
" innerjoin(x, y, on=:id1, makeunique=true, validate=(true, true), matchmissing=:equal)\n",
"catch e\n",
" show(e)\n",
"end"
],
"metadata": {},
"execution_count": 14
},
{
"cell_type": "markdown",
"source": [
"mixed `on` argument for joining on multiple columns"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×4 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id1 \u001b[0m\u001b[1m id2 \u001b[0m\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Char \u001b[0m\u001b[90m Char \u001b[0m\n─────┼──────────────────────────\n 1 │ 1 1 a a\n 2 │ 2 2 b b",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 15
}
],
"cell_type": "code",
"source": [
"x = DataFrame(id1=1:6, id2=[1, 2, 1, 2, 1, 2], x1='a':'f')\n",
"y = DataFrame(id1=1:6, ID2=1:6, x2='a':'f')\n",
"innerjoin(x, y, on=[:id1, :id2 => :ID2])"
],
"metadata": {},
"execution_count": 15
},
{
"cell_type": "markdown",
"source": [
"joining more than two data frames"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "5-element Vector{DataFrames.DataFrame}:\n \u001b[1m6×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v1 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 11\n 2 │ 2 12\n 3 │ 3 13\n 4 │ 4 14\n 5 │ 5 15\n 6 │ 6 16\n \u001b[1m6×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v2 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 21\n 2 │ 2 22\n 3 │ 3 23\n 4 │ 4 24\n 5 │ 5 25\n 6 │ 6 26\n \u001b[1m6×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v3 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 31\n 2 │ 2 32\n 3 │ 3 33\n 4 │ 4 34\n 5 │ 5 35\n 6 │ 6 36\n \u001b[1m6×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v4 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 41\n 2 │ 2 42\n 3 │ 3 43\n 4 │ 4 44\n 5 │ 5 45\n 6 │ 6 46\n \u001b[1m6×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v5 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────\n 1 │ 1 51\n 2 │ 2 52\n 3 │ 3 53\n 4 │ 4 54\n 5 │ 5 55\n 6 │ 6 56"
},
"metadata": {},
"execution_count": 16
}
],
"cell_type": "code",
"source": [
"xs = [DataFrame(\"id\" => 1:6, \"v$i\" => ((1:6) .+ 10i)) for i in 1:5]"
],
"metadata": {},
"execution_count": 16
},
{
"cell_type": "markdown",
"source": [
"`innerjoin` as an example, it also works for `outerjoin` and `crossjoin`"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m6×6 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m v1 \u001b[0m\u001b[1m v2 \u001b[0m\u001b[1m v3 \u001b[0m\u001b[1m v4 \u001b[0m\u001b[1m v5 \u001b[0m\n │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────────────────────────────────\n 1 │ 1 11 21 31 41 51\n 2 │ 2 12 22 32 42 52\n 3 │ 3 13 23 33 43 53\n 4 │ 4 14 24 34 44 54\n 5 │ 5 15 25 35 45 55\n 6 │ 6 16 26 36 46 56",
"text/html": [
"1 | 1 | 11 | 21 | 31 | 41 | 51 |
2 | 2 | 12 | 22 | 32 | 42 | 52 |
3 | 3 | 13 | 23 | 33 | 43 | 53 |
4 | 4 | 14 | 24 | 34 | 44 | 54 |
5 | 5 | 15 | 25 | 35 | 45 | 55 |
6 | 6 | 16 | 26 | 36 | 46 | 56 |
"
]
},
"metadata": {},
"execution_count": 17
}
],
"cell_type": "code",
"source": [
"innerjoin(xs..., on=:id)"
],
"metadata": {},
"execution_count": 17
},
{
"cell_type": "markdown",
"source": [
"## matchmissing keyword argument\n",
"\n",
"In general you have three options how `missing` values are handled in joins that are handled by `matchmisssing` kewyowrd argument value as follows:\n",
"\n",
"+ `:error`: throw an error if missings are encountered (this is the default)\n",
"+ `:equal`: assume `misssing` values are equal to themselves\n",
"+ `:notequal`: assume `misssing` values are not equal to themselves (not available for `outerjoin`)\n",
"\n",
"Here are some examples comparing the options:"
],
"metadata": {}
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ArgumentError(\"Missing values in key columns are not allowed when matchmissing == :error. `missing` found in column :id in left data frame.\")"
]
}
],
"cell_type": "code",
"source": [
"df1 = DataFrame(id=[1, 2, missing], x=1:3)\n",
"df2 = DataFrame(id=[1, missing, 3], y=1:3)\n",
"\n",
"try\n",
" innerjoin(df1, df2, on=:id)\n",
"catch e\n",
" show(e)\n",
"end"
],
"metadata": {},
"execution_count": 18
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m2×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼───────────────────────\n 1 │ 1 1 1\n 2 │\u001b[90m missing \u001b[0m 3 2",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 19
}
],
"cell_type": "code",
"source": [
"innerjoin(df1, df2, on=:id, matchmissing=:equal)"
],
"metadata": {},
"execution_count": 19
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m1×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼──────────────────────\n 1 │ 1 1 1",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 20
}
],
"cell_type": "code",
"source": [
"innerjoin(df1, df2, on=:id, matchmissing=:notequal)"
],
"metadata": {},
"execution_count": 20
},
{
"cell_type": "markdown",
"source": [
"Since DataFrames.jl 1.3 you can do an efficient left join of two data frames in-place. This means that the left data frame gets updated with new columns, but the columns that exist in it are not affected. This operation requires that there are no duplicates of keys in the right data frame that match keys in left data frame:"
],
"metadata": {}
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────\n 1 │ 1 1\n 2 │ 2 2\n 3 │\u001b[90m missing \u001b[0m 3",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 21
}
],
"cell_type": "code",
"source": [
"df1"
],
"metadata": {},
"execution_count": 21
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\n─────┼────────────────\n 1 │ 1 1\n 2 │\u001b[90m missing \u001b[0m 2\n 3 │ 3 3",
"text/html": [
""
]
},
"metadata": {},
"execution_count": 22
}
],
"cell_type": "code",
"source": [
"df2"
],
"metadata": {},
"execution_count": 22
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64? \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 1\n 2 │ 2 2 \u001b[90m missing \u001b[0m\n 3 │\u001b[90m missing \u001b[0m 3 \u001b[90m missing \u001b[0m",
"text/html": [
"1 | 1 | 1 | 1 |
2 | 2 | 2 | missing |
3 | missing | 3 | missing |
"
]
},
"metadata": {},
"execution_count": 23
}
],
"cell_type": "code",
"source": [
"leftjoin!(df1, df2, on=:id, matchmissing=:notequal)"
],
"metadata": {},
"execution_count": 23
},
{
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "\u001b[1m3×3 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m id \u001b[0m\u001b[1m x \u001b[0m\u001b[1m y \u001b[0m\n │\u001b[90m Int64? \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64? \u001b[0m\n─────┼─────────────────────────\n 1 │ 1 1 1\n 2 │ 2 2 \u001b[90m missing \u001b[0m\n 3 │\u001b[90m missing \u001b[0m 3 \u001b[90m missing \u001b[0m",
"text/html": [
"1 | 1 | 1 | 1 |
2 | 2 | 2 | missing |
3 | missing | 3 | missing |
"
]
},
"metadata": {},
"execution_count": 24
}
],
"cell_type": "code",
"source": [
"df1"
],
"metadata": {},
"execution_count": 24
},
{
"cell_type": "markdown",
"source": [
"---\n",
"\n",
"*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
],
"metadata": {}
}
],
"nbformat_minor": 3,
"metadata": {
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.10.5"
},
"kernelspec": {
"name": "julia-1.10",
"display_name": "Julia 1.10.5",
"language": "julia"
}
},
"nbformat": 4
}