{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ " # Manipulating columns of a DataFrame\n", " ## Renaming columns\n", " Let's start with a DataFrame of Bools that has default column names." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:28.296000Z", "iopub.status.busy": "2024-06-04T16:21:27.878000Z", "iopub.status.idle": "2024-06-04T16:21:29.158000Z", "shell.execute_reply": "2024-06-04T16:21:29.110000Z" } }, "outputs": [], "source": [ "using DataFrames" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:29.378000Z", "iopub.status.busy": "2024-06-04T16:21:29.159000Z", "iopub.status.idle": "2024-06-04T16:21:31.481000Z", "shell.execute_reply": "2024-06-04T16:21:31.480000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x2x3x4
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = DataFrame(rand(Bool, 3, 4), :auto)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " With rename, we create new DataFrame; here we rename the column :x1 to :A.\n", " (rename also accepts collections of Pairs.)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.510000Z", "iopub.status.busy": "2024-06-04T16:21:31.510000Z", "iopub.status.idle": "2024-06-04T16:21:31.564000Z", "shell.execute_reply": "2024-06-04T16:21:31.564000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
RowAx2x3x4
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& A & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m A \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename(x, :x1 => :A)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " With rename! we do an in place transformation.\n", " This time we've applied a function to every column name (note that the function gets a column names as a string)." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.566000Z", "iopub.status.busy": "2024-06-04T16:21:31.566000Z", "iopub.status.idle": "2024-06-04T16:21:31.609000Z", "shell.execute_reply": "2024-06-04T16:21:31.608000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x1x2x2x3x3x4x4
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1x1 & x2x2 & x3x3 & x4x4\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1x1 \u001b[0m\u001b[1m x2x2 \u001b[0m\u001b[1m x3x3 \u001b[0m\u001b[1m x4x4 \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename!(c -> c^2, x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We can also change the name of a particular column without knowing the\n", " original.\n", " Here we change the name of the third column, creating a new DataFrame." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.612000Z", "iopub.status.busy": "2024-06-04T16:21:31.611000Z", "iopub.status.idle": "2024-06-04T16:21:31.642000Z", "shell.execute_reply": "2024-06-04T16:21:31.642000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x1x2x2thirdx4x4
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1x1 & x2x2 & third & x4x4\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1x1 \u001b[0m\u001b[1m x2x2 \u001b[0m\u001b[1m third \u001b[0m\u001b[1m x4x4 \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename(x, 3 => :third)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " If we pass a vector of names to rename!, we can change the names of all\n", " variables." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.645000Z", "iopub.status.busy": "2024-06-04T16:21:31.645000Z", "iopub.status.idle": "2024-06-04T16:21:31.665000Z", "shell.execute_reply": "2024-06-04T16:21:31.665000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowabcd
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& a & b & c & d\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m d \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename!(x, [:a, :b, :c, :d])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " In all the above examples you could have used strings instead of symbols,\n", " e.g." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.667000Z", "iopub.status.busy": "2024-06-04T16:21:31.667000Z", "iopub.status.idle": "2024-06-04T16:21:31.763000Z", "shell.execute_reply": "2024-06-04T16:21:31.762000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowabcd
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& a & b & c & d\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m d \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename!(x, string.('a':'d'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " rename! allows for circular renaming of columns, e.g.:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.765000Z", "iopub.status.busy": "2024-06-04T16:21:31.765000Z", "iopub.status.idle": "2024-06-04T16:21:31.766000Z", "shell.execute_reply": "2024-06-04T16:21:31.766000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowabcd
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& a & b & c & d\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m d \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.768000Z", "iopub.status.busy": "2024-06-04T16:21:31.768000Z", "iopub.status.idle": "2024-06-04T16:21:31.799000Z", "shell.execute_reply": "2024-06-04T16:21:31.799000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowdbca
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& d & b & c & a\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m d \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m a \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename!(x, \"a\"=>\"d\", \"d\"=>\"a\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We get an error when we try to provide duplicate names" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:31.802000Z", "iopub.status.busy": "2024-06-04T16:21:31.802000Z", "iopub.status.idle": "2024-06-04T16:21:32.552000Z", "shell.execute_reply": "2024-06-04T16:21:32.552000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ArgumentError(\"Duplicate variable names: :a. Pass makeunique=true to make them unique using a suffix automatically.\")" ] } ], "source": [ "try\n", " rename(x, fill(:a, 4))\n", "catch e\n", " show(e)\n", "end" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " unless we pass makeunique=true, which allows us to handle duplicates in passed names." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:32.555000Z", "iopub.status.busy": "2024-06-04T16:21:32.555000Z", "iopub.status.idle": "2024-06-04T16:21:32.561000Z", "shell.execute_reply": "2024-06-04T16:21:32.560000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowaa_1a_2a_3
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& a & a\\_1 & a\\_2 & a\\_3\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m a_1 \u001b[0m\u001b[1m a_2 \u001b[0m\u001b[1m a_3 \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rename(x, fill(:a, 4), makeunique=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Reordering columns\n", " We can reorder the names(x) vector as needed, creating a new DataFrame." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:32.563000Z", "iopub.status.busy": "2024-06-04T16:21:32.563000Z", "iopub.status.idle": "2024-06-04T16:21:32.664000Z", "shell.execute_reply": "2024-06-04T16:21:32.664000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowdbca
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& d & b & c & a\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m d \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m a \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using Random\n", "Random.seed!(1234)\n", "x[:, shuffle(names(x))]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Also select! can be used to achieve this in place (or select to perform a copy):" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:32.667000Z", "iopub.status.busy": "2024-06-04T16:21:32.667000Z", "iopub.status.idle": "2024-06-04T16:21:32.668000Z", "shell.execute_reply": "2024-06-04T16:21:32.668000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowdbca
BoolBoolBoolBool
1falsetruetruetrue
2truefalsetruetrue
3falsetruefalsefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& d & b & c & a\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 0 & 1 & 1 & 1 \\\\\n", "\t2 & 1 & 0 & 1 & 1 \\\\\n", "\t3 & 0 & 1 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m d \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m a \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ false true true true\n", " 2 │ true false true true\n", " 3 │ false true false false" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:32.670000Z", "iopub.status.busy": "2024-06-04T16:21:32.670000Z", "iopub.status.idle": "2024-06-04T16:21:32.701000Z", "shell.execute_reply": "2024-06-04T16:21:32.701000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowacbd
BoolBoolBoolBool
1truetruetruefalse
2truetruefalsetrue
3falsefalsetruefalse
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& a & c & b & d\\\\\n", "\t\\hline\n", "\t& Bool & Bool & Bool & Bool\\\\\n", "\t\\hline\n", "\t1 & 1 & 1 & 1 & 0 \\\\\n", "\t2 & 1 & 1 & 0 & 1 \\\\\n", "\t3 & 0 & 0 & 1 & 0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m c \u001b[0m\u001b[1m b \u001b[0m\u001b[1m d \u001b[0m\n", " │\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\u001b[90m Bool \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ true true true false\n", " 2 │ true true false true\n", " 3 │ false false true false" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select!(x, 4:-1:1);\n", "x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Merging/adding columns" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:32.703000Z", "iopub.status.busy": "2024-06-04T16:21:32.703000Z", "iopub.status.idle": "2024-06-04T16:21:32.918000Z", "shell.execute_reply": "2024-06-04T16:21:32.918000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x2x3x4
Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 3)(1, 4)
2(2, 1)(2, 2)(2, 3)(2, 4)
3(3, 1)(3, 2)(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4)\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4)\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = DataFrame([(i,j) for i in 1:3, j in 1:4], :auto)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " With `hcat` we can merge two `DataFrames`. Also `[x y]` syntax is supported but only when DataFrames have unique column names." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:32.921000Z", "iopub.status.busy": "2024-06-04T16:21:32.920000Z", "iopub.status.idle": "2024-06-04T16:21:32.943000Z", "shell.execute_reply": "2024-06-04T16:21:32.943000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×8 DataFrame
Rowx1x2x3x4x1_1x2_1x3_1x4_1
Tuple…Tuple…Tuple…Tuple…Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 3)(1, 4)(1, 1)(1, 2)(1, 3)(1, 4)
2(2, 1)(2, 2)(2, 3)(2, 4)(2, 1)(2, 2)(2, 3)(2, 4)
3(3, 1)(3, 2)(3, 3)(3, 4)(3, 1)(3, 2)(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|cccccccc}\n", "\t& x1 & x2 & x3 & x4 & x1\\_1 & x2\\_1 & x3\\_1 & x4\\_1\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & (1, 1) & (1, 2) & (1, 3) & (1, 4) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & (2, 1) & (2, 2) & (2, 3) & (2, 4) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & (3, 1) & (3, 2) & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×8 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x1_1 \u001b[0m\u001b[1m x2_1 \u001b[0m\u001b[1m x3_1 \u001b[0m\u001b[1m x4_1 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) (1, 1) (1, 2) (1, 3) (1, 4)\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) (2, 1) (2, 2) (2, 3) (2, 4)\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) (3, 1) (3, 2) (3, 3) (3, 4)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hcat(x, x, makeunique=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " You can append a vector to a data frame with the following syntax:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:32.945000Z", "iopub.status.busy": "2024-06-04T16:21:32.945000Z", "iopub.status.idle": "2024-06-04T16:21:33.030000Z", "shell.execute_reply": "2024-06-04T16:21:33.030000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4A
Tuple…Tuple…Tuple…Tuple…Int64
1(1, 1)(1, 2)(1, 3)(1, 4)1
2(2, 1)(2, 2)(2, 3)(2, 4)2
3(3, 1)(3, 2)(3, 3)(3, 4)3
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & A\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Int64\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & 1 \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & 2 \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & 3 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m A \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼───────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) 1\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) 2\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) 3" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y = [x DataFrame(A=[1,2,3])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Here we do the same but add column `:A` to the front." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:33.032000Z", "iopub.status.busy": "2024-06-04T16:21:33.032000Z", "iopub.status.idle": "2024-06-04T16:21:33.034000Z", "shell.execute_reply": "2024-06-04T16:21:33.034000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
RowAx1x2x3x4
Int64Tuple…Tuple…Tuple…Tuple…
11(1, 1)(1, 2)(1, 3)(1, 4)
22(2, 1)(2, 2)(2, 3)(2, 4)
33(3, 1)(3, 2)(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& A & x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Int64 & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & 1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) \\\\\n", "\t2 & 2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) \\\\\n", "\t3 & 3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m A \u001b[0m\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼───────────────────────────────────────\n", " 1 │ 1 (1, 1) (1, 2) (1, 3) (1, 4)\n", " 2 │ 2 (2, 1) (2, 2) (2, 3) (2, 4)\n", " 3 │ 3 (3, 1) (3, 2) (3, 3) (3, 4)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y = [DataFrame(A=[1,2,3]) x]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " A column can also be added in the middle. Here a brute-force method is used and a new `DataFrame` is created." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:33.036000Z", "iopub.status.busy": "2024-06-04T16:21:33.036000Z", "iopub.status.idle": "2024-06-04T16:21:36.515000Z", "shell.execute_reply": "2024-06-04T16:21:36.515000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 3.693 μs (74 allocations: 6.39 KiB)\n" ] }, { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2Ax3x4
Tuple…Tuple…Int64Tuple…Tuple…
1(1, 1)(1, 2)1(1, 3)(1, 4)
2(2, 1)(2, 2)2(2, 3)(2, 4)
3(3, 1)(3, 2)3(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & A & x3 & x4\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Int64 & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & 1 & (1, 3) & (1, 4) \\\\\n", "\t2 & (2, 1) & (2, 2) & 2 & (2, 3) & (2, 4) \\\\\n", "\t3 & (3, 1) & (3, 2) & 3 & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m A \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼───────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) 1 (1, 3) (1, 4)\n", " 2 │ (2, 1) (2, 2) 2 (2, 3) (2, 4)\n", " 3 │ (3, 1) (3, 2) 3 (3, 3) (3, 4)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using BenchmarkTools\n", "@btime [$x[!, 1:2] DataFrame(A=[1,2,3]) $x[!, 3:4]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We could also do this with a specialized in place method `insertcols!`. Let's add `:newcol` to the `DataFrame` y." ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:36.518000Z", "iopub.status.busy": "2024-06-04T16:21:36.518000Z", "iopub.status.idle": "2024-06-04T16:21:36.533000Z", "shell.execute_reply": "2024-06-04T16:21:36.533000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×6 DataFrame
RowAnewcolx1x2x3x4
Int64Int64Tuple…Tuple…Tuple…Tuple…
111(1, 1)(1, 2)(1, 3)(1, 4)
222(2, 1)(2, 2)(2, 3)(2, 4)
333(3, 1)(3, 2)(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|cccccc}\n", "\t& A & newcol & x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & 1 & 1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) \\\\\n", "\t2 & 2 & 2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) \\\\\n", "\t3 & 3 & 3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×6 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m A \u001b[0m\u001b[1m newcol \u001b[0m\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼───────────────────────────────────────────────\n", " 1 │ 1 1 (1, 1) (1, 2) (1, 3) (1, 4)\n", " 2 │ 2 2 (2, 1) (2, 2) (2, 3) (2, 4)\n", " 3 │ 3 3 (3, 1) (3, 2) (3, 3) (3, 4)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "insertcols!(y, 2, \"newcol\" => [1,2,3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " If you want to insert the same column name several times `makeunique=true` is needed as usual." ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:36.535000Z", "iopub.status.busy": "2024-06-04T16:21:36.535000Z", "iopub.status.idle": "2024-06-04T16:21:36.542000Z", "shell.execute_reply": "2024-06-04T16:21:36.542000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×7 DataFrame
RowAnewcol_1newcolx1x2x3x4
Int64Int64Int64Tuple…Tuple…Tuple…Tuple…
1111(1, 1)(1, 2)(1, 3)(1, 4)
2222(2, 1)(2, 2)(2, 3)(2, 4)
3333(3, 1)(3, 2)(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|ccccccc}\n", "\t& A & newcol\\_1 & newcol & x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64 & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & 1 & 1 & 1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) \\\\\n", "\t2 & 2 & 2 & 2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) \\\\\n", "\t3 & 3 & 3 & 3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×7 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m A \u001b[0m\u001b[1m newcol_1 \u001b[0m\u001b[1m newcol \u001b[0m\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼─────────────────────────────────────────────────────────\n", " 1 │ 1 1 1 (1, 1) (1, 2) (1, 3) (1, 4)\n", " 2 │ 2 2 2 (2, 1) (2, 2) (2, 3) (2, 4)\n", " 3 │ 3 3 3 (3, 1) (3, 2) (3, 3) (3, 4)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "insertcols!(y, 2, :newcol => [1,2,3], makeunique=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We can see how much faster it is to insert a column with `insertcols!` than with `hcat` using `@btime` (note that we use here a `Pair` notation as an example)." ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:36.544000Z", "iopub.status.busy": "2024-06-04T16:21:36.544000Z", "iopub.status.idle": "2024-06-04T16:21:37.983000Z", "shell.execute_reply": "2024-06-04T16:21:37.983000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 856.750 ns (16 allocations: 1.50 KiB)\n" ] }, { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2Ax3x4
Tuple…Tuple…Int64Tuple…Tuple…
1(1, 1)(1, 2)1(1, 3)(1, 4)
2(2, 1)(2, 2)2(2, 3)(2, 4)
3(3, 1)(3, 2)3(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & A & x3 & x4\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Int64 & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & 1 & (1, 3) & (1, 4) \\\\\n", "\t2 & (2, 1) & (2, 2) & 2 & (2, 3) & (2, 4) \\\\\n", "\t3 & (3, 1) & (3, 2) & 3 & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m A \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼───────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) 1 (1, 3) (1, 4)\n", " 2 │ (2, 1) (2, 2) 2 (2, 3) (2, 4)\n", " 3 │ (3, 1) (3, 2) 3 (3, 3) (3, 4)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@btime insertcols!(copy($x), 3, :A => [1,2,3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Let's use `insertcols!` to append a column in place (note that we dropped the index at which we insert the column)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:37.985000Z", "iopub.status.busy": "2024-06-04T16:21:37.985000Z", "iopub.status.idle": "2024-06-04T16:21:37.992000Z", "shell.execute_reply": "2024-06-04T16:21:37.992000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4A
Tuple…Tuple…Tuple…Tuple…Int64
1(1, 1)(1, 2)(1, 3)(1, 4)1
2(2, 1)(2, 2)(2, 3)(2, 4)2
3(3, 1)(3, 2)(3, 3)(3, 4)3
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & A\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Int64\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & 1 \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & 2 \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & 3 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m A \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼───────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) 1\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) 2\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) 3" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "insertcols!(x, :A => [1,2,3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " and to in place prepend a column." ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:37.995000Z", "iopub.status.busy": "2024-06-04T16:21:37.995000Z", "iopub.status.idle": "2024-06-04T16:21:37.996000Z", "shell.execute_reply": "2024-06-04T16:21:37.996000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×6 DataFrame
RowBx1x2x3x4A
Int64Tuple…Tuple…Tuple…Tuple…Int64
11(1, 1)(1, 2)(1, 3)(1, 4)1
22(2, 1)(2, 2)(2, 3)(2, 4)2
33(3, 1)(3, 2)(3, 3)(3, 4)3
" ], "text/latex": [ "\\begin{tabular}{r|cccccc}\n", "\t& B & x1 & x2 & x3 & x4 & A\\\\\n", "\t\\hline\n", "\t& Int64 & Tuple… & Tuple… & Tuple… & Tuple… & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & 1 \\\\\n", "\t2 & 2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & 2 \\\\\n", "\t3 & 3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & 3 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×6 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m B \u001b[0m\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m A \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────────────────────────────────────\n", " 1 │ 1 (1, 1) (1, 2) (1, 3) (1, 4) 1\n", " 2 │ 2 (2, 1) (2, 2) (2, 3) (2, 4) 2\n", " 3 │ 3 (3, 1) (3, 2) (3, 3) (3, 4) 3" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "insertcols!(x, 1, :B => [1,2,3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Note that `insertcols!` can be used to insert several columns to a data frame at once and that it performs broadcasting if needed:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:37.998000Z", "iopub.status.busy": "2024-06-04T16:21:37.998000Z", "iopub.status.idle": "2024-06-04T16:21:38.006000Z", "shell.execute_reply": "2024-06-04T16:21:38.006000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×1 DataFrame
Rowa
Int64
11
22
33
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& a\\\\\n", "\t\\hline\n", "\t& Int64\\\\\n", "\t\\hline\n", "\t1 & 1 \\\\\n", "\t2 & 2 \\\\\n", "\t3 & 3 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\n", "─────┼───────\n", " 1 │ 1\n", " 2 │ 2\n", " 3 │ 3" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame(a = [1, 2, 3])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.008000Z", "iopub.status.busy": "2024-06-04T16:21:38.007000Z", "iopub.status.idle": "2024-06-04T16:21:38.556000Z", "shell.execute_reply": "2024-06-04T16:21:38.556000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowabcd
Int64StringCharArray…
11xa[1, 2, 3]
22xb[1, 2, 3]
33xc[1, 2, 3]
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& a & b & c & d\\\\\n", "\t\\hline\n", "\t& Int64 & String & Char & Array…\\\\\n", "\t\\hline\n", "\t1 & 1 & x & a & [1, 2, 3] \\\\\n", "\t2 & 2 & x & b & [1, 2, 3] \\\\\n", "\t3 & 3 & x & c & [1, 2, 3] \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\u001b[1m d \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Char \u001b[0m\u001b[90m Array… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ 1 x a [1, 2, 3]\n", " 2 │ 2 x b [1, 2, 3]\n", " 3 │ 3 x c [1, 2, 3]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "insertcols!(df, :b => \"x\", :c => 'a':'c', :d => Ref([1,2,3]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Interestingly we can emulate `hcat` mutating the data frame in-place using `insertcols!`:" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.558000Z", "iopub.status.busy": "2024-06-04T16:21:38.558000Z", "iopub.status.idle": "2024-06-04T16:21:38.560000Z", "shell.execute_reply": "2024-06-04T16:21:38.560000Z" } }, "outputs": [ { "data": { "text/html": [ "
2×1 DataFrame
Rowa
Int64
11
22
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& a\\\\\n", "\t\\hline\n", "\t& Int64\\\\\n", "\t\\hline\n", "\t1 & 1 \\\\\n", "\t2 & 2 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\n", "─────┼───────\n", " 1 │ 1\n", " 2 │ 2" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = DataFrame(a=[1,2])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.562000Z", "iopub.status.busy": "2024-06-04T16:21:38.562000Z", "iopub.status.idle": "2024-06-04T16:21:38.586000Z", "shell.execute_reply": "2024-06-04T16:21:38.586000Z" } }, "outputs": [ { "data": { "text/html": [ "
2×2 DataFrame
Rowbc
Int64Int64
123
234
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& b & c\\\\\n", "\t\\hline\n", "\t& Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 2 & 3 \\\\\n", "\t2 & 3 & 4 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────\n", " 1 │ 2 3\n", " 2 │ 3 4" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = DataFrame(b=[2,3], c=[3,4])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.588000Z", "iopub.status.busy": "2024-06-04T16:21:38.588000Z", "iopub.status.idle": "2024-06-04T16:21:38.589000Z", "shell.execute_reply": "2024-06-04T16:21:38.589000Z" } }, "outputs": [ { "data": { "text/html": [ "
2×3 DataFrame
Rowabc
Int64Int64Int64
1123
2234
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& a & b & c\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 2 & 3 \\\\\n", "\t2 & 2 & 3 & 4 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼─────────────────────\n", " 1 │ 1 2 3\n", " 2 │ 2 3 4" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hcat(df1, df2)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.591000Z", "iopub.status.busy": "2024-06-04T16:21:38.591000Z", "iopub.status.idle": "2024-06-04T16:21:38.592000Z", "shell.execute_reply": "2024-06-04T16:21:38.591000Z" } }, "outputs": [ { "data": { "text/html": [ "
2×1 DataFrame
Rowa
Int64
11
22
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& a\\\\\n", "\t\\hline\n", "\t& Int64\\\\\n", "\t\\hline\n", "\t1 & 1 \\\\\n", "\t2 & 2 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\n", "─────┼───────\n", " 1 │ 1\n", " 2 │ 2" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 ## df1 is not touched" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.594000Z", "iopub.status.busy": "2024-06-04T16:21:38.594000Z", "iopub.status.idle": "2024-06-04T16:21:38.683000Z", "shell.execute_reply": "2024-06-04T16:21:38.683000Z" } }, "outputs": [ { "data": { "text/html": [ "
2×3 DataFrame
Rowabc
Int64Int64Int64
1123
2234
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& a & b & c\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 2 & 3 \\\\\n", "\t2 & 2 & 3 & 4 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼─────────────────────\n", " 1 │ 1 2 3\n", " 2 │ 2 3 4" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "insertcols!(df1, pairs(eachcol(df2))...)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.686000Z", "iopub.status.busy": "2024-06-04T16:21:38.686000Z", "iopub.status.idle": "2024-06-04T16:21:38.688000Z", "shell.execute_reply": "2024-06-04T16:21:38.687000Z" } }, "outputs": [ { "data": { "text/html": [ "
2×3 DataFrame
Rowabc
Int64Int64Int64
1123
2234
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& a & b & c\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 2 & 3 \\\\\n", "\t2 & 2 & 3 & 4 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m2×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m a \u001b[0m\u001b[1m b \u001b[0m\u001b[1m c \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼─────────────────────\n", " 1 │ 1 2 3\n", " 2 │ 2 3 4" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 ## now we have changed df1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Subsetting/removing columns\n", " Let's create a new `DataFrame` `x` and show a few ways to create DataFrames with a subset of `x`'s columns." ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.690000Z", "iopub.status.busy": "2024-06-04T16:21:38.690000Z", "iopub.status.idle": "2024-06-04T16:21:38.710000Z", "shell.execute_reply": "2024-06-04T16:21:38.710000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4x5
Tuple…Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 3)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 3)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = DataFrame([(i,j) for i in 1:3, j in 1:5], :auto)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " First we could do this by index:" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.712000Z", "iopub.status.busy": "2024-06-04T16:21:38.712000Z", "iopub.status.idle": "2024-06-04T16:21:38.729000Z", "shell.execute_reply": "2024-06-04T16:21:38.729000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x2x4x5
Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 4) (3, 5)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[:, [1,2,4,5]] ## use ! instead of : for non-copying operation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " or by column name:" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.731000Z", "iopub.status.busy": "2024-06-04T16:21:38.731000Z", "iopub.status.idle": "2024-06-04T16:21:38.739000Z", "shell.execute_reply": "2024-06-04T16:21:38.739000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×2 DataFrame
Rowx1x4
Tuple…Tuple…
1(1, 1)(1, 4)
2(2, 1)(2, 4)
3(3, 1)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& x1 & x4\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 4) \\\\\n", "\t2 & (2, 1) & (2, 4) \\\\\n", "\t3 & (3, 1) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────\n", " 1 │ (1, 1) (1, 4)\n", " 2 │ (2, 1) (2, 4)\n", " 3 │ (3, 1) (3, 4)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[:, [:x1, :x4]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We can also choose to keep or exclude columns by `Bool` (we need a vector whose length is the number of columns in the original `DataFrame`)." ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.741000Z", "iopub.status.busy": "2024-06-04T16:21:38.741000Z", "iopub.status.idle": "2024-06-04T16:21:38.769000Z", "shell.execute_reply": "2024-06-04T16:21:38.769000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×3 DataFrame
Rowx1x3x5
Tuple…Tuple…Tuple…
1(1, 1)(1, 3)(1, 5)
2(2, 1)(2, 3)(2, 5)
3(3, 1)(3, 3)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x1 & x3 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 3) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 3) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 3) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────\n", " 1 │ (1, 1) (1, 3) (1, 5)\n", " 2 │ (2, 1) (2, 3) (2, 5)\n", " 3 │ (3, 1) (3, 3) (3, 5)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[:, [true, false, true, false, true]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Here we create a single column `DataFrame`," ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.771000Z", "iopub.status.busy": "2024-06-04T16:21:38.771000Z", "iopub.status.idle": "2024-06-04T16:21:38.772000Z", "shell.execute_reply": "2024-06-04T16:21:38.772000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×1 DataFrame
Rowx1
Tuple…
1(1, 1)
2(2, 1)
3(3, 1)
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& x1\\\\\n", "\t\\hline\n", "\t& Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) \\\\\n", "\t2 & (2, 1) \\\\\n", "\t3 & (3, 1) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\n", "─────┼────────\n", " 1 │ (1, 1)\n", " 2 │ (2, 1)\n", " 3 │ (3, 1)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[:, [:x1]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " and here we access the vector contained in column `:x1`." ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:38.774000Z", "iopub.status.busy": "2024-06-04T16:21:38.774000Z", "iopub.status.idle": "2024-06-04T16:21:40.079000Z", "shell.execute_reply": "2024-06-04T16:21:40.079000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Tuple{Int64, Int64}}:\n", " (1, 1)\n", " (2, 1)\n", " (3, 1)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, :x1] ## use : instead of ! to copy" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.081000Z", "iopub.status.busy": "2024-06-04T16:21:40.081000Z", "iopub.status.idle": "2024-06-04T16:21:40.086000Z", "shell.execute_reply": "2024-06-04T16:21:40.085000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Tuple{Int64, Int64}}:\n", " (1, 1)\n", " (2, 1)\n", " (3, 1)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.x1 ## the same" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We could grab the same vector by column number" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.088000Z", "iopub.status.busy": "2024-06-04T16:21:40.088000Z", "iopub.status.idle": "2024-06-04T16:21:40.089000Z", "shell.execute_reply": "2024-06-04T16:21:40.089000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Tuple{Int64, Int64}}:\n", " (1, 1)\n", " (2, 1)\n", " (3, 1)" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, 1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Note that getting a single column returns it without copying while creating a new `DataFrame` performs a copy of the column" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.091000Z", "iopub.status.busy": "2024-06-04T16:21:40.091000Z", "iopub.status.idle": "2024-06-04T16:21:40.234000Z", "shell.execute_reply": "2024-06-04T16:21:40.234000Z" } }, "outputs": [ { "data": { "text/plain": [ "false" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, 1] === x[!, [1]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " you can also use `Regex`, `All`, `Between` and `Not` from InvertedIndies.jl for column selection:" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.237000Z", "iopub.status.busy": "2024-06-04T16:21:40.237000Z", "iopub.status.idle": "2024-06-04T16:21:40.241000Z", "shell.execute_reply": "2024-06-04T16:21:40.241000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×2 DataFrame
Rowx1x2
Tuple…Tuple…
1(1, 1)(1, 2)
2(2, 1)(2, 2)
3(3, 1)(3, 2)
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& x1 & x2\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) \\\\\n", "\t2 & (2, 1) & (2, 2) \\\\\n", "\t3 & (3, 1) & (3, 2) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────\n", " 1 │ (1, 1) (1, 2)\n", " 2 │ (2, 1) (2, 2)\n", " 3 │ (3, 1) (3, 2)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, r\"[12]\"]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.244000Z", "iopub.status.busy": "2024-06-04T16:21:40.244000Z", "iopub.status.idle": "2024-06-04T16:21:40.255000Z", "shell.execute_reply": "2024-06-04T16:21:40.255000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx2x3x4x5
Tuple…Tuple…Tuple…Tuple…
1(1, 2)(1, 3)(1, 4)(1, 5)
2(2, 2)(2, 3)(2, 4)(2, 5)
3(3, 2)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x2 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 2) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 2) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 2) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ (1, 2) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 2) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 2) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, Not(1)]" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.257000Z", "iopub.status.busy": "2024-06-04T16:21:40.257000Z", "iopub.status.idle": "2024-06-04T16:21:40.275000Z", "shell.execute_reply": "2024-06-04T16:21:40.275000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×3 DataFrame
Rowx2x3x4
Tuple…Tuple…Tuple…
1(1, 2)(1, 3)(1, 4)
2(2, 2)(2, 3)(2, 4)
3(3, 2)(3, 3)(3, 4)
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 2) & (1, 3) & (1, 4) \\\\\n", "\t2 & (2, 2) & (2, 3) & (2, 4) \\\\\n", "\t3 & (3, 2) & (3, 3) & (3, 4) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────\n", " 1 │ (1, 2) (1, 3) (1, 4)\n", " 2 │ (2, 2) (2, 3) (2, 4)\n", " 3 │ (3, 2) (3, 3) (3, 4)" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, Between(:x2, :x4)]" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.278000Z", "iopub.status.busy": "2024-06-04T16:21:40.277000Z", "iopub.status.idle": "2024-06-04T16:21:40.363000Z", "shell.execute_reply": "2024-06-04T16:21:40.362000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x3x4x5
Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 3)(1, 4)(1, 5)
2(2, 1)(2, 3)(2, 4)(2, 5)
3(3, 1)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ (1, 1) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, Cols(:x1, Between(:x3, :x5))]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.365000Z", "iopub.status.busy": "2024-06-04T16:21:40.365000Z", "iopub.status.idle": "2024-06-04T16:21:40.412000Z", "shell.execute_reply": "2024-06-04T16:21:40.411000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x3x4x5
Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 3)(1, 4)(1, 5)
2(2, 1)(2, 3)(2, 4)(2, 5)
3(3, 1)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ (1, 1) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(x, :x1, Between(:x3, :x5), copycols=false) # the same as above" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " you can use `select` and `select!` functions to select a subset of columns from a data frame. `select` creates a new data frame and `select!` operates in place" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.414000Z", "iopub.status.busy": "2024-06-04T16:21:40.414000Z", "iopub.status.idle": "2024-06-04T16:21:40.416000Z", "shell.execute_reply": "2024-06-04T16:21:40.416000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4x5
Tuple…Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 3)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 3)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = copy(x)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.418000Z", "iopub.status.busy": "2024-06-04T16:21:40.418000Z", "iopub.status.idle": "2024-06-04T16:21:40.423000Z", "shell.execute_reply": "2024-06-04T16:21:40.423000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×2 DataFrame
Rowx1x2
Tuple…Tuple…
1(1, 1)(1, 2)
2(2, 1)(2, 2)
3(3, 1)(3, 2)
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& x1 & x2\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) \\\\\n", "\t2 & (2, 1) & (2, 2) \\\\\n", "\t3 & (3, 1) & (3, 2) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────\n", " 1 │ (1, 1) (1, 2)\n", " 2 │ (2, 1) (2, 2)\n", " 3 │ (3, 1) (3, 2)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = select(df, [1, 2])" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.425000Z", "iopub.status.busy": "2024-06-04T16:21:40.425000Z", "iopub.status.idle": "2024-06-04T16:21:40.452000Z", "shell.execute_reply": "2024-06-04T16:21:40.452000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×3 DataFrame
Rowx3x4x5
Tuple…Tuple…Tuple…
1(1, 3)(1, 4)(1, 5)
2(2, 3)(2, 4)(2, 5)
3(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────\n", " 1 │ (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, Not([1, 2]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " by default `select` copies columns" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.455000Z", "iopub.status.busy": "2024-06-04T16:21:40.455000Z", "iopub.status.idle": "2024-06-04T16:21:40.455000Z", "shell.execute_reply": "2024-06-04T16:21:40.455000Z" } }, "outputs": [ { "data": { "text/plain": [ "false" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2[!, 1] === df[!, 1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " this can be avoided by using `copycols=false` keyword argument" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.457000Z", "iopub.status.busy": "2024-06-04T16:21:40.457000Z", "iopub.status.idle": "2024-06-04T16:21:40.459000Z", "shell.execute_reply": "2024-06-04T16:21:40.459000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×2 DataFrame
Rowx1x2
Tuple…Tuple…
1(1, 1)(1, 2)
2(2, 1)(2, 2)
3(3, 1)(3, 2)
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& x1 & x2\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) \\\\\n", "\t2 & (2, 1) & (2, 2) \\\\\n", "\t3 & (3, 1) & (3, 2) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────\n", " 1 │ (1, 1) (1, 2)\n", " 2 │ (2, 1) (2, 2)\n", " 3 │ (3, 1) (3, 2)" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = select(df, [1, 2], copycols=false)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.461000Z", "iopub.status.busy": "2024-06-04T16:21:40.460000Z", "iopub.status.idle": "2024-06-04T16:21:40.461000Z", "shell.execute_reply": "2024-06-04T16:21:40.461000Z" } }, "outputs": [ { "data": { "text/plain": [ "true" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2[!, 1] === df[!, 1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " using `select!` will modify the source data frame" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.463000Z", "iopub.status.busy": "2024-06-04T16:21:40.463000Z", "iopub.status.idle": "2024-06-04T16:21:40.464000Z", "shell.execute_reply": "2024-06-04T16:21:40.464000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×2 DataFrame
Rowx1x2
Tuple…Tuple…
1(1, 1)(1, 2)
2(2, 1)(2, 2)
3(3, 1)(3, 2)
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& x1 & x2\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) \\\\\n", "\t2 & (2, 1) & (2, 2) \\\\\n", "\t3 & (3, 1) & (3, 2) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────\n", " 1 │ (1, 1) (1, 2)\n", " 2 │ (2, 1) (2, 2)\n", " 3 │ (3, 1) (3, 2)" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select!(df, [1,2])" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.466000Z", "iopub.status.busy": "2024-06-04T16:21:40.466000Z", "iopub.status.idle": "2024-06-04T16:21:40.498000Z", "shell.execute_reply": "2024-06-04T16:21:40.498000Z" } }, "outputs": [ { "data": { "text/plain": [ "true" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df == df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Here we create a copy of `x` and delete the 3rd column from the copy with `select!` and `Not`." ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.501000Z", "iopub.status.busy": "2024-06-04T16:21:40.500000Z", "iopub.status.idle": "2024-06-04T16:21:40.511000Z", "shell.execute_reply": "2024-06-04T16:21:40.511000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x2x4x5
Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 4) (3, 5)" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "z = copy(x)\n", "select!(z, Not(3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " alternatively we can achieve the same by using the `select` function" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.513000Z", "iopub.status.busy": "2024-06-04T16:21:40.513000Z", "iopub.status.idle": "2024-06-04T16:21:40.515000Z", "shell.execute_reply": "2024-06-04T16:21:40.515000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x2x4x5
Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 4) (3, 5)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(x, Not(3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " `x` stays unchanged" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.517000Z", "iopub.status.busy": "2024-06-04T16:21:40.517000Z", "iopub.status.idle": "2024-06-04T16:21:40.518000Z", "shell.execute_reply": "2024-06-04T16:21:40.518000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4x5
Tuple…Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 3)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 3)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Views\n", " Note, that you can also create a view of a DataFrame when we want a subset of its columns:" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:40.520000Z", "iopub.status.busy": "2024-06-04T16:21:40.520000Z", "iopub.status.idle": "2024-06-04T16:21:41.844000Z", "shell.execute_reply": "2024-06-04T16:21:41.844000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 892.556 ns (18 allocations: 1.67 KiB)\n" ] }, { "data": { "text/html": [ "
3×3 DataFrame
Rowx1x3x5
Tuple…Tuple…Tuple…
1(1, 1)(1, 3)(1, 5)
2(2, 1)(2, 3)(2, 5)
3(3, 1)(3, 3)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x1 & x3 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 3) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 3) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 3) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────\n", " 1 │ (1, 1) (1, 3) (1, 5)\n", " 2 │ (2, 1) (2, 3) (2, 5)\n", " 3 │ (3, 1) (3, 3) (3, 5)" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@btime x[:, [1,3,5]]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:41.847000Z", "iopub.status.busy": "2024-06-04T16:21:41.847000Z", "iopub.status.idle": "2024-06-04T16:21:44.547000Z", "shell.execute_reply": "2024-06-04T16:21:44.547000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 264.758 ns (3 allocations: 240 bytes)\n" ] }, { "data": { "text/html": [ "
3×3 SubDataFrame
Rowx1x3x5
Tuple…Tuple…Tuple…
1(1, 1)(1, 3)(1, 5)
2(2, 1)(2, 3)(2, 5)
3(3, 1)(3, 3)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x1 & x3 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 3) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 3) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 3) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 SubDataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────\n", " 1 │ (1, 1) (1, 3) (1, 5)\n", " 2 │ (2, 1) (2, 3) (2, 5)\n", " 3 │ (3, 1) (3, 3) (3, 5)" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@btime @view x[:, [1,3,5]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " (now creation of the `view` is slow, but in the coming releases of the `DataFrames.jl` package it will become significantly faster)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Modify column by name" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.550000Z", "iopub.status.busy": "2024-06-04T16:21:44.550000Z", "iopub.status.idle": "2024-06-04T16:21:44.579000Z", "shell.execute_reply": "2024-06-04T16:21:44.579000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4x5
Tuple…Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 3)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 3)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = DataFrame([(i,j) for i in 1:3, j in 1:5], :auto)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " With the following syntax, the existing column is modified without performing any copying (this is discouraged as it creates column alias)." ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.582000Z", "iopub.status.busy": "2024-06-04T16:21:44.582000Z", "iopub.status.idle": "2024-06-04T16:21:44.583000Z", "shell.execute_reply": "2024-06-04T16:21:44.583000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4x5
Tuple…Tuple…Tuple…Tuple…Tuple…
1(1, 2)(1, 2)(1, 3)(1, 4)(1, 5)
2(2, 2)(2, 2)(2, 3)(2, 4)(2, 5)
3(3, 2)(3, 2)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 2) & (1, 2) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 2) & (2, 2) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 2) & (3, 2) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────────────\n", " 1 │ (1, 2) (1, 2) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 2) (2, 2) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 2) (3, 2) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, :x1] = x[!, :x2]\n", "x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " this syntax is safer" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.585000Z", "iopub.status.busy": "2024-06-04T16:21:44.585000Z", "iopub.status.idle": "2024-06-04T16:21:44.598000Z", "shell.execute_reply": "2024-06-04T16:21:44.598000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Tuple{Int64, Int64}}:\n", " (1, 2)\n", " (2, 2)\n", " (3, 2)" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, :x1] = x[:, :x2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We can also use the following syntax to add a new column at the end of a `DataFrame`." ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.600000Z", "iopub.status.busy": "2024-06-04T16:21:44.600000Z", "iopub.status.idle": "2024-06-04T16:21:44.602000Z", "shell.execute_reply": "2024-06-04T16:21:44.602000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×6 DataFrame
Rowx1x2x3x4x5A
Tuple…Tuple…Tuple…Tuple…Tuple…Int64
1(1, 2)(1, 2)(1, 3)(1, 4)(1, 5)1
2(2, 2)(2, 2)(2, 3)(2, 4)(2, 5)2
3(3, 2)(3, 2)(3, 3)(3, 4)(3, 5)3
" ], "text/latex": [ "\\begin{tabular}{r|cccccc}\n", "\t& x1 & x2 & x3 & x4 & x5 & A\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple… & Int64\\\\\n", "\t\\hline\n", "\t1 & (1, 2) & (1, 2) & (1, 3) & (1, 4) & (1, 5) & 1 \\\\\n", "\t2 & (2, 2) & (2, 2) & (2, 3) & (2, 4) & (2, 5) & 2 \\\\\n", "\t3 & (3, 2) & (3, 2) & (3, 3) & (3, 4) & (3, 5) & 3 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×6 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m A \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼───────────────────────────────────────────────\n", " 1 │ (1, 2) (1, 2) (1, 3) (1, 4) (1, 5) 1\n", " 2 │ (2, 2) (2, 2) (2, 3) (2, 4) (2, 5) 2\n", " 3 │ (3, 2) (3, 2) (3, 3) (3, 4) (3, 5) 3" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x[!, :A] = [1,2,3]\n", "x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " A new column name will be added to our `DataFrame` with the following syntax as well:" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.604000Z", "iopub.status.busy": "2024-06-04T16:21:44.604000Z", "iopub.status.idle": "2024-06-04T16:21:44.643000Z", "shell.execute_reply": "2024-06-04T16:21:44.642000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×7 DataFrame
Rowx1x2x3x4x5AB
Tuple…Tuple…Tuple…Tuple…Tuple…Int64Int64
1(1, 2)(1, 2)(1, 3)(1, 4)(1, 5)111
2(2, 2)(2, 2)(2, 3)(2, 4)(2, 5)212
3(3, 2)(3, 2)(3, 3)(3, 4)(3, 5)313
" ], "text/latex": [ "\\begin{tabular}{r|ccccccc}\n", "\t& x1 & x2 & x3 & x4 & x5 & A & B\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple… & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & (1, 2) & (1, 2) & (1, 3) & (1, 4) & (1, 5) & 1 & 11 \\\\\n", "\t2 & (2, 2) & (2, 2) & (2, 3) & (2, 4) & (2, 5) & 2 & 12 \\\\\n", "\t3 & (3, 2) & (3, 2) & (3, 3) & (3, 4) & (3, 5) & 3 & 13 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×7 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m A \u001b[0m\u001b[1m B \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────────────────────────────────────────────\n", " 1 │ (1, 2) (1, 2) (1, 3) (1, 4) (1, 5) 1 11\n", " 2 │ (2, 2) (2, 2) (2, 3) (2, 4) (2, 5) 2 12\n", " 3 │ (3, 2) (3, 2) (3, 3) (3, 4) (3, 5) 3 13" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.B = 11:13\n", "x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Find column name" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.645000Z", "iopub.status.busy": "2024-06-04T16:21:44.645000Z", "iopub.status.idle": "2024-06-04T16:21:44.669000Z", "shell.execute_reply": "2024-06-04T16:21:44.669000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4x5
Tuple…Tuple…Tuple…Tuple…Tuple…
1(1, 1)(1, 2)(1, 3)(1, 4)(1, 5)
2(2, 1)(2, 2)(2, 3)(2, 4)(2, 5)
3(3, 1)(3, 2)(3, 3)(3, 4)(3, 5)
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & x5\\\\\n", "\t\\hline\n", "\t& Tuple… & Tuple… & Tuple… & Tuple… & Tuple…\\\\\n", "\t\\hline\n", "\t1 & (1, 1) & (1, 2) & (1, 3) & (1, 4) & (1, 5) \\\\\n", "\t2 & (2, 1) & (2, 2) & (2, 3) & (2, 4) & (2, 5) \\\\\n", "\t3 & (3, 1) & (3, 2) & (3, 3) & (3, 4) & (3, 5) \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\n", " │\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\u001b[90m Tuple… \u001b[0m\n", "─────┼────────────────────────────────────────\n", " 1 │ (1, 1) (1, 2) (1, 3) (1, 4) (1, 5)\n", " 2 │ (2, 1) (2, 2) (2, 3) (2, 4) (2, 5)\n", " 3 │ (3, 1) (3, 2) (3, 3) (3, 4) (3, 5)" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = DataFrame([(i,j) for i in 1:3, j in 1:5], :auto)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " We can check if a column with a given name exists via" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.672000Z", "iopub.status.busy": "2024-06-04T16:21:44.672000Z", "iopub.status.idle": "2024-06-04T16:21:44.678000Z", "shell.execute_reply": "2024-06-04T16:21:44.678000Z" } }, "outputs": [ { "data": { "text/plain": [ "true" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hasproperty(x, :x1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " and determine its index via" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.680000Z", "iopub.status.busy": "2024-06-04T16:21:44.680000Z", "iopub.status.idle": "2024-06-04T16:21:44.824000Z", "shell.execute_reply": "2024-06-04T16:21:44.824000Z" } }, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columnindex(x, :x2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Advanced ways of column selection\n", " these are most useful for non-standard column names (e.g. containing spaces)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.827000Z", "iopub.status.busy": "2024-06-04T16:21:44.826000Z", "iopub.status.idle": "2024-06-04T16:21:44.866000Z", "shell.execute_reply": "2024-06-04T16:21:44.866000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×2 DataFrame
Rowx1column 2
Int64Int64
114
225
336
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& x1 & column 2\\\\\n", "\t\\hline\n", "\t& Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 \\\\\n", "\t2 & 2 & 5 \\\\\n", "\t3 & 3 & 6 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m column 2 \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼─────────────────\n", " 1 │ 1 4\n", " 2 │ 2 5\n", " 3 │ 3 6" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame()\n", "df.x1 = 1:3\n", "df[!, \"column 2\"] = 4:6\n", "df" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:44.869000Z", "iopub.status.busy": "2024-06-04T16:21:44.869000Z", "iopub.status.idle": "2024-06-04T16:21:45.124000Z", "shell.execute_reply": "2024-06-04T16:21:45.124000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Int64}:\n", " 4\n", " 5\n", " 6" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.\"column 2\"" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.126000Z", "iopub.status.busy": "2024-06-04T16:21:45.126000Z", "iopub.status.idle": "2024-06-04T16:21:45.138000Z", "shell.execute_reply": "2024-06-04T16:21:45.138000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Int64}:\n", " 4\n", " 5\n", " 6" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[:, \"column 2\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " or you can interpolate column name using `:()` syntax" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.140000Z", "iopub.status.busy": "2024-06-04T16:21:45.140000Z", "iopub.status.idle": "2024-06-04T16:21:45.273000Z", "shell.execute_reply": "2024-06-04T16:21:45.273000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "x1\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1, 2, 3]\n", "\n", "column 2\n", "[4, 5, 6]\n", "\n" ] } ], "source": [ "for n in names(df)\n", " println(n, \"\\n\", df.:($n), \"\\n\")\n", "end" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Working on a collection of columns\n", " When using `eachcol` of a data frame the resulting object retains reference to its parent and e.g. can be queried with `getproperty`" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.276000Z", "iopub.status.busy": "2024-06-04T16:21:45.276000Z", "iopub.status.idle": "2024-06-04T16:21:45.345000Z", "shell.execute_reply": "2024-06-04T16:21:45.345000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x2x3x4
Int64Int64Int64Int64
114710
225811
336912
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 & 7 & 10 \\\\\n", "\t2 & 2 & 5 & 8 & 11 \\\\\n", "\t3 & 3 & 6 & 9 & 12 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ 1 4 7 10\n", " 2 │ 2 5 8 11\n", " 3 │ 3 6 9 12" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame(reshape(1:12, 3, 4), :auto)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.347000Z", "iopub.status.busy": "2024-06-04T16:21:45.347000Z", "iopub.status.idle": "2024-06-04T16:21:45.750000Z", "shell.execute_reply": "2024-06-04T16:21:45.750000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrameColumns
Rowx1x2x3x4
Int64Int64Int64Int64
114710
225811
336912
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 & 7 & 10 \\\\\n", "\t2 & 2 & 5 & 8 & 11 \\\\\n", "\t3 & 3 & 6 & 9 & 12 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrameColumns\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ 1 4 7 10\n", " 2 │ 2 5 8 11\n", " 3 │ 3 6 9 12" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ec_df = eachcol(df)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.753000Z", "iopub.status.busy": "2024-06-04T16:21:45.753000Z", "iopub.status.idle": "2024-06-04T16:21:45.759000Z", "shell.execute_reply": "2024-06-04T16:21:45.759000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Int64}:\n", " 1\n", " 2\n", " 3" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ec_df[1]" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.762000Z", "iopub.status.busy": "2024-06-04T16:21:45.762000Z", "iopub.status.idle": "2024-06-04T16:21:45.767000Z", "shell.execute_reply": "2024-06-04T16:21:45.767000Z" } }, "outputs": [ { "data": { "text/plain": [ "3-element Vector{Int64}:\n", " 1\n", " 2\n", " 3" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ec_df.x1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Transforming columns\n", "\n", "We will get to this subject later in 10_transforms.ipynb notebook, but here let us just note that `select`, `select!`, `transform`, `transform!` and `combine` functions allow to generate new columns based on the old columns of a data frame.\n", "\n", "The general rules are the following:\n", "* `select` and `transform` always return the number of rows equal to the source data frame, while `combine` returns any number of rows (`combine` is allowed to *combine* rows of the source data frame)\n", "* `transform` retains columns from the old data frame\n", "* `select!` and `transform!` are in-place versions of `select` and `transform`" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.769000Z", "iopub.status.busy": "2024-06-04T16:21:45.769000Z", "iopub.status.idle": "2024-06-04T16:21:45.771000Z", "shell.execute_reply": "2024-06-04T16:21:45.771000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×4 DataFrame
Rowx1x2x3x4
Int64Int64Int64Int64
114710
225811
336912
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& x1 & x2 & x3 & x4\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 & 7 & 10 \\\\\n", "\t2 & 2 & 5 & 8 & 11 \\\\\n", "\t3 & 3 & 6 & 9 & 12 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×4 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ 1 4 7 10\n", " 2 │ 2 5 8 11\n", " 3 │ 3 6 9 12" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame(reshape(1:12, 3, 4), :auto)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we add a new column `:res` that is a sum of columns `:x1` and `:x2`. A general syntax of transformations of this kind is:\n", "\n", "```\n", "source_columns => function_to_apply => target_column_name\n", "```\n", "then `function_to_apply` gets columns selected by `source_columns` as positional arguments." ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.773000Z", "iopub.status.busy": "2024-06-04T16:21:45.773000Z", "iopub.status.idle": "2024-06-04T16:21:45.989000Z", "shell.execute_reply": "2024-06-04T16:21:45.989000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×5 DataFrame
Rowx1x2x3x4res
Int64Int64Int64Int64Int64
1147105
2258117
3369129
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& x1 & x2 & x3 & x4 & res\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 & 7 & 10 & 5 \\\\\n", "\t2 & 2 & 5 & 8 & 11 & 7 \\\\\n", "\t3 & 3 & 6 & 9 & 12 & 9 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m res \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼───────────────────────────────────\n", " 1 │ 1 4 7 10 5\n", " 2 │ 2 5 8 11 7\n", " 3 │ 3 6 9 12 9" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transform(df, [:x1, :x2] => (+) => :res)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " One can omit passing target_column_name in which case it is automatically generated:" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.991000Z", "iopub.status.busy": "2024-06-04T16:21:45.991000Z", "iopub.status.idle": "2024-06-04T16:21:45.992000Z", "shell.execute_reply": "2024-06-04T16:21:45.992000Z" } }, "outputs": [], "source": [ "using Statistics" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:45.994000Z", "iopub.status.busy": "2024-06-04T16:21:45.993000Z", "iopub.status.idle": "2024-06-04T16:21:46.103000Z", "shell.execute_reply": "2024-06-04T16:21:46.103000Z" } }, "outputs": [ { "data": { "text/html": [ "
1×1 DataFrame
Rowx1_x2_cor
Float64
11.0
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& x1\\_x2\\_cor\\\\\n", "\t\\hline\n", "\t& Float64\\\\\n", "\t\\hline\n", "\t1 & 1.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m1×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1_x2_cor \u001b[0m\n", " │\u001b[90m Float64 \u001b[0m\n", "─────┼───────────\n", " 1 │ 1.0" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combine(df, [:x1, :x2] => cor)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that `combine` allowed the number of columns in the resulting data frame to be changed. If we used `select` instead it would automatically broadcast the return value to match the number of rouws of the source:" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:46.106000Z", "iopub.status.busy": "2024-06-04T16:21:46.106000Z", "iopub.status.idle": "2024-06-04T16:21:46.110000Z", "shell.execute_reply": "2024-06-04T16:21:46.110000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×1 DataFrame
Rowx1_x2_cor
Float64
11.0
21.0
31.0
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& x1\\_x2\\_cor\\\\\n", "\t\\hline\n", "\t& Float64\\\\\n", "\t\\hline\n", "\t1 & 1.0 \\\\\n", "\t2 & 1.0 \\\\\n", "\t3 & 1.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1_x2_cor \u001b[0m\n", " │\u001b[90m Float64 \u001b[0m\n", "─────┼───────────\n", " 1 │ 1.0\n", " 2 │ 1.0\n", " 3 │ 1.0" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, [:x1, :x2] => cor)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " If you want to apply some function on each row of the source wrap it in `ByRow`:" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:46.112000Z", "iopub.status.busy": "2024-06-04T16:21:46.112000Z", "iopub.status.idle": "2024-06-04T16:21:46.241000Z", "shell.execute_reply": "2024-06-04T16:21:46.241000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×3 DataFrame
Rowx1x2x1_x2_string
Int64Int64String
11414
22525
33636
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x1 & x2 & x1\\_x2\\_string\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & String\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 & 14 \\\\\n", "\t2 & 2 & 5 & 25 \\\\\n", "\t3 & 3 & 6 & 36 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x1_x2_string \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m String \u001b[0m\n", "─────┼────────────────────────────\n", " 1 │ 1 4 14\n", " 2 │ 2 5 25\n", " 3 │ 3 6 36" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, :x1, :x2, [:x1, :x2] => ByRow(string))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Also if you want columns to be passed as a `NamedTuple` to a funcion (instead of being positional arguments) wrap them in `AsTable`:" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:46.244000Z", "iopub.status.busy": "2024-06-04T16:21:46.243000Z", "iopub.status.idle": "2024-06-04T16:21:46.383000Z", "shell.execute_reply": "2024-06-04T16:21:46.383000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×3 DataFrame
Rowx1x2x1_x2_function
Int64Int64Int64
1145
2257
3369
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x1 & x2 & x1\\_x2\\_function\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 4 & 5 \\\\\n", "\t2 & 2 & 5 & 7 \\\\\n", "\t3 & 3 & 6 & 9 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x1_x2_function \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼──────────────────────────────\n", " 1 │ 1 4 5\n", " 2 │ 2 5 7\n", " 3 │ 3 6 9" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, :x1, :x2, AsTable([:x1, :x2]) => x -> x.x1 + x.x2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " For simplicity (as this functionality is often needed) there is a special treatement of nrow function that can be just passed as a transformation (without specifying of column selector):" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:46.386000Z", "iopub.status.busy": "2024-06-04T16:21:46.386000Z", "iopub.status.idle": "2024-06-04T16:21:46.416000Z", "shell.execute_reply": "2024-06-04T16:21:46.415000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×2 DataFrame
Rowx1number_of_rows
Int64Int64
113
223
333
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& x1 & number\\_of\\_rows\\\\\n", "\t\\hline\n", "\t& Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 3 \\\\\n", "\t2 & 2 & 3 \\\\\n", "\t3 & 3 & 3 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×2 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m number_of_rows \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼───────────────────────\n", " 1 │ 1 3\n", " 2 │ 2 3\n", " 3 │ 3 3" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, :x1, nrow => \"number_of_rows\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(note that in `select` the number of rows is automatically broadcasted to match the number of rows of the source data frame)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally you can conveninently create multiple columns with one function, e.g.:" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:46.418000Z", "iopub.status.busy": "2024-06-04T16:21:46.418000Z", "iopub.status.idle": "2024-06-04T16:21:46.603000Z", "shell.execute_reply": "2024-06-04T16:21:46.603000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×3 DataFrame
Rowx1x1²x1³
Int64Int64Int64
1111
2248
33927
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x1 & x1² & x1³\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 1 & 1 \\\\\n", "\t2 & 2 & 4 & 8 \\\\\n", "\t3 & 3 & 9 & 27 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x1² \u001b[0m\u001b[1m x1³ \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼─────────────────────\n", " 1 │ 1 1 1\n", " 2 │ 2 4 8\n", " 3 │ 3 9 27" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, :x1, :x1 => ByRow(x -> [x ^ 2, x ^ 3]) => [\"x1²\", \"x1³\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "or e.g. (this produces the same result)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:46.606000Z", "iopub.status.busy": "2024-06-04T16:21:46.606000Z", "iopub.status.idle": "2024-06-04T16:21:46.798000Z", "shell.execute_reply": "2024-06-04T16:21:46.798000Z" } }, "outputs": [ { "data": { "text/html": [ "
3×3 DataFrame
Rowx1x1²x1³
Int64Int64Int64
1111
2248
33927
" ], "text/latex": [ "\\begin{tabular}{r|ccc}\n", "\t& x1 & x1² & x1³\\\\\n", "\t\\hline\n", "\t& Int64 & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & 1 & 1 & 1 \\\\\n", "\t2 & 2 & 4 & 8 \\\\\n", "\t3 & 3 & 9 & 27 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m3×3 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x1² \u001b[0m\u001b[1m x1³ \u001b[0m\n", " │\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼─────────────────────\n", " 1 │ 1 1 1\n", " 2 │ 2 4 8\n", " 3 │ 3 9 27" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, :x1, :x1 => (x -> DataFrame(\"x1²\" => x .^ 2, \"x1³\" => x .^ 3)) => AsTable)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Note that since `DataFrames.jl` row aggregation for wide tables is efficient. Here is an example of a wide table with `sum` (other standard reductions are similarly supported):" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:46.801000Z", "iopub.status.busy": "2024-06-04T16:21:46.801000Z", "iopub.status.idle": "2024-06-04T16:21:47.467000Z", "shell.execute_reply": "2024-06-04T16:21:47.467000Z" } }, "outputs": [ { "data": { "text/html": [ "
1000×10000 DataFrame
9900 columns and 975 rows omitted
Rowx1x2x3x4x5x6x7x8x9x10x11x12x13x14x15x16x17x18x19x20x21x22x23x24x25x26x27x28x29x30x31x32x33x34x35x36x37x38x39x40x41x42x43x44x45x46x47x48x49x50x51x52x53x54x55x56x57x58x59x60x61x62x63x64x65x66x67x68x69x70x71x72x73x74x75x76x77x78x79x80x81x82x83x84x85x86x87x88x89x90x91x92x93x94x95x96x97x98x99x100
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.01490880.1779630.3749750.6940630.2905850.2969720.1513780.08950860.1603650.349830.6209360.5234520.3388730.3458760.05543050.3189050.7314030.05129640.4102980.7042510.7647490.4339430.2853340.2728520.3471780.4183850.2440380.805040.4398980.7495060.7862460.9009650.2959960.7468680.4228290.2340340.5772030.1468460.7937040.4878940.6312790.671290.1064740.5286070.3078370.5020220.02957610.2920530.4294360.5653430.4323890.2281830.4903890.1781540.5963350.8961940.8744150.6329390.764080.5919130.8102140.5478530.05683750.6084920.9391050.8584010.541320.523860.07267980.09296970.7351070.6934310.359520.3038670.1599230.02736590.5807280.1467140.5166030.1922050.7732470.4632750.6920370.7214640.3500540.7089890.03660430.7322360.8491920.3798130.3890030.003134870.5770760.6443050.8489680.9012290.02070030.5802290.1135570.426518
20.5203550.6701220.3878570.7243830.08002280.2485480.9113630.368890.5297520.9107710.9971180.254050.6533310.7788020.2081010.1472630.7241270.4660170.3151770.1082080.9137130.2874050.2430880.8808040.6882040.2557060.9623440.4700520.0009567120.9142090.7931580.2822260.2122360.3478330.6622260.4116960.9061730.06481590.3820690.1745680.8525590.8615920.9184370.3751040.3430010.5582250.6838410.237440.241660.3294520.8280580.1609710.8128280.8857880.7679270.6489150.936280.2556110.3828330.8322440.7036720.9839960.2803330.5414430.8014070.441690.9328410.7989060.4253530.6723730.9498150.2636680.03770620.5580030.2978950.5264420.6986340.2641020.837790.2751020.05066710.1228030.7684890.7921940.6120940.3013530.7713680.58320.4918790.03271380.8975390.04594180.8089510.152950.2715820.3284060.8118240.08135240.008512120.407592
30.6395620.04236180.7795940.1304530.1738120.5254160.3249650.9056490.08638950.5045120.8313620.6093320.3626460.2964890.1242940.4454280.3838770.7019390.8991780.8020260.5963130.5504650.01302670.3056850.2429410.9767320.8557570.1853390.4925880.8864980.06703870.2246320.9731810.7004970.9276050.2447540.5528130.2326830.5359750.7566540.6635620.4173510.6107870.322490.9779890.9593340.8256860.9332890.154890.4275860.9211590.04086990.1787350.1828790.2998490.6466160.1639780.6851450.8974060.6898550.4309540.6952620.004899710.6847370.1123930.6206270.3290970.5680490.534150.3684770.7137370.7916970.04364330.3567150.294340.84890.739440.5770090.3696290.03087080.06324580.6681240.4810550.6293090.933240.1075620.9824910.4534320.2815260.3728780.8371150.1964140.7764460.1919240.2566920.07578170.2743210.842960.8990230.00600951
40.8396220.7406990.3609690.4569340.3980930.5094470.3354320.3739520.3986620.8511820.4028620.8273480.2220650.6463950.1399920.7205840.06818180.4522830.6554160.9198540.6342870.5524840.398180.7220390.87560.4487690.5547550.6135620.7102130.004652150.9637630.1114640.5792450.07833180.8116020.05868960.1213530.4514260.4160950.1678020.2963370.680250.8650830.4555780.9849310.6992770.1136610.01724990.3897970.7314380.1798540.1818490.647220.737550.03871120.1631760.58310.6183150.8581660.4874370.6498560.5104680.2653830.7163450.8589970.4210350.712270.2109160.07283640.1150680.9233330.8720.3942730.5758640.9478650.5115220.006052840.8185240.2269280.9497360.7573840.660790.8274280.5681930.004215990.125460.5850290.315640.9814350.2297520.8261710.1335240.2869980.5447420.706280.612950.1305090.4735670.4109990.685049
50.9671430.3026720.7069020.06532160.6444170.9764570.368950.805530.7049880.2291580.4220510.08350240.06009020.1395510.8512710.5131280.41640.75230.1737360.1893010.3895650.7017140.7432980.4396590.6463570.05396690.8354870.6329890.3745620.2668120.3446140.09878430.2050530.01216580.7141310.947080.1611880.311770.5837150.8875390.3375760.06478350.781730.7964170.4266540.2647260.103210.3416790.1805020.3477920.1420750.1596540.8470520.5536830.5483420.8988750.1475090.349160.491930.8301470.759180.295230.5521790.3954060.2035920.3554770.1289120.4606290.9747180.5987440.5361910.492890.3374270.2668630.5988620.04807370.09735540.09529450.3279030.339650.9423630.4249440.6098470.04311330.9383130.6785120.2469450.7585310.1431850.541490.2907750.8540970.59340.04937150.3082890.9326130.1000650.831170.9640440.532569
60.2051680.6831280.7869090.8196350.7596630.5880190.2440110.8881360.7943670.3608230.1992590.8268670.7612330.04484820.8560250.9013080.5409440.5607510.3522950.7675410.443670.6784690.199660.7325220.5025410.5974130.6569330.3014440.1812410.1575320.7140830.1718520.422790.6993740.135070.6828530.5435540.08204180.2707090.8358110.3221010.3838210.399420.216420.6895870.2901820.07423310.7271360.2082750.4853640.4324170.6432450.1242930.8127270.3354840.01944310.1438560.2547430.479930.002363640.8963750.5287120.6008940.07055820.417250.10680.2674780.7455480.1298490.2693690.9683780.733230.2509920.07119080.7637790.8784080.6958540.8009810.08221550.2288320.6402360.7395130.6291270.1754190.6751910.8514470.8417810.7487650.8971750.5184250.5681430.7922760.5618120.599420.9275270.1053340.3250560.06151280.6681080.584317
70.5271840.229610.3957370.7128880.2862810.07841370.2143710.6936810.8127410.8501440.6339950.3803880.5134860.1042720.8941090.7746820.795410.9512640.2533080.2944830.4574230.152260.5173390.9598090.2618880.5622370.1253230.8833580.3015980.04224020.3551240.6420060.6916820.9116350.537460.5962860.3853530.6634630.5815720.3186940.4986420.5254280.5494030.9764010.1441090.5723320.3313460.6416370.9346120.9288310.6426980.255210.301130.6986480.7888030.933380.1888070.3906410.9168190.5170610.8440720.9029470.3227150.1589130.8491920.09437180.5398870.8537650.109270.5588120.4004480.9713530.8309610.3023480.6665660.176530.2345760.5530720.8047930.6611140.7945410.4953110.6968570.2689420.7920130.7398650.3058980.7847170.1546620.9507770.8107380.9917640.1801390.9543130.2973430.4369540.9784160.323790.4246630.877706
80.9511620.9637810.1764820.7511490.3943660.9458450.627040.967740.1033180.6749380.5501640.7174220.6060480.2958660.5389890.133010.9392150.4180930.6491850.9830290.5788340.05523650.1549840.5904660.1087760.4491780.2487310.1746720.9834690.215930.8209170.5566170.4293420.4368620.4354030.0410610.08404430.6519380.8182010.8399260.9951710.780290.02998240.6064470.4673010.8388080.6247650.6143410.7405680.3084660.1608960.6828420.5568880.07691520.8210160.4167310.3929410.230320.4250420.3616740.5704860.3834190.8125830.4964010.5389820.6335570.9947180.8938410.792990.9615180.6278310.04259850.4241320.869620.782640.1165150.7519770.08721140.1485020.1495640.7182680.7191930.3056530.7378010.06119810.2857030.834140.923050.08053780.4361710.03664270.4398640.7491470.9250190.5197980.5508350.2297060.4377170.458170.104438
90.5363690.1181790.9587580.1615650.7223480.355340.7349910.8593310.1613370.9374880.9529160.724840.5512360.9372950.9812650.4472060.9451090.6102490.6283140.542680.01209820.8505870.5691410.0686510.05700090.1841860.2288020.718540.4535990.2261910.3153660.7806550.03911080.253360.5021310.3340830.3248370.8172790.7265280.9679090.3033680.7831430.0144020.0481230.2843660.1581760.6650710.5126390.1064270.2811170.3408890.3365670.7367070.5109420.8068050.07419480.4525980.1163010.7007350.5754280.6563970.3162170.1826340.9344850.6022410.1623340.5829110.734930.6362530.5336440.8209710.05750130.6048680.5792630.2594230.8393580.5427880.4289020.9384940.6348350.08951910.5711680.6375380.7408230.9550970.1249680.1496340.9814640.1497910.4735990.2627070.9005750.6600740.5912480.2370990.1898810.9532650.665720.431520.147092
100.7113890.6610710.4505840.7912750.02253570.5824160.8326040.04542980.7977380.9545290.3726390.589310.8536590.04027390.5803990.3501030.7240240.1558390.432010.1240570.962060.006908460.7241260.8853010.8799990.2269020.2033640.6999190.7041270.735230.008554460.5133160.6830890.234650.5446890.7280270.6445090.0443260.7069120.2036730.1012730.2601130.08545560.2266710.9260630.9542350.1463360.6768730.2834070.6166870.5668450.3595890.1829910.5956470.5992480.2191120.3630720.03481810.1757310.7358060.7039810.3007250.7826690.7213190.4599280.5692510.8106150.5754240.4325980.8200520.5308840.16760.4587890.1546660.3457830.0005536440.9617730.5552880.1337190.6937370.4957090.1558710.210230.7917690.03135810.6387550.9397850.6433520.7541860.4572230.7498050.8395320.5536750.2820280.6821280.02564110.8962420.7163860.9493850.386708
110.1039290.4893570.9436540.9814980.9064750.276050.3198540.5054380.278440.7787940.1305140.1695510.5961710.4587980.958050.03284010.585010.8344430.2859720.5525480.7243970.7449550.04883140.3906390.4657570.1604630.701250.1948930.5266460.3658760.03723430.7264420.01759730.7698480.4513650.4010560.4207270.5604090.4717660.3372630.1452290.07916140.7586290.6983820.755890.6292020.9028280.6053230.0346450.4914090.04710050.4264150.5183520.220790.03643430.9652190.2391230.3047060.6545970.3118240.1756380.0904160.9776950.6443370.8567630.6371810.3739460.986650.4686340.9692940.08330850.4587310.8229410.6335680.2166470.6801090.05034430.2177520.551260.05491940.9681640.6382070.4645170.3833130.130810.5242530.3273970.01784290.8572950.7290450.5560460.3171410.8010090.3815320.7840440.7873650.3110330.4103490.5515430.933318
120.8067040.5979940.2933380.714640.8214850.4217530.5786130.2742160.4245390.3796790.8398050.9024560.2527990.5635330.230670.8065730.4028570.72250.7639320.6635620.8390980.0269740.6153830.1433340.4448090.3679430.5063250.03979440.6043280.3571310.4225440.04097950.5339260.5017250.8213950.3740210.7695640.6765750.4392140.7595250.8249620.583220.1150940.9062210.8265370.8297070.7383760.4188510.1917790.4692790.2227180.5234480.1298090.007488830.4376030.0944940.4935510.1916080.4499340.6539040.6844180.02238810.7540440.3390280.3849080.2623170.9120230.7542090.5068180.8258130.4606990.002810850.9415870.06137820.1697320.4030690.1557110.3227380.9731020.07621170.5696240.348440.3353650.5825990.1072470.6077990.5121980.006371440.3288290.7148540.3399140.9858190.5002610.5215520.283770.209550.4696040.6526630.6836210.746581
130.8705390.08669830.7209780.9579360.4109780.1326530.6653720.5933250.04659870.1350420.05700360.1217090.4305510.7249990.468810.6263540.3974520.2549750.5174910.3734710.3844630.8975560.2578180.1671950.5606330.7262550.4782310.1850420.6249790.7711540.9121020.1113550.4643010.02008430.1705370.3341580.5883370.9806490.3135080.8002890.6845360.8135310.7480870.6074040.8654260.3644550.4073240.7191720.474750.5434480.8841250.4957340.8957770.3882430.7880630.4492470.3260030.9757980.02604810.8830350.550420.8713670.162960.8792330.5906510.7126710.9188690.8723220.6244340.3496210.2167770.6056720.2264410.06574250.1177510.5009550.196950.09735360.09541750.03888310.6461060.1414210.974550.1001590.5419990.2501480.9741780.8193750.6137930.3020140.4720270.3291890.1770030.3732910.8123040.4338180.5979390.2836120.7812180.550587
9890.79380.4702030.931740.4415620.7998810.2143480.07262740.8027770.8852550.1857480.5849610.1589360.6309620.3205730.009540830.2826870.001156150.4594490.8231610.3472620.9779060.9411760.7953640.845130.8448410.5709350.5060170.9574060.3674650.528250.04978430.5550310.2152660.8211810.6227710.9890310.9164780.02753670.7711050.368260.4034220.5271750.1620780.6185890.9968440.1451870.7362590.01439420.2452680.899050.5245220.1743980.4775570.9519290.5017040.87890.1086540.4877520.2807160.8491470.5151930.4041840.555270.8703570.1381220.146830.2641910.7846210.965510.4831330.956820.9173520.7577550.2049850.5649660.172230.5212050.9659160.792940.1008910.4274090.3113640.4098010.6603530.2062060.1689820.3916060.719870.9861820.7089940.4807610.8156040.1610450.5344180.4715930.8372180.9059710.5889070.3074850.253986
9900.3355070.670850.2130060.7121960.4904690.8331140.5535660.1909720.9654110.7732960.1927890.776830.9233380.8490170.5280220.1705520.6001230.3647960.07726430.3439710.9115770.0384560.2367070.9084770.8097310.9461950.3971110.390110.009027160.197030.1033270.2134410.1491040.8395980.4536050.1640990.1776080.04883440.398130.5306430.5408760.2690650.7330040.0003074310.5090960.1583820.5655690.9173480.00362530.3175660.6033790.0688040.7948970.8384350.4162570.7341790.9584650.1752420.919460.4217710.997840.03274370.3490290.2249890.7882290.3106570.6265780.3434020.0702370.737710.9766490.7528710.7621470.1554860.5984840.526280.004752030.8131560.6487260.9603130.9173820.8023920.8469520.9223170.02808450.4084110.4304330.2944350.846640.8220030.7624440.3367040.2922040.3712340.3007880.2653860.191830.1561330.1229740.62161
9910.3186990.8556210.897890.1860050.7571650.5375270.5234550.8107020.4136440.2904250.9580960.5787650.634850.2622680.2459430.4970270.3668440.4148370.3461890.4067630.8119950.4500490.8898670.09758640.6984960.4699120.2113480.2702570.5112980.3246560.5672150.08822010.204430.8497210.1390730.3922520.1221650.5451880.6388740.1246670.7841960.6822750.8601030.7877960.2863250.8679090.9883580.2459360.2142670.05649610.2073050.7453150.5524970.535830.5194480.84160.8069090.4444270.1757870.7448330.2260120.8673890.5232750.7532080.9714460.9304040.04431510.7394930.1770520.4118510.5652020.2191970.7410810.8494910.9237640.1609240.1507550.7274820.3478770.9962830.9529410.819760.4048140.9497740.7260880.6390140.2982940.4909150.7180630.5522940.3848310.5493580.7243980.6266410.000120280.8545450.1954790.01221660.4950490.930874
9920.3855260.08115590.4796140.1633610.01020840.6495460.6576040.7200420.4658670.940450.7826270.6913670.06627540.7327910.5745790.3815990.07080260.6675650.5312750.5841480.1997240.4557640.6072530.9108830.7379460.7037310.8498840.4959020.2422810.8312060.2071930.0642640.001314960.9599670.3124190.5955870.2018430.00753840.8467420.8584280.5791440.4342240.3333090.6265360.5907390.06460440.4627750.4905850.7405370.5920940.1373670.5491060.8698870.7057170.7175860.3639970.3273150.6546020.2858810.737580.4769540.4390740.7729460.7748090.5586330.3596760.2496010.3108650.8836220.5949760.5907580.4941720.977840.5159250.1960560.7591240.2212630.5108250.683410.5443710.5263240.1923380.6470010.4226770.8335590.4556620.7750890.5544090.8625950.6043110.3480850.5560810.002587740.9247990.8405980.2396180.4703940.9673580.0383750.878066
9930.6784580.5571080.2027570.4933110.3431170.9229080.2612420.3573860.4587010.4187940.2822340.1840720.7783290.1196010.07609580.2777570.8939470.04986490.600480.6260550.1803510.6093280.4489550.5753810.5890450.4706280.06277380.9179570.1397590.5865530.9047030.5038820.982790.6782520.2798910.6015910.3121270.564540.3289360.3344090.8883210.5505560.9182260.02622430.4441250.3874280.08302770.5016430.2611010.7238110.3059190.609670.4616390.0913410.5537380.9856210.4753260.9051710.4469040.8010630.9337020.5260140.4914010.1031810.6698050.2066020.9295050.4053480.8807240.1347530.9199210.1164270.5854080.2636110.1485530.8805330.2919080.892870.812730.2305380.01105480.2151580.1882720.317420.3401020.5782170.4899380.7603350.6636210.3123150.1085640.6510620.04751740.9260330.8231370.3004540.3082430.133660.8658410.593067
9940.4242950.780120.5776810.2742280.2255630.295530.3959570.2601420.2779490.4636440.2427190.8300150.5918770.8626640.413730.3659880.09364570.6450050.7094680.1673420.1951790.2453880.6659870.6520010.3822510.5664730.009394040.8196530.2887550.4163320.6749780.0263840.5804840.2513980.1203240.5875030.3620910.2793150.2871510.02360910.5378090.825360.531020.1429530.4212910.6300390.08828460.7585580.2895190.4219270.8336340.5338170.9839970.01736480.09529760.6069370.6904770.3962250.8873280.4847740.4215530.4400670.6672650.3146310.3991970.8748440.1517150.5434290.5623910.7276460.7438610.4030230.5939450.2757990.04171080.8379030.3431060.3357990.323360.9121030.7592960.5518820.676610.3357740.992120.8862190.1922320.5068660.1470.7200160.4110170.8044180.8339630.5889140.9705180.7232640.1737120.6348280.579760.206616
9950.4815540.04418820.7001810.3176770.08467610.9491350.4573410.5715960.06401470.7812810.4328660.1192520.3952970.2860580.5126660.288920.8890650.3922830.6253260.6389380.2676460.7533920.6210890.06586050.594950.3438180.7706930.944880.2279650.609150.2968140.5297010.2071360.9590520.3836380.8924410.621960.1757270.5495170.1378180.9187470.5469680.6684770.4447440.9310120.1480760.5078960.6923970.9429580.8299740.5067190.6677150.3824460.3890340.5923720.2111120.05361740.4351040.6546130.09852650.7244920.1201030.1932830.1118220.1362940.8710790.4063650.1538370.4921830.07079540.02700110.8543670.768910.9607380.2564920.9018380.4570210.08120450.4421280.3974390.517910.757720.3237110.7798310.3319370.8963850.1516540.7528830.611110.2335290.4301810.1623690.6857530.7345980.7872280.3272470.5587460.6329980.5929220.329131
9960.4015280.6589030.4383090.2112170.2115150.4419020.02809030.4127170.1808550.6281660.9492950.02346950.6268410.706880.1976210.2164840.6077820.7696420.9499580.2493060.4810140.6978220.6818570.9208420.3917990.5672470.5814680.2750730.04149210.9606410.1203290.9550940.845510.5406470.0722970.4236890.5455050.2879230.2812480.6587820.4813550.5863610.5277560.6949760.7728770.877870.7820640.9810360.3934830.4788840.6737060.57270.5188010.5185920.7873060.322250.6681030.2346360.365280.9546820.9381050.9596230.1513420.9234380.3139450.005959740.9471030.8521350.1783260.5462210.362110.1433010.3712190.724530.1810770.5913350.8778930.7172750.3830120.1821250.07687440.9493920.4377710.1330590.9847110.7257450.921190.1498440.5910160.2998390.7939630.4350890.7397250.2333210.03057350.9453210.02681550.07563520.5713330.623045
9970.589850.9098570.3547350.4428290.4383220.8478350.8431310.7276180.0435370.29580.06261680.03758870.4805410.1873240.1596190.6818080.3824270.78190.1939150.9187420.3397730.9965880.07503270.6673840.8139430.6487440.7784510.3524970.8052570.9503760.2866050.8614750.1371770.5868760.7472030.5624110.6337590.007052070.5446060.8187140.7403850.569010.03708670.7075340.1199380.7546870.1130220.1531860.5717470.6648750.8847980.6862040.3364960.7599560.03500650.2383420.8299810.7782960.4001110.4603720.1221220.01003030.5143880.1576440.4397950.431170.1695050.4572440.5571650.8015130.702880.0819180.216220.2025250.9733810.6303590.6259550.8689390.05460230.3869190.1118630.4448760.01054330.7689420.946460.2666360.3342590.5335290.3424960.2980160.7689560.2070630.5874750.9017360.3374620.08861050.7542980.5959190.9558530.548087
9980.8719440.8211830.7256360.5216450.2307730.5067470.472830.9025590.8060890.7105610.3437070.2912330.9486280.5250340.4086630.7846390.2079990.9168740.9736970.5945860.1208180.5916780.5550850.2690470.1493410.8366790.1746230.06532850.2475790.8002290.8053760.1417250.700890.4842470.5427840.1333760.7064610.1394030.3110230.2521980.1105570.1394120.4025850.3133340.4855710.9234770.2322770.182730.2163540.345980.7167190.897750.859790.4690570.3530880.06580650.6306430.1946920.9455280.2632280.2274770.8987120.5738410.8347610.6789920.06985540.6067780.06647350.8709480.05660450.2404530.1189740.8149290.6289050.8349590.3416870.3038890.2020030.6394110.6972890.004219120.7997440.4888370.3356560.09531150.690990.9289790.6303340.2252440.9771070.1088070.6317750.04129760.6216930.4018180.5993010.2983240.4275010.01492240.466253
9990.7020690.3995820.3773310.5427350.386310.5553320.7428330.7876030.7282990.02052240.03922080.591810.1349890.05771180.2379580.664760.9412830.8017460.62680.8859240.987530.5556810.02252420.8672170.2133950.8444510.455560.4049910.5480410.2449420.3783650.6912790.4789790.9256380.5831010.1400490.7728560.5615870.003062170.5498840.3423370.5883160.2761320.5897430.2202190.5450510.639010.01123040.7899220.6199180.8881620.8885220.08715370.1708920.2576390.05142650.02851410.4959230.06451040.03699860.7375040.3521190.2098410.5400920.5922860.0273140.0921390.6007560.1648850.4090080.9472330.1778760.6433990.7420340.2695450.4648760.3382760.4649540.6132520.5616790.6126960.3137840.76750.9010550.6961080.01053660.2665480.004478120.5026270.4541810.2493650.3614950.8651660.5463430.9412370.8593120.4090520.326610.4009720.768701
10000.707030.1662080.458230.06822350.4584220.8369390.582080.7215860.1714530.3538990.9225170.3941510.6610910.3813410.5586790.8476870.6717010.3732120.1962670.3533110.8136220.1920310.9116450.4524650.6047910.7430380.7761480.003481420.3321030.9038270.887450.8918420.2034280.2590770.2434650.8774120.6144330.7669430.228140.3024350.8493760.1945020.9039990.03632420.5362680.8867660.4914610.873150.01482710.2570420.2105170.5701150.6844810.4660460.3676770.6923570.9224890.03157430.9649740.5232790.3380240.5096740.1633160.3624070.3012720.8795590.4992130.6318020.3316580.3704990.4503810.6048310.1259160.9344340.3061630.2071420.4710490.06688670.02325460.2122720.2679540.3987190.6959080.8505570.08759970.4217320.3309670.7848160.01308590.3411610.3962920.4127850.3027640.8499040.1752160.6870090.9092960.6925190.2444170.0393655
" ], "text/latex": [ "\\begin{tabular}{r|ccccccccc}\n", "\t& x1 & x2 & x3 & x4 & x5 & x6 & x7 & x8 & \\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & \\\\\n", "\t\\hline\n", "\t1 & 0.0149088 & 0.177963 & 0.374975 & 0.694063 & 0.290585 & 0.296972 & 0.151378 & 0.0895086 & $\\dots$ \\\\\n", "\t2 & 0.520355 & 0.670122 & 0.387857 & 0.724383 & 0.0800228 & 0.248548 & 0.911363 & 0.36889 & $\\dots$ \\\\\n", "\t3 & 0.639562 & 0.0423618 & 0.779594 & 0.130453 & 0.173812 & 0.525416 & 0.324965 & 0.905649 & $\\dots$ \\\\\n", "\t4 & 0.839622 & 0.740699 & 0.360969 & 0.456934 & 0.398093 & 0.509447 & 0.335432 & 0.373952 & $\\dots$ \\\\\n", "\t5 & 0.967143 & 0.302672 & 0.706902 & 0.0653216 & 0.644417 & 0.976457 & 0.36895 & 0.80553 & $\\dots$ \\\\\n", "\t6 & 0.205168 & 0.683128 & 0.786909 & 0.819635 & 0.759663 & 0.588019 & 0.244011 & 0.888136 & $\\dots$ \\\\\n", "\t7 & 0.527184 & 0.22961 & 0.395737 & 0.712888 & 0.286281 & 0.0784137 & 0.214371 & 0.693681 & $\\dots$ \\\\\n", "\t8 & 0.951162 & 0.963781 & 0.176482 & 0.751149 & 0.394366 & 0.945845 & 0.62704 & 0.96774 & $\\dots$ \\\\\n", "\t9 & 0.536369 & 0.118179 & 0.958758 & 0.161565 & 0.722348 & 0.35534 & 0.734991 & 0.859331 & $\\dots$ \\\\\n", "\t10 & 0.711389 & 0.661071 & 0.450584 & 0.791275 & 0.0225357 & 0.582416 & 0.832604 & 0.0454298 & $\\dots$ \\\\\n", "\t11 & 0.103929 & 0.489357 & 0.943654 & 0.981498 & 0.906475 & 0.27605 & 0.319854 & 0.505438 & $\\dots$ \\\\\n", "\t12 & 0.806704 & 0.597994 & 0.293338 & 0.71464 & 0.821485 & 0.421753 & 0.578613 & 0.274216 & $\\dots$ \\\\\n", "\t13 & 0.870539 & 0.0866983 & 0.720978 & 0.957936 & 0.410978 & 0.132653 & 0.665372 & 0.593325 & $\\dots$ \\\\\n", "\t14 & 0.790207 & 0.957745 & 0.144518 & 0.988098 & 0.558975 & 0.855644 & 0.890067 & 0.0355403 & $\\dots$ \\\\\n", "\t15 & 0.800393 & 0.155563 & 0.21758 & 0.898822 & 0.751869 & 0.585648 & 0.993167 & 0.927108 & $\\dots$ \\\\\n", "\t16 & 0.898931 & 0.729694 & 0.279658 & 0.205926 & 0.596491 & 0.695755 & 0.695853 & 0.175926 & $\\dots$ \\\\\n", "\t17 & 0.939548 & 0.622013 & 0.32406 & 0.971788 & 0.778883 & 0.112 & 0.392893 & 0.968717 & $\\dots$ \\\\\n", "\t18 & 0.526344 & 0.615295 & 0.222202 & 0.302713 & 0.89399 & 0.0705911 & 0.0447971 & 0.529437 & $\\dots$ \\\\\n", "\t19 & 0.0779683 & 0.349777 & 0.168805 & 0.698644 & 0.131061 & 0.391454 & 0.147786 & 0.664002 & $\\dots$ \\\\\n", "\t20 & 0.966197 & 0.918847 & 0.445359 & 0.986339 & 0.364943 & 0.728386 & 0.850772 & 0.740342 & $\\dots$ \\\\\n", "\t21 & 0.666558 & 0.785516 & 0.65396 & 0.566016 & 0.790067 & 0.950925 & 0.759014 & 0.750961 & $\\dots$ \\\\\n", "\t22 & 0.230104 & 0.303686 & 0.396746 & 0.41203 & 0.836363 & 0.359704 & 0.573183 & 0.631113 & $\\dots$ \\\\\n", "\t23 & 0.0585193 & 0.982141 & 0.504897 & 0.0288279 & 0.99351 & 0.521489 & 0.782399 & 0.14326 & $\\dots$ \\\\\n", "\t24 & 0.298878 & 0.197828 & 0.921214 & 0.372391 & 0.531751 & 0.160731 & 0.310757 & 0.763507 & $\\dots$ \\\\\n", "\t25 & 0.380799 & 0.314726 & 0.574718 & 0.594419 & 0.436391 & 0.109001 & 0.824338 & 0.648367 & $\\dots$ \\\\\n", "\t26 & 0.0687591 & 0.930626 & 0.762947 & 0.339501 & 0.350716 & 0.641212 & 0.0405993 & 0.365176 & $\\dots$ \\\\\n", "\t27 & 0.861725 & 0.997935 & 0.00707363 & 0.702614 & 0.289486 & 0.699093 & 0.503938 & 0.53228 & $\\dots$ \\\\\n", "\t28 & 0.00543656 & 0.67684 & 0.979526 & 0.547557 & 0.797773 & 0.426561 & 0.766105 & 0.552505 & $\\dots$ \\\\\n", "\t29 & 0.698478 & 0.643354 & 0.545089 & 0.647449 & 0.62582 & 0.925163 & 0.0579145 & 0.241449 & $\\dots$ \\\\\n", "\t30 & 0.637793 & 0.112717 & 0.283882 & 0.359841 & 0.566783 & 0.447965 & 0.0586073 & 0.547599 & $\\dots$ \\\\\n", "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m1000×10000 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m x1 \u001b[0m\u001b[1m x2 \u001b[0m\u001b[1m x3 \u001b[0m\u001b[1m x4 \u001b[0m\u001b[1m x5 \u001b[0m\u001b[1m x6 \u001b[0m\u001b[1m x7 \u001b[0m ⋯\n", " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Floa\u001b[0m ⋯\n", "──────┼─────────────────────────────────────────────────────────────────────────\n", " 1 │ 0.0149088 0.177963 0.374975 0.694063 0.290585 0.296972 0.15 ⋯\n", " 2 │ 0.520355 0.670122 0.387857 0.724383 0.0800228 0.248548 0.91\n", " 3 │ 0.639562 0.0423618 0.779594 0.130453 0.173812 0.525416 0.32\n", " 4 │ 0.839622 0.740699 0.360969 0.456934 0.398093 0.509447 0.33\n", " 5 │ 0.967143 0.302672 0.706902 0.0653216 0.644417 0.976457 0.36 ⋯\n", " 6 │ 0.205168 0.683128 0.786909 0.819635 0.759663 0.588019 0.24\n", " 7 │ 0.527184 0.22961 0.395737 0.712888 0.286281 0.0784137 0.21\n", " 8 │ 0.951162 0.963781 0.176482 0.751149 0.394366 0.945845 0.62\n", " 9 │ 0.536369 0.118179 0.958758 0.161565 0.722348 0.35534 0.73 ⋯\n", " 10 │ 0.711389 0.661071 0.450584 0.791275 0.0225357 0.582416 0.83\n", " 11 │ 0.103929 0.489357 0.943654 0.981498 0.906475 0.27605 0.31\n", " ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱\n", " 991 │ 0.318699 0.855621 0.89789 0.186005 0.757165 0.537527 0.52\n", " 992 │ 0.385526 0.0811559 0.479614 0.163361 0.0102084 0.649546 0.65 ⋯\n", " 993 │ 0.678458 0.557108 0.202757 0.493311 0.343117 0.922908 0.26\n", " 994 │ 0.424295 0.78012 0.577681 0.274228 0.225563 0.29553 0.39\n", " 995 │ 0.481554 0.0441882 0.700181 0.317677 0.0846761 0.949135 0.45\n", " 996 │ 0.401528 0.658903 0.438309 0.211217 0.211515 0.441902 0.02 ⋯\n", " 997 │ 0.58985 0.909857 0.354735 0.442829 0.438322 0.847835 0.84\n", " 998 │ 0.871944 0.821183 0.725636 0.521645 0.230773 0.506747 0.47\n", " 999 │ 0.702069 0.399582 0.377331 0.542735 0.38631 0.555332 0.74\n", " 1000 │ 0.70703 0.166208 0.45823 0.0682235 0.458422 0.836939 0.58 ⋯\n", "\u001b[36m 9994 columns and 979 rows omitted\u001b[0m" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "large_df = DataFrame(rand(1000, 10000), :auto)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:47.471000Z", "iopub.status.busy": "2024-06-04T16:21:47.471000Z", "iopub.status.idle": "2024-06-04T16:21:47.549000Z", "shell.execute_reply": "2024-06-04T16:21:47.549000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.026790 seconds (29.08 k allocations: 1.725 MiB, 76.37% compilation time)\n" ] }, { "data": { "text/html": [ "
1000×1 DataFrame
975 rows omitted
Rowsum
Float64
15010.73
25055.37
34987.2
45016.2
54978.86
65018.0
75011.07
84991.09
94932.78
105017.35
114997.14
124994.29
135020.93
9894975.49
9904959.65
9914997.01
9925003.09
9934984.85
9944933.86
9954952.49
9965015.75
9975013.19
9984976.72
9995015.42
10005054.93
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& sum\\\\\n", "\t\\hline\n", "\t& Float64\\\\\n", "\t\\hline\n", "\t1 & 5010.73 \\\\\n", "\t2 & 5055.37 \\\\\n", "\t3 & 4987.2 \\\\\n", "\t4 & 5016.2 \\\\\n", "\t5 & 4978.86 \\\\\n", "\t6 & 5018.0 \\\\\n", "\t7 & 5011.07 \\\\\n", "\t8 & 4991.09 \\\\\n", "\t9 & 4932.78 \\\\\n", "\t10 & 5017.35 \\\\\n", "\t11 & 4997.14 \\\\\n", "\t12 & 4994.29 \\\\\n", "\t13 & 5020.93 \\\\\n", "\t14 & 4917.99 \\\\\n", "\t15 & 5023.11 \\\\\n", "\t16 & 5007.8 \\\\\n", "\t17 & 4982.31 \\\\\n", "\t18 & 5010.66 \\\\\n", "\t19 & 4962.08 \\\\\n", "\t20 & 5013.45 \\\\\n", "\t21 & 5008.7 \\\\\n", "\t22 & 5050.03 \\\\\n", "\t23 & 5009.0 \\\\\n", "\t24 & 5016.84 \\\\\n", "\t25 & 5015.97 \\\\\n", "\t26 & 5009.97 \\\\\n", "\t27 & 4966.94 \\\\\n", "\t28 & 4961.31 \\\\\n", "\t29 & 4992.18 \\\\\n", "\t30 & 5037.17 \\\\\n", "\t$\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m1000×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m sum \u001b[0m\n", " │\u001b[90m Float64 \u001b[0m\n", "──────┼─────────\n", " 1 │ 5010.73\n", " 2 │ 5055.37\n", " 3 │ 4987.2\n", " 4 │ 5016.2\n", " 5 │ 4978.86\n", " 6 │ 5018.0\n", " 7 │ 5011.07\n", " 8 │ 4991.09\n", " 9 │ 4932.78\n", " 10 │ 5017.35\n", " 11 │ 4997.14\n", " ⋮ │ ⋮\n", " 991 │ 4997.01\n", " 992 │ 5003.09\n", " 993 │ 4984.85\n", " 994 │ 4933.86\n", " 995 │ 4952.49\n", " 996 │ 5015.75\n", " 997 │ 5013.19\n", " 998 │ 4976.72\n", " 999 │ 5015.42\n", " 1000 │ 5054.93\n", "\u001b[36m979 rows omitted\u001b[0m" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@time select(large_df, AsTable(:) => ByRow(sum) => :sum)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "execution": { "iopub.execute_input": "2024-06-04T16:21:47.551000Z", "iopub.status.busy": "2024-06-04T16:21:47.551000Z", "iopub.status.idle": "2024-06-04T16:21:47.576000Z", "shell.execute_reply": "2024-06-04T16:21:47.576000Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.006353 seconds (19.63 k allocations: 1.071 MiB)\n" ] }, { "data": { "text/html": [ "
1000×1 DataFrame
975 rows omitted
Rowsum
Float64
15010.73
25055.37
34987.2
45016.2
54978.86
65018.0
75011.07
84991.09
94932.78
105017.35
114997.14
124994.29
135020.93
9894975.49
9904959.65
9914997.01
9925003.09
9934984.85
9944933.86
9954952.49
9965015.75
9975013.19
9984976.72
9995015.42
10005054.93
" ], "text/latex": [ "\\begin{tabular}{r|c}\n", "\t& sum\\\\\n", "\t\\hline\n", "\t& Float64\\\\\n", "\t\\hline\n", "\t1 & 5010.73 \\\\\n", "\t2 & 5055.37 \\\\\n", "\t3 & 4987.2 \\\\\n", "\t4 & 5016.2 \\\\\n", "\t5 & 4978.86 \\\\\n", "\t6 & 5018.0 \\\\\n", "\t7 & 5011.07 \\\\\n", "\t8 & 4991.09 \\\\\n", "\t9 & 4932.78 \\\\\n", "\t10 & 5017.35 \\\\\n", "\t11 & 4997.14 \\\\\n", "\t12 & 4994.29 \\\\\n", "\t13 & 5020.93 \\\\\n", "\t14 & 4917.99 \\\\\n", "\t15 & 5023.11 \\\\\n", "\t16 & 5007.8 \\\\\n", "\t17 & 4982.31 \\\\\n", "\t18 & 5010.66 \\\\\n", "\t19 & 4962.08 \\\\\n", "\t20 & 5013.45 \\\\\n", "\t21 & 5008.7 \\\\\n", "\t22 & 5050.03 \\\\\n", "\t23 & 5009.0 \\\\\n", "\t24 & 5016.84 \\\\\n", "\t25 & 5015.97 \\\\\n", "\t26 & 5009.97 \\\\\n", "\t27 & 4966.94 \\\\\n", "\t28 & 4961.31 \\\\\n", "\t29 & 4992.18 \\\\\n", "\t30 & 5037.17 \\\\\n", "\t$\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m1000×1 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m sum \u001b[0m\n", " │\u001b[90m Float64 \u001b[0m\n", "──────┼─────────\n", " 1 │ 5010.73\n", " 2 │ 5055.37\n", " 3 │ 4987.2\n", " 4 │ 5016.2\n", " 5 │ 4978.86\n", " 6 │ 5018.0\n", " 7 │ 5011.07\n", " 8 │ 4991.09\n", " 9 │ 4932.78\n", " 10 │ 5017.35\n", " 11 │ 4997.14\n", " ⋮ │ ⋮\n", " 991 │ 4997.01\n", " 992 │ 5003.09\n", " 993 │ 4984.85\n", " 994 │ 4933.86\n", " 995 │ 4952.49\n", " 996 │ 5015.75\n", " 997 │ 5013.19\n", " 998 │ 4976.72\n", " 999 │ 5015.42\n", " 1000 │ 5054.93\n", "\u001b[36m979 rows omitted\u001b[0m" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@time select(large_df, AsTable(:) => ByRow(sum) => :sum)" ] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.10.1", "language": "julia", "name": "julia-1.10" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.10.3" } }, "nbformat": 4, "nbformat_minor": 3 }