{ "cells": [ { "cell_type": "markdown", "id": "d2619392", "metadata": {}, "source": [ "# Custom columns\n", "\n", "## Why do we care?\n", "\n", "Specifying custom columns correctly in `BipartitePandas` is very important. This ensures custom columns interact with classes and methods properly - otherwise, conversions between types are likely to drop these columns, and method calls may not apply to these columns, they may apply incorrectly, or they may raise errors.\n", "\n", "## Import the BipartitePandas package\n", "\n", "Make sure to install it using `pip install bipartitepandas`." ] }, { "cell_type": "code", "execution_count": 1, "id": "ae754f39", "metadata": {}, "outputs": [], "source": [ "import bipartitepandas as bpd" ] }, { "cell_type": "markdown", "id": "e8fd81cc", "metadata": {}, "source": [ "## Get your data ready\n", "\n", "For this notebook, we simulate data." ] }, { "cell_type": "code", "execution_count": 2, "id": "a1c43af9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ijytlkalphapsi
0036-1.5691650210.000000-0.908458
10172.4423241200.000000-1.335178
2017-1.3075512200.000000-1.335178
3017-1.5513543200.000000-1.335178
4013-0.7896614200.000000-1.335178
...........................
499959999102-1.493225015-0.4307270.114185
4999699991162.368321115-0.4307270.114185
49997999976-2.070787213-0.430727-0.348756
49998999923-1.203733311-0.430727-0.908458
499999999230.132797411-0.430727-0.908458
\n", "

50000 rows × 8 columns

\n", "
" ], "text/plain": [ " i j y t l k alpha psi\n", "0 0 36 -1.569165 0 2 1 0.000000 -0.908458\n", "1 0 17 2.442324 1 2 0 0.000000 -1.335178\n", "2 0 17 -1.307551 2 2 0 0.000000 -1.335178\n", "3 0 17 -1.551354 3 2 0 0.000000 -1.335178\n", "4 0 13 -0.789661 4 2 0 0.000000 -1.335178\n", "... ... ... ... .. .. .. ... ...\n", "49995 9999 102 -1.493225 0 1 5 -0.430727 0.114185\n", "49996 9999 116 2.368321 1 1 5 -0.430727 0.114185\n", "49997 9999 76 -2.070787 2 1 3 -0.430727 -0.348756\n", "49998 9999 23 -1.203733 3 1 1 -0.430727 -0.908458\n", "49999 9999 23 0.132797 4 1 1 -0.430727 -0.908458\n", "\n", "[50000 rows x 8 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df = bpd.SimBipartite().simulate()\n", "display(df)" ] }, { "cell_type": "markdown", "id": "4a48b656", "metadata": {}, "source": [ "## Columns\n", "\n", "BipartitePandas includes seven pre-defined general columns:\n", "\n", "#### Required\n", "- `i`: worker id (any type)\n", "- `j`: firm id (any type)\n", "- `y`: income (float or int)\n", "\n", "#### Optional\n", "- `t`: time (int)\n", "- `g`: firm type (any type)\n", "- `w`: weight (float or int)\n", "- `m`: move indicator (int)\n", "\n", "## Formats\n", "\n", "BipartitePandas includes four formats:\n", "\n", "- *Long* - each row gives a single observation\n", "- *Collapsed Long* - like *Long*, but employment spells at the same firm are collapsed into a single observation\n", "- *Event Study* - each row gives two consecutive observations\n", "- *Collapsed Event Study* - like *Event Study*, but employment spells at the same firm are collapsed into a single observation\n", "\n", "These formats divide general columns differently:\n", "\n", "- *Long* - `i`, `j`, `y`, `t`, `g`, `w`, `m`\n", "- *Collapsed Long* - `i`, `j`, `y`, `t1`, `t2`, `g`, `w`, `m`\n", "- *Event Study* - `i`, `j1`, `j2`, `y1`, `y2`, `t1`, `t2`, `g1`, `g2`, `w1`, `w2`, `m`\n", "- *Collapsed Event Study* - `i`, `j1`, `j2`, `y1`, `y2`, `t11`, `t12`, `t21`, `t22`, `g1`, `g2`, `w1`, `w2`, `m`\n", "\n", "## Constructing DataFrames\n", "\n", "Our simulated data is in *Long* format, but includes columns that aren't pre-defined. How do we construct a *Long* dataframe that includes these columns?" ] }, { "cell_type": "code", "execution_count": 3, "id": "ad87ff5d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ijytalphaklpsi
0036-1.56916500.00000012-0.908458
10172.44232410.00000002-1.335178
2017-1.30755120.00000002-1.335178
3017-1.55135430.00000002-1.335178
4013-0.78966140.00000002-1.335178
...........................
499959999102-1.4932250-0.430727510.114185
4999699991162.3683211-0.430727510.114185
49997999976-2.0707872-0.43072731-0.348756
49998999923-1.2037333-0.43072711-0.908458
499999999230.1327974-0.43072711-0.908458
\n", "

50000 rows × 8 columns

\n", "
" ], "text/plain": [ " i j y t alpha k l psi\n", "0 0 36 -1.569165 0 0.000000 1 2 -0.908458\n", "1 0 17 2.442324 1 0.000000 0 2 -1.335178\n", "2 0 17 -1.307551 2 0.000000 0 2 -1.335178\n", "3 0 17 -1.551354 3 0.000000 0 2 -1.335178\n", "4 0 13 -0.789661 4 0.000000 0 2 -1.335178\n", "... ... ... ... .. ... .. .. ...\n", "49995 9999 102 -1.493225 0 -0.430727 5 1 0.114185\n", "49996 9999 116 2.368321 1 -0.430727 5 1 0.114185\n", "49997 9999 76 -2.070787 2 -0.430727 3 1 -0.348756\n", "49998 9999 23 -1.203733 3 -0.430727 1 1 -0.908458\n", "49999 9999 23 0.132797 4 -0.430727 1 1 -0.908458\n", "\n", "[50000 rows x 8 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bdf_long = bpd.BipartiteDataFrame(\n", " i=df['i'], j=df['j'], y=df['y'], t=df['t'],\n", " l=df['l'], k=df['k'], alpha=df['alpha'], psi=df['psi']\n", ")\n", "display(bdf_long)" ] }, { "cell_type": "markdown", "id": "9f7acd63", "metadata": {}, "source": [ "Are we sure this is long? Let's check the datatype:" ] }, { "cell_type": "code", "execution_count": 4, "id": "11ce1774", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bipartitepandas.bipartitelong.BipartiteLong" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(bdf_long)" ] }, { "cell_type": "markdown", "id": "d7138c87", "metadata": {}, "source": [ "## Categorical columns\n", "\n", "What if we want to specify a column should be categorical? Then we should specify `custom_categorical_dict`!\n", "\n", "
\n", "\n", "Note\n", "\n", "`alpha` is float, and BipartiteDataFrame automatically sets floats to collapse by `mean`. Categorical columns cannot be collapsed by mean, so if we mark `alpha` as categorical, we must also specify that it should collapse by `first` (`last` or `None` also work). In addition, categorical columns must use the datatype `'categorical'`.\n", "\n", "
" ] }, { "cell_type": "code", "execution_count": 5, "id": "42735a65", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "checking required columns and datatypes\n", "sorting rows\n", "dropping NaN observations\n", "generating 'm' column\n", "keeping highest paying job for i-t (worker-year) duplicates (how='max')\n", "dropping workers who leave a firm then return to it (how=False)\n", "making 'i' ids contiguous\n", "making 'j' ids contiguous\n", "making 'alpha' ids contiguous\n", "computing largest connected set (how=None)\n", "sorting columns\n", "resetting index\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ijytmalphaklpsi
0036-1.56916501012-0.908458
10172.44232411002-1.335178
2017-1.30755120002-1.335178
3017-1.55135431002-1.335178
4013-0.78966141002-1.335178
..............................
499959999102-1.493225011510.114185
4999699991162.368321121510.114185
49997999976-2.07078722131-0.348756
49998999923-1.20373331111-0.908458
499999999230.13279740111-0.908458
\n", "

50000 rows × 9 columns

\n", "
" ], "text/plain": [ " i j y t m alpha k l psi\n", "0 0 36 -1.569165 0 1 0 1 2 -0.908458\n", "1 0 17 2.442324 1 1 0 0 2 -1.335178\n", "2 0 17 -1.307551 2 0 0 0 2 -1.335178\n", "3 0 17 -1.551354 3 1 0 0 2 -1.335178\n", "4 0 13 -0.789661 4 1 0 0 2 -1.335178\n", "... ... ... ... .. .. ... .. .. ...\n", "49995 9999 102 -1.493225 0 1 1 5 1 0.114185\n", "49996 9999 116 2.368321 1 2 1 5 1 0.114185\n", "49997 9999 76 -2.070787 2 2 1 3 1 -0.348756\n", "49998 9999 23 -1.203733 3 1 1 1 1 -0.908458\n", "49999 9999 23 0.132797 4 0 1 1 1 -0.908458\n", "\n", "[50000 rows x 9 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bdf_long = bpd.BipartiteDataFrame(\n", " i=df['i'], j=df['j'], y=df['y'], t=df['t'],\n", " l=df['l'], k=df['k'], alpha=df['alpha'], psi=df['psi'],\n", " custom_categorical_dict={'alpha': True},\n", " custom_dtype_dict={'alpha': 'categorical'},\n", " custom_how_collapse_dict={'alpha': 'first'}\n", ").clean()\n", "display(bdf_long)" ] }, { "cell_type": "markdown", "id": "6e9bb247", "metadata": {}, "source": [ "## Collapsing data\n", "\n", "What if instead of collapsing by the `mean`, we want a column to collapse by `first`, or even to drop when we collapse? Then we should specify `custom_how_collapse_dict`!" ] }, { "cell_type": "code", "execution_count": 6, "id": "4ee33217", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "checking required columns and datatypes\n", "sorting rows\n", "dropping NaN observations\n", "generating 'm' column\n", "keeping highest paying job for i-t (worker-year) duplicates (how='max')\n", "dropping workers who leave a firm then return to it (how=False)\n", "making 'i' ids contiguous\n", "making 'j' ids contiguous\n", "computing largest connected set (how=None)\n", "sorting columns\n", "resetting index\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ijyt1t2wmklpsi
0036-1.56916500111.02.0-0.908458
1017-0.13886113320.02.0-1.335178
2013-0.78966144110.02.0-1.335178
3152-0.65321800112.01.0-0.604585
41490.67686112222.01.0-0.604585
.................................
29820999852-2.61845144112.00.0-0.604585
298219999102-1.49322500115.01.00.114185
2982299991162.36832111125.01.00.114185
29823999976-2.07078722123.01.0-0.348756
29824999923-0.53546834211.01.0-0.908458
\n", "

29825 rows × 10 columns

\n", "
" ], "text/plain": [ " i j y t1 t2 w m k l psi\n", "0 0 36 -1.569165 0 0 1 1 1.0 2.0 -0.908458\n", "1 0 17 -0.138861 1 3 3 2 0.0 2.0 -1.335178\n", "2 0 13 -0.789661 4 4 1 1 0.0 2.0 -1.335178\n", "3 1 52 -0.653218 0 0 1 1 2.0 1.0 -0.604585\n", "4 1 49 0.676861 1 2 2 2 2.0 1.0 -0.604585\n", "... ... ... ... .. .. .. .. ... ... ...\n", "29820 9998 52 -2.618451 4 4 1 1 2.0 0.0 -0.604585\n", "29821 9999 102 -1.493225 0 0 1 1 5.0 1.0 0.114185\n", "29822 9999 116 2.368321 1 1 1 2 5.0 1.0 0.114185\n", "29823 9999 76 -2.070787 2 2 1 2 3.0 1.0 -0.348756\n", "29824 9999 23 -0.535468 3 4 2 1 1.0 1.0 -0.908458\n", "\n", "[29825 rows x 10 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bdf_long = bpd.BipartiteDataFrame(\n", " i=df['i'], j=df['j'], y=df['y'], t=df['t'],\n", " l=df['l'], k=df['k'], alpha=df['alpha'], psi=df['psi'],\n", " custom_how_collapse_dict={'alpha': None, 'psi': 'first'}\n", ").clean().collapse()\n", "display(bdf_long)" ] }, { "cell_type": "markdown", "id": "a4fa2198", "metadata": {}, "source": [ "
\n", "\n", "Warning\n", "\n", "Collapsing by `first`, `last`, `mean`, and `sum` will uncollapse correctly (although information may be lost); any other option (e.g. `var` or `std`) is not guaranteed to uncollapse correctly.\n", "\n", "
\n", "\n", "## Converting between (collapsed) long and (collapsed) event study formats\n", "\n", "What if we don't want a column to split when converting to event study, or if we want it to drop during the conversion? Then we should specify `custom_long_es_split_dict`!" ] }, { "cell_type": "code", "execution_count": 7, "id": "1c35ab24", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "checking required columns and datatypes\n", "sorting rows\n", "dropping NaN observations\n", "generating 'm' column\n", "keeping highest paying job for i-t (worker-year) duplicates (how='max')\n", "dropping workers who leave a firm then return to it (how=False)\n", "making 'i' ids contiguous\n", "making 'j' ids contiguous\n", "computing largest connected set (how=None)\n", "sorting columns\n", "resetting index\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ij1j2y1y2t1t2malphak1k2l1l2
003617-1.5691652.4423240110.0000001022
1017172.442324-1.3075511200.0000000022
201717-1.307551-1.5513542300.0000000022
301713-1.551354-0.7896613410.0000000022
415249-0.6532181.597527011-0.4307272211
..........................................
406579998252-2.069815-2.618451341-0.9674220200
406589999102116-1.4932252.368321011-0.4307275511
406599999116762.368321-2.070787121-0.4307275311
4066099997623-2.070787-1.203733231-0.4307273111
4066199992323-1.2037330.132797340-0.4307271111
\n", "

40662 rows × 13 columns

\n", "
" ], "text/plain": [ " i j1 j2 y1 y2 t1 t2 m alpha k1 k2 l1 l2\n", "0 0 36 17 -1.569165 2.442324 0 1 1 0.000000 1 0 2 2\n", "1 0 17 17 2.442324 -1.307551 1 2 0 0.000000 0 0 2 2\n", "2 0 17 17 -1.307551 -1.551354 2 3 0 0.000000 0 0 2 2\n", "3 0 17 13 -1.551354 -0.789661 3 4 1 0.000000 0 0 2 2\n", "4 1 52 49 -0.653218 1.597527 0 1 1 -0.430727 2 2 1 1\n", "... ... ... ... ... ... .. .. .. ... .. .. .. ..\n", "40657 9998 2 52 -2.069815 -2.618451 3 4 1 -0.967422 0 2 0 0\n", "40658 9999 102 116 -1.493225 2.368321 0 1 1 -0.430727 5 5 1 1\n", "40659 9999 116 76 2.368321 -2.070787 1 2 1 -0.430727 5 3 1 1\n", "40660 9999 76 23 -2.070787 -1.203733 2 3 1 -0.430727 3 1 1 1\n", "40661 9999 23 23 -1.203733 0.132797 3 4 0 -0.430727 1 1 1 1\n", "\n", "[40662 rows x 13 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bdf_long = bpd.BipartiteDataFrame(\n", " i=df['i'], j=df['j'], y=df['y'], t=df['t'],\n", " l=df['l'], k=df['k'], alpha=df['alpha'], psi=df['psi'],\n", " custom_long_es_split_dict={'alpha': False, 'psi': None}\n", ").clean().to_eventstudy()\n", "display(bdf_long)" ] }, { "cell_type": "markdown", "id": "de9f6ab8", "metadata": {}, "source": [ "## Adding custom columns to an instantiated DataFrame\n", "\n", "Use the method `.add_column()` to add custom columns to a DataFrame that has already been instantiated.\n", "\n", "### Proper usage" ] }, { "cell_type": "code", "execution_count": 8, "id": "8ebc6478", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "checking required columns and datatypes\n", "sorting rows\n", "dropping NaN observations\n", "generating 'm' column\n", "keeping highest paying job for i-t (worker-year) duplicates (how='max')\n", "dropping workers who leave a firm then return to it (how=False)\n", "making 'i' ids contiguous\n", "making 'j' ids contiguous\n", "making 'alpha' ids contiguous\n", "making 'psi' ids contiguous\n", "computing largest connected set (how=None)\n", "sorting columns\n", "resetting index\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ijytmalphaklpsi
0036-1.569165010120
10172.442324110021
2017-1.307551200021
3017-1.551354310021
4013-0.789661410021
..............................
499959999102-1.493225011514
4999699991162.368321121514
49997999976-2.070787221316
49998999923-1.203733311110
499999999230.132797401110
\n", "

50000 rows × 9 columns

\n", "
" ], "text/plain": [ " i j y t m alpha k l psi\n", "0 0 36 -1.569165 0 1 0 1 2 0\n", "1 0 17 2.442324 1 1 0 0 2 1\n", "2 0 17 -1.307551 2 0 0 0 2 1\n", "3 0 17 -1.551354 3 1 0 0 2 1\n", "4 0 13 -0.789661 4 1 0 0 2 1\n", "... ... ... ... .. .. ... .. .. ...\n", "49995 9999 102 -1.493225 0 1 1 5 1 4\n", "49996 9999 116 2.368321 1 2 1 5 1 4\n", "49997 9999 76 -2.070787 2 2 1 3 1 6\n", "49998 9999 23 -1.203733 3 1 1 1 1 0\n", "49999 9999 23 0.132797 4 0 1 1 1 0\n", "\n", "[50000 rows x 9 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bdf_long = bpd.BipartiteDataFrame(\n", " i=df['i'], j=df['j'], y=df['y'], t=df['t'],\n", " l=df['l'], k=df['k']\n", ")\n", "bdf_long = bdf_long.add_column('alpha', df['alpha'], is_categorical=True, dtype='categorical')\n", "bdf_long = bdf_long.add_column('psi', df['psi'], is_categorical=True, dtype='categorical')\n", "bdf_long = bdf_long.clean()\n", "display(bdf_long)" ] }, { "cell_type": "markdown", "id": "a1e51e19", "metadata": {}, "source": [ "### Improper usage\n", "\n", "Here, we see what happens if we add custom columns incorrectly. In this example, we see that cleaning will raise an error. If instead, we try to bypass data cleaning and immediately convert between data formats, the custom columns will be dropped during the conversion." ] }, { "cell_type": "code", "execution_count": 9, "id": "906e5184", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "checking required columns and datatypes\n" ] }, { "ename": "ValueError", "evalue": "alpha is included in the dataframe but is not saved in .col_reference_dict. Please initialize your BipartiteBase object to include this column by setting 'col_reference_dict=your_col_reference_dict'.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Input \u001b[0;32mIn [9]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m bdf_long[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124malpha\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124malpha\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 6\u001b[0m bdf_long[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpsi\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpsi\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m----> 7\u001b[0m bdf_long \u001b[38;5;241m=\u001b[39m bdf_long\u001b[38;5;241m.\u001b[39mclean()\n", "File \u001b[0;32m~/opt/anaconda3/envs/stata-env/lib/python3.9/site-packages/bipartitepandas/bipartitelong.py:129\u001b[0m, in \u001b[0;36mBipartiteLong.clean\u001b[0;34m(self, params)\u001b[0m\n\u001b[1;32m 126\u001b[0m params[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mconnectedness\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;66;03m## Initial cleaning ##\u001b[39;00m\n\u001b[0;32m--> 129\u001b[0m frame \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclean\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m collapse:\n\u001b[1;32m 132\u001b[0m \u001b[38;5;66;03m## Collapse then compute largest connected set 
##\u001b[39;00m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;66;03m# Update parameters\u001b[39;00m\n\u001b[1;32m 134\u001b[0m level_dict \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 135\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mleave_out_spell\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mspell\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 136\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mleave_out_match\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmatch\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 137\u001b[0m }\n", "File \u001b[0;32m~/opt/anaconda3/envs/stata-env/lib/python3.9/site-packages/bipartitepandas/bipartitelongbase.py:104\u001b[0m, in \u001b[0;36mBipartiteLongBase.clean\u001b[0;34m(self, params)\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verbose:\n\u001b[1;32m 103\u001b[0m tqdm\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mchecking required columns and datatypes\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 104\u001b[0m \u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_cols\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;66;03m# Next, sort rows\u001b[39;00m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msorting rows\u001b[39m\u001b[38;5;124m'\u001b[39m, level\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minfo\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", "File \u001b[0;32m~/opt/anaconda3/envs/stata-env/lib/python3.9/site-packages/bipartitepandas/bipartitebase.py:1196\u001b[0m, in \u001b[0;36mBipartiteBase._check_cols\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1194\u001b[0m error_msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcol\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is included in the dataframe but is not saved in .col_reference_dict. Please initialize your BipartiteBase object to include this column by setting \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcol_reference_dict=your_col_reference_dict\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1195\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog(error_msg, level\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minfo\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m-> 1196\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(error_msg)\n", "\u001b[0;31mValueError\u001b[0m: alpha is included in the dataframe but is not saved in .col_reference_dict. Please initialize your BipartiteBase object to include this column by setting 'col_reference_dict=your_col_reference_dict'." 
] } ], "source": [ "bdf_long = bpd.BipartiteDataFrame(\n", " i=df['i'], j=df['j'], y=df['y'], t=df['t'],\n", " l=df['l'], k=df['k']\n", ")\n", "bdf_long['alpha'] = df['alpha']\n", "bdf_long['psi'] = df['psi']\n", "bdf_long = bdf_long.clean()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }