{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### This notebook demonstrates the ability of the DisparateImpactRemover algorithm.\n",
    "The algorithm corrects for imbalanced selection rates between unprivileged and privileged groups at various levels of repair. It follows the guidelines set forth by [1] for training the algorithm and classifier and uses the AdultDataset as an example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "from __future__ import unicode_literals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "\n",
    "import sys\n",
    "sys.path.append(\"../\")\n",
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC as SVM\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "from aif360.algorithms.preprocessing import DisparateImpactRemover\n",
    "from aif360.datasets import AdultDataset\n",
    "from aif360.metrics import BinaryLabelDatasetMetric\n",
    "\n",
    "from IPython.display import Markdown, display"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Dataset\n",
    "https://archive.ics.uci.edu/ml/datasets/adult"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This data was extracted from the census bureau database.\n",
    "\n",
    "48842 instances, mix of continuous and discrete (train=32561, test=16281)\n",
    "\n",
    "45222 if instances with unknown values are removed (train=30162, test=15060)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load the raw training file\n",
    "The cells below read `adult.data`, encode the income label as 1 (`>50K`) or 0 (anything else), and load the parsed rows into a DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw training file, resolved relative to the notebook's working directory.\n",
    "ADULT_DATA_PATH = \"adult.data\"\n",
    "\n",
    "# Column names in file order; the last column holds the income label.\n",
    "ADULT_COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',\n",
    "                 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',\n",
    "                 'native-country', 'income-per-year']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read all raw lines in one step. The context manager closes the file even if\n",
    "# reading fails, and re-running this cell always yields the full file again.\n",
    "with open(ADULT_DATA_PATH, \"r\") as f:\n",
    "    fl = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_adult_line(line):\n",
    "    \"\"\"Turn one raw line of the Adult file into a list of field values.\n",
    "\n",
    "    Every space is removed and trailing whitespace is stripped, then the\n",
    "    line is split on commas. The last field (the income label) is replaced\n",
    "    by the integer 1 when it equals '>50K' and by 0 otherwise. All other\n",
    "    fields are returned as strings.\n",
    "    \"\"\"\n",
    "    fields = line.replace(\" \", \"\").rstrip().split(\",\")\n",
    "    fields[-1] = 1 if fields[-1] == '>50K' else 0\n",
    "    return fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Skip blank lines: a blank line would otherwise be parsed into the one-element\n",
    "# row [0], which pandas pads into a spurious, mostly-empty record.\n",
    "final_list = [parse_adult_line(line) for line in fl if line.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "adult_df = pd.DataFrame(final_list, columns=ADULT_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(32562, 15)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adult_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | age | \n", "workclass | \n", "fnlwgt | \n", "education | \n", "education-num | \n", "marital-status | \n", "occupation | \n", "relationship | \n", "race | \n", "sex | \n", "capital-gain | \n", "capital-loss | \n", "hours-per-week | \n", "native-country | \n", "income-per-year | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "39 | \n", "State-gov | \n", "77516 | \n", "Bachelors | \n", "13 | \n", "Never-married | \n", "Adm-clerical | \n", "Not-in-family | \n", "White | \n", "Male | \n", "2174 | \n", "0 | \n", "40 | \n", "United-States | \n", "0.0 | \n", "
1 | \n", "50 | \n", "Self-emp-not-inc | \n", "83311 | \n", "Bachelors | \n", "13 | \n", "Married-civ-spouse | \n", "Exec-managerial | \n", "Husband | \n", "White | \n", "Male | \n", "0 | \n", "0 | \n", "13 | \n", "United-States | \n", "0.0 | \n", "
2 | \n", "38 | \n", "Private | \n", "215646 | \n", "HS-grad | \n", "9 | \n", "Divorced | \n", "Handlers-cleaners | \n", "Not-in-family | \n", "White | \n", "Male | \n", "0 | \n", "0 | \n", "40 | \n", "United-States | \n", "0.0 | \n", "
3 | \n", "53 | \n", "Private | \n", "234721 | \n", "11th | \n", "7 | \n", "Married-civ-spouse | \n", "Handlers-cleaners | \n", "Husband | \n", "Black | \n", "Male | \n", "0 | \n", "0 | \n", "40 | \n", "United-States | \n", "0.0 | \n", "
4 | \n", "28 | \n", "Private | \n", "338409 | \n", "Bachelors | \n", "13 | \n", "Married-civ-spouse | \n", "Prof-specialty | \n", "Wife | \n", "Black | \n", "Female | \n", "0 | \n", "0 | \n", "40 | \n", "Cuba | \n", "0.0 | \n", "