pivot_table: create a pivot table

Documentation

Pivot table in Wikipedia
Pivot tables in the pandas User Guide
pivot tables in the pandas Cookbook
pivot_table method of DataFrame
pd.pivot_table

Create a pivot table using groupby and unstack.

Three reports have come in about the number of accidents at Eighth Avenue & 42nd Street. The dfOfMeans has a MultiIndex containing two levels. The pivot_table has a plain old Index containing only one level.

"Create a pivot table using groupby and unstack."
import sys
import random
import pandas as pd

data = [
    [42, "Eighth",  4280],
    [42, "Eighth",  4281],
    [42, "Eighth",  4282],
    [42, "Seventh", 4270],
    [42, "Seventh", 4271],
    [42, "Seventh", 4272],
    [42, "Sixth",   4260],
    [42, "Sixth",   4261],
    [42, "Sixth",   4262],

    [43, "Eighth",  4380],
    [43, "Eighth",  4381],
    [43, "Eighth",  4382],
    [43, "Seventh", 4370],
    [43, "Seventh", 4371],
    [43, "Seventh", 4372],
    [43, "Sixth",   4360],
    [43, "Sixth",   4361],
    [43, "Sixth",   4362],

    [44, "Eighth",  4480],
    [44, "Eighth",  4481],
    [44, "Eighth",  4482],
    [44, "Seventh", 4470],
    [44, "Seventh", 4471],
    [44, "Seventh", 4472],
    [44, "Sixth",   4460],
    [44, "Sixth",   4461],
    [44, "Sixth",   4462]
]

random.shuffle(data)
columns = pd.Index(data = ["street", "avenue", "accidents"])
df = pd.DataFrame(data = data, columns = columns)
print(df)
print()

dfOfMeans = df.groupby(["street", "avenue"]).mean()
print(dfOfMeans)
print()

pivot_table = dfOfMeans.unstack()
print(pivot_table)

sys.exit(0)

    street   avenue  accidents
0       42   Eighth       4281
1       43   Eighth       4380
2       44   Eighth       4481
3       43  Seventh       4370
4       43    Sixth       4360
5       44    Sixth       4462
6       42  Seventh       4272
7       44  Seventh       4472
8       42    Sixth       4260
9       42   Eighth       4280
10      44    Sixth       4461
11      42   Eighth       4282
12      43  Seventh       4372
13      42    Sixth       4262
14      44  Seventh       4470
15      44   Eighth       4482
16      44   Eighth       4480
17      43  Seventh       4371
18      42  Seventh       4271
19      42  Seventh       4270
20      43   Eighth       4381
21      43   Eighth       4382
22      42    Sixth       4261
23      43    Sixth       4362
24      43    Sixth       4361
25      44    Sixth       4460
26      44  Seventh       4471

                accidents
street avenue 
42     Eighth        4281
       Seventh       4271
       Sixth         4261
43     Eighth        4381
       Seventh       4371
       Sixth         4361
44     Eighth        4481
       Seventh       4471
       Sixth         4461

       accidents              
avenue    Eighth Seventh Sixth
street
42          4281    4271  4261
43          4381    4371  4361
44          4481    4471  4461

Create a pivot table using pivot_table.

"Create a pivot table using pivot_table."
import sys
import random
import pandas as pd

data = [
    [42, "Eighth",  4280],
    [42, "Eighth",  4281],
    [42, "Eighth",  4282],
    [42, "Seventh", 4270],
    [42, "Seventh", 4271],
    [42, "Seventh", 4272],
    [42, "Sixth",   4260],
    [42, "Sixth",   4261],
    [42, "Sixth",   4262],

    [43, "Eighth",  4380],
    [43, "Eighth",  4381],
    [43, "Eighth",  4382],
    [43, "Seventh", 4370],
    [43, "Seventh", 4371],
    [43, "Seventh", 4372],
    [43, "Sixth",   4360],
    [43, "Sixth",   4361],
    [43, "Sixth",   4362],

    [44, "Eighth",  4480],
    [44, "Eighth",  4481],
    [44, "Eighth",  4482],
    [44, "Seventh", 4470],
    [44, "Seventh", 4471],
    [44, "Seventh", 4472],
    [44, "Sixth",   4460],
    [44, "Sixth",   4461],
    [44, "Sixth",   4462]
]

random.shuffle(data)
columns = pd.Index(data = ["street", "avenue", "accidents"])
df = pd.DataFrame(data = data, columns = columns)
print(df)
print()

pivot_table = df.pivot_table(
    values = ["accidents"], #the default
    index = "street",
    columns = "avenue",
    aggfunc = "mean",       #the default
    margins = False         #the default
)

print(pivot_table)
sys.exit(0)

    street   avenue  accidents
0       44    Sixth       4461
1       43    Sixth       4361
2       44   Eighth       4481
3       42    Sixth       4261
4       44    Sixth       4462
5       43    Sixth       4362
6       42    Sixth       4262
7       42   Eighth       4281
8       42   Eighth       4282
9       44  Seventh       4471
10      42    Sixth       4260
11      43  Seventh       4371
12      43   Eighth       4382
13      43  Seventh       4372
14      44   Eighth       4480
15      42  Seventh       4272
16      42   Eighth       4280
17      42  Seventh       4270
18      43  Seventh       4370
19      43    Sixth       4360
20      43   Eighth       4381
21      42  Seventh       4271
22      44  Seventh       4472
23      44   Eighth       4482
24      44  Seventh       4470
25      44    Sixth       4460
26      43   Eighth       4380

       accidents
avenue    Eighth Seventh Sixth
street
42          4281    4271  4261
43          4381    4371  4361
44          4481    4471  4461

Things to try

Give the keyword argument
margins = True
to pivot_table to get an extra row at the bottom, giving the average for each avenue, and an extra column at the right, giving the average for each street.

       accidents
avenue    Eighth Seventh Sixth   All
street
42          4281    4271  4261  4271
43          4381    4371  4361  4371
44          4481    4471  4461  4471
All         4381    4371  4361  4371

Then set it back to False.

What happens when you add the following row to the data?
[42, "Eighth", 4285],
The mean for 42nd & Eighth has increased from 4281 to 4282. It is now the sum of four integers.
```
       accidents
avenue    Eighth Seventh Sixth
street
42          4282    4271  4261
43          4381    4371  4361
44          4481    4471  4461
```

What happens when you change the new row of the data to
[42, "Eighth", 4286],
The mean for 42nd & Eighth is now 4282.25

       accidents
avenue    Eighth Seventh   Sixth
street
42       4282.25  4271.0  4261.0
43       4381.00  4371.0  4361.0
44       4481.00  4471.0  4461.0

and the dtype for each of the three columns changes from np.int64, to np.float64, as you can check by saying

print(pivot_table.dtypes)   #Prints a Series of 3 items.

           avenue
accidents  Eighth     float64
           Seventh    float64
           Sixth      float64
dtype: object

What happens when you change the new row of the data to
[45, "Eighth", 4580],
The output gets a new row containing two np.nans, and the dtype for each of the three columns changes to np.float64.

       accidents
avenue    Eighth Seventh   Sixth
street
42        4281.0  4271.0  4261.0
43        4381.0  4371.0  4361.0
44        4481.0  4471.0  4461.0
45        4580.0     NaN     NaN

To insert 0 instead of np.nan, give the keyword argument
fill_value = 0
to pivot_table. The dtypes revert to np.int64.

       accidents
avenue    Eighth Seventh Sixth
street
42          4281    4271  4261
43          4381    4371  4361
44          4481    4471  4461
45          4580       0     0

What happens when you change the new row of the data to
[44, "Fifth", 4410],
Keep the
fill_value = 0
The output gets a new column with two np.nans.

       accidents
avenue    Eighth Fifth Seventh Sixth
street                
42          4281     0    4271  4261
43          4381     0    4371  4361
44          4481  4410    4471  4461

Instead of alphabetical order, list the avenues from west to east:

labels = ["Eighth", "Seventh", "Sixth", "Fifth"]
pivot_table = pivot_table.reindex(labels = labels, axis = 1, level = 1)

       accidents
avenue    Eighth Seventh Sixth Fifth
street                              
42          4281    4271  4261     0
43          4381    4371  4361     0
44          4481    4471  4461  4410

It would seem to be simpler if we change the pivot_table’s index of columns from a two-level pd.MultiIndex to a plain old one-level pd.Index.

print(f"{type(pivot_table.columns) = }")   #pd.MultiIndex
print()
print(pivot_table)
print()

pivot_table = pivot_table.droplevel(axis = 1, level = 0) #or axis = "columns"
print(f"{type(pivot_table.columns) = }")   #plain old pd.Index
print()
print(pivot_table)

type(pivot_table.columns) = <class 'pandas.core.indexes.multi.MultiIndex'>

      accidents              
avenue   Eighth Seventh Sixth
street
42         4281    4271  4261
43         4381    4371  4361
44         4481    4471  4461

type(pivot_table.columns) = <class 'pandas.core.indexes.base.Index'>

avenue  Eighth  Seventh  Sixth
street
42        4281     4271   4261
43        4381     4371   4361
44        4481     4471   4461

But keep the pd.MultiIndex. Here’s what it’s for.

Add a second column of data.

data = [
    [42, "Eighth",  4280, 14280],
    [42, "Eighth",  4281, 14281],
    [42, "Eighth",  4282, 14282],
    [42, "Seventh", 4270, 14270],
    [42, "Seventh", 4271, 14271],
    [42, "Seventh", 4272, 14272],
    [42, "Sixth",   4260, 14260],
    [42, "Sixth",   4261, 14261],
    [42, "Sixth",   4262, 14262],

    [43, "Eighth",  4380, 14380],
    [43, "Eighth",  4381, 14381],
    [43, "Eighth",  4382, 14382],
    [43, "Seventh", 4370, 14370],
    [43, "Seventh", 4371, 14371],
    [43, "Seventh", 4372, 14372],
    [43, "Sixth",   4360, 14360],
    [43, "Sixth",   4361, 14361],
    [43, "Sixth",   4362, 14362],

    [44, "Eighth",  4480, 14480],
    [44, "Eighth",  4481, 14481],
    [44, "Eighth",  4482, 14482],
    [44, "Seventh", 4470, 14470],
    [44, "Seventh", 4471, 14471],
    [44, "Seventh", 4472, 14472],
    [44, "Sixth",   4460, 14460],
    [44, "Sixth",   4461, 14461],
    [44, "Sixth",   4462, 14462]
]

columns = pd.Index(data = ["street", "avenue", "accidents", "vehicles"])

    values = ["accidents", "vehicles"],   #keyword argument of pivot_table

       accidents               vehicles               
avenue    Eighth Seventh Sixth   Eighth Seventh  Sixth
street                                                
42          4281    4271  4261    14281   14271  14261
43          4381    4371  4361    14381   14371  14361
44          4481    4471  4461    14481   14471  14461

Change the aggregation function to a list
aggfunc = ["sum", "count", "mean"], #keyword argument of pivot_table

             sum                                            count                                           mean                                      
       accidents                vehicles                accidents               vehicles               accidents               vehicles               
avenue    Eighth Seventh  Sixth   Eighth Seventh  Sixth    Eighth Seventh Sixth   Eighth Seventh Sixth    Eighth Seventh Sixth   Eighth Seventh  Sixth
street                                                                                                                                                
42         12843   12813  12783    42843   42813  42783         3       3     3        3       3     3      4281    4271  4261    14281   14271  14261
43         13143   13113  13083    43143   43113  43083         3       3     3        3       3     3      4381    4371  4361    14381   14371  14361
44         13443   13413  13383    43443   43413  43383         3       3     3        3       3     3      4481    4471  4461    14481   14471  14461

or to a dict
aggfunc = {"accidents": "sum", "vehicles": "mean"}, #keyword argument of pivot_table

       accidents                vehicles
avenue    Eighth Seventh  Sixth   Eighth Seventh  Sixth
street
42         12843   12813  12783    14281   14271  14261
43         13143   13113  13083    14381   14371  14361
44         13443   13413  13383    14481   14471  14461