pivot_table
method of
DataFrame
pd.pivot_table
Three reports have come in about the number of accidents
at Eighth Avenue & 42nd Street.
The
dfOfMeans
has a
MultiIndex
containing two levels.
The
pivot_table
has a plain old
Index
containing only one level.
"Create a pivot table using groupby and unstack."
import sys
import random
import pandas as pd
data = [
[42, "Eighth", 4280],
[42, "Eighth", 4281],
[42, "Eighth", 4282],
[42, "Seventh", 4270],
[42, "Seventh", 4271],
[42, "Seventh", 4272],
[42, "Sixth", 4260],
[42, "Sixth", 4261],
[42, "Sixth", 4262],
[43, "Eighth", 4380],
[43, "Eighth", 4381],
[43, "Eighth", 4382],
[43, "Seventh", 4370],
[43, "Seventh", 4371],
[43, "Seventh", 4372],
[43, "Sixth", 4360],
[43, "Sixth", 4361],
[43, "Sixth", 4362],
[44, "Eighth", 4480],
[44, "Eighth", 4481],
[44, "Eighth", 4482],
[44, "Seventh", 4470],
[44, "Seventh", 4471],
[44, "Seventh", 4472],
[44, "Sixth", 4460],
[44, "Sixth", 4461],
[44, "Sixth", 4462]
]
random.shuffle(data)
columns = pd.Index(data = ["street", "avenue", "accidents"])
df = pd.DataFrame(data = data, columns = columns)
print(df)
print()
dfOfMeans = df.groupby(["street", "avenue"]).mean()
print(dfOfMeans)
print()
pivot_table = dfOfMeans.unstack()
print(pivot_table)
sys.exit(0)
street avenue accidents
0 42 Eighth 4281
1 43 Eighth 4380
2 44 Eighth 4481
3 43 Seventh 4370
4 43 Sixth 4360
5 44 Sixth 4462
6 42 Seventh 4272
7 44 Seventh 4472
8 42 Sixth 4260
9 42 Eighth 4280
10 44 Sixth 4461
11 42 Eighth 4282
12 43 Seventh 4372
13 42 Sixth 4262
14 44 Seventh 4470
15 44 Eighth 4482
16 44 Eighth 4480
17 43 Seventh 4371
18 42 Seventh 4271
19 42 Seventh 4270
20 43 Eighth 4381
21 43 Eighth 4382
22 42 Sixth 4261
23 43 Sixth 4362
24 43 Sixth 4361
25 44 Sixth 4460
26 44 Seventh 4471
accidents
street avenue
42 Eighth 4281
Seventh 4271
Sixth 4261
43 Eighth 4381
Seventh 4371
Sixth 4361
44 Eighth 4481
Seventh 4471
Sixth 4461
accidents
avenue Eighth Seventh Sixth
street
42 4281 4271 4261
43 4381 4371 4361
44 4481 4471 4461
"Create a pivot table using pivot_table."
import sys
import random
import pandas as pd
data = [
[42, "Eighth", 4280],
[42, "Eighth", 4281],
[42, "Eighth", 4282],
[42, "Seventh", 4270],
[42, "Seventh", 4271],
[42, "Seventh", 4272],
[42, "Sixth", 4260],
[42, "Sixth", 4261],
[42, "Sixth", 4262],
[43, "Eighth", 4380],
[43, "Eighth", 4381],
[43, "Eighth", 4382],
[43, "Seventh", 4370],
[43, "Seventh", 4371],
[43, "Seventh", 4372],
[43, "Sixth", 4360],
[43, "Sixth", 4361],
[43, "Sixth", 4362],
[44, "Eighth", 4480],
[44, "Eighth", 4481],
[44, "Eighth", 4482],
[44, "Seventh", 4470],
[44, "Seventh", 4471],
[44, "Seventh", 4472],
[44, "Sixth", 4460],
[44, "Sixth", 4461],
[44, "Sixth", 4462]
]
random.shuffle(data)
columns = pd.Index(data = ["street", "avenue", "accidents"])
df = pd.DataFrame(data = data, columns = columns)
print(df)
print()
pivot_table = df.pivot_table(
values = ["accidents"], #the default
index = "street",
columns = "avenue",
aggfunc = "mean", #the default
margins = False #the default
)
print(pivot_table)
sys.exit(0)
street avenue accidents
0 44 Sixth 4461
1 43 Sixth 4361
2 44 Eighth 4481
3 42 Sixth 4261
4 44 Sixth 4462
5 43 Sixth 4362
6 42 Sixth 4262
7 42 Eighth 4281
8 42 Eighth 4282
9 44 Seventh 4471
10 42 Sixth 4260
11 43 Seventh 4371
12 43 Eighth 4382
13 43 Seventh 4372
14 44 Eighth 4480
15 42 Seventh 4272
16 42 Eighth 4280
17 42 Seventh 4270
18 43 Seventh 4370
19 43 Sixth 4360
20 43 Eighth 4381
21 42 Seventh 4271
22 44 Seventh 4472
23 44 Eighth 4482
24 44 Seventh 4470
25 44 Sixth 4460
26 43 Eighth 4380
accidents
avenue Eighth Seventh Sixth
street
42 4281 4271 4261
43 4381 4371 4361
44 4481 4471 4461
margins = True
pivot_table
to get an extra row at the bottom,
giving the average for each avenue,
and an extra column at the right,
giving the average for each street.
accidents
avenue Eighth Seventh Sixth All
street
42 4281 4271 4261 4271
43 4381 4371 4361 4371
44 4481 4471 4461 4471
All 4381 4371 4361 4371
Then set it back to
False.
data?
[42, "Eighth", 4285],
accidents
avenue Eighth Seventh Sixth
street
42 4282 4271 4261
43 4381 4371 4361
44 4481 4471 4461
data
to
[42, "Eighth", 4286],
accidents
avenue Eighth Seventh Sixth
street
42 4282.25 4271.0 4261.0
43 4381.00 4371.0 4361.0
44 4481.00 4471.0 4461.0
and the
dtype
for each of the three columns
changes from
np.int64,
to
np.float64,
as you can check by saying
print(pivot_table.dtypes) #Prints a Series of 3 items.
avenue
accidents Eighth float64
Seventh float64
Sixth float64
dtype: object
data
to
[45, "Eighth", 4580],
np.nans,
and the
dtype
for each of the three columns changes to
np.float64.
accidents
avenue Eighth Seventh Sixth
street
42 4281.0 4271.0 4261.0
43 4381.0 4371.0 4361.0
44 4481.0 4471.0 4461.0
45 4580.0 NaN NaN
To insert
0
instead of
np.nan,
give the keyword argument
fill_value = 0
pivot_table.
The
dtypes
revert to
np.int64.
accidents
avenue Eighth Seventh Sixth
street
42 4281 4271 4261
43 4381 4371 4361
44 4481 4471 4461
45 4580 0 0
data
to
[44, "Fifth", 4410],
fill_value = 0
np.nans.
accidents
avenue Eighth Fifth Seventh Sixth
street
42 4281 0 4271 4261
43 4381 0 4371 4361
44 4481 4410 4471 4461
Instead of alphabetical order,
list the avenues from west to east:
labels = ["Eighth", "Seventh", "Sixth", "Fifth"] pivot_table = pivot_table.reindex(labels = labels, axis = 1, level = 1)
accidents
avenue Eighth Seventh Sixth Fifth
street
42 4281 4271 4261 0
43 4381 4371 4361 0
44 4481 4471 4461 4410
pivot_table’s
index of columns
from a two-level
pd.MultiIndex
to a plain old one-level
pd.Index.
print(f"{type(pivot_table.columns) = }") #pd.MultiIndex print() print(pivot_table) print() pivot_table = pivot_table.droplevel(axis = 1, level = 0) #or axis = "columns" print(f"{type(pivot_table.columns) = }") #plain old pd.Index print() print(pivot_table)
type(pivot_table.columns) = <class 'pandas.core.indexes.multi.MultiIndex'> accidents avenue Eighth Seventh Sixth street 42 4281 4271 4261 43 4381 4371 4361 44 4481 4471 4461 type(pivot_table.columns) = <class 'pandas.core.indexes.base.Index'> avenue Eighth Seventh Sixth street 42 4281 4271 4261 43 4381 4371 4361 44 4481 4471 4461
pd.MultiIndex.
Here’s what it’s for.
Add a second column of data.
data = [
[42, "Eighth", 4280, 14280],
[42, "Eighth", 4281, 14281],
[42, "Eighth", 4282, 14282],
[42, "Seventh", 4270, 14270],
[42, "Seventh", 4271, 14271],
[42, "Seventh", 4272, 14272],
[42, "Sixth", 4260, 14260],
[42, "Sixth", 4261, 14261],
[42, "Sixth", 4262, 14262],
[43, "Eighth", 4380, 14380],
[43, "Eighth", 4381, 14381],
[43, "Eighth", 4382, 14382],
[43, "Seventh", 4370, 14370],
[43, "Seventh", 4371, 14371],
[43, "Seventh", 4372, 14372],
[43, "Sixth", 4360, 14360],
[43, "Sixth", 4361, 14361],
[43, "Sixth", 4362, 14362],
[44, "Eighth", 4480, 14480],
[44, "Eighth", 4481, 14481],
[44, "Eighth", 4482, 14482],
[44, "Seventh", 4470, 14470],
[44, "Seventh", 4471, 14471],
[44, "Seventh", 4472, 14472],
[44, "Sixth", 4460, 14460],
[44, "Sixth", 4461, 14461],
[44, "Sixth", 4462, 14462]
]
columns = pd.Index(data = ["street", "avenue", "accidents", "vehicles"])
values = ["accidents", "vehicles"], #keyword argument of pivot_table
accidents vehicles
avenue Eighth Seventh Sixth Eighth Seventh Sixth
street
42 4281 4271 4261 14281 14271 14261
43 4381 4371 4361 14381 14371 14361
44 4481 4471 4461 14481 14471 14461
list
aggfunc = ["sum", "count", "mean"], #keyword argument of pivot_table
sum count mean
accidents vehicles accidents vehicles accidents vehicles
avenue Eighth Seventh Sixth Eighth Seventh Sixth Eighth Seventh Sixth Eighth Seventh Sixth Eighth Seventh Sixth Eighth Seventh Sixth
street
42 12843 12813 12783 42843 42813 42783 3 3 3 3 3 3 4281 4271 4261 14281 14271 14261
43 13143 13113 13083 43143 43113 43083 3 3 3 3 3 3 4381 4371 4361 14381 14371 14361
44 13443 13413 13383 43443 43413 43383 3 3 3 3 3 3 4481 4471 4461 14481 14471 14461
or to a
dict
aggfunc = {"accidents": "sum", "vehicles": "mean"}, #keyword argument of pivot_table
accidents vehicles
avenue Eighth Seventh Sixth Eighth Seventh Sixth
street
42 12843 12813 12783 14281 14271 14261
43 13143 13113 13083 14381 14371 14361
44 13443 13413 13383 14481 14471 14461