A reduction or aggregation is an operation that takes an entire Series and produces a result that is a single number (or a single string).
"""Demonstrate reductions (aggregations) over a pandas Series of prices."""
import sys
import numpy as np
import pandas as pd

series = pd.Series([10.0, 20.0, 30.0, 40.0, 50.0], name = "prices")
series.index.name = "day"
print(series)
print()

#Reductions that boil all five values down to one number.
print(f"{series.count() = }")
print(f"{series.sum() = }") #series.prod() multiplies instead of adding
print(f"{series.mean() = }")
print(f"{series.median() = }")
print()

#The extremes, and where they live.
print(f"{series.min() = }")
print(f"{series.max() = }")
print(f"{series.idxmin() = }") #index label of the smallest value
print(f"{series.idxmax() = }")
print(f"{np.argmin(series.array) = }") #integer position (like iloc) of the smallest value
print(f"{np.argmax(series.array) = }")
print()

print(series.describe()) #a Series of 8 summary statistics
sys.exit(0)
day 0 10.0 1 20.0 2 30.0 3 40.0 4 50.0 Name: prices, dtype: float64 series.count() = 5 series.sum() = 150.0 series.mean() = 30.0 series.median() = 30.0 series.min() = 10.0 series.max() = 50.0 series.idxmin() = 0 series.idxmax() = 4 np.argmin(series.array) = 0 np.argmax(series.array) = 4 count 5.000000 mean 30.000000 std 15.811388 min 10.000000 25% 20.000000 50% 30.000000 75% 40.000000 max 50.000000 Name: prices, dtype: float64
"""Show how pandas reductions treat missing values."""
import sys
import numpy as np
import pandas as pd

#Both np.nan and None become NaN in a float64 Series.
series = pd.Series([0.0, 10.0, np.nan, 30.0, None])
print(series)
print()

#len and size count every cell; count() skips the missing ones.
print(f"{len(series) = }")
print(f"{series.size = }") #series.shape would give the tuple (5,)
print(f"{series.count() = }")
print()

print(f"{series.sum() = }") #skips NaN by default
print(f"{series.sum(skipna = False) = }") #a single NaN poisons the whole sum
sys.exit(0)
0 0.0 1 10.0 2 NaN 3 30.0 4 NaN dtype: float64 len(series) = 5 series.size = 5 series.count() = 3 series.sum() = 40.0 series.sum(skipna = False) = nan
"""Covariance and correlation."""
import sys
import math
import pandas as pd

#Observations for N stores: how many TV commercials each ran, and its sales.
N = 7
commercials = pd.Series([ 2, 5, 1, 3, 4, 1, 5], name = "Number of Commercials")
sales = pd.Series([24, 28, 22, 26, 25, 24, 26], name = "Sales Volume in Hundreds")

#For each statistic: the pandas one-liner, then the same value from its textbook definition.
print("sample variance:")
print(commercials.var())
print(((commercials - commercials.mean()) ** 2).sum() / (N - 1))
print()

print("sample standard deviation:")
print(commercials.std())
print(math.sqrt(commercials.var()))
print()

print("sample covariance:")
print(commercials.cov(sales))
deviations0 = commercials - commercials.mean()
deviations1 = sales - sales.mean()
print((deviations0 * deviations1).sum() / (N - 1))
print()

print("sample correlation coefficient:")
print(commercials.corr(sales))
print(commercials.cov(sales) / (commercials.std() * sales.std()))
sys.exit(0)
sample variance: 3.0 3.0 sample standard deviation: 1.7320508075688772 1.7320508075688772 sample covariance: 2.833333333333333 2.8333333333333335 sample correlation coefficient: 0.8542821429703302 0.8542821429703302
"Scatter plot of a pair of pd.Serieses."
import pandas as pd
import matplotlib.pyplot as plt

#data about 7 stores
data = [
    [ 2, 5, 1, 3, 4, 1, 5], #number of commercials for each store
    [24, 28, 22, 26, 25, 24, 26] #sales volume in hundreds of dollars
]
series0 = pd.Series(data = data[0], name = "commercials")
series1 = pd.Series(data = data[1], name = "volume")
df = pd.concat([series0, series1], axis = 1) #Create a pd.DataFrame containing 2 columns.
print(df)

axes = df.plot.scatter(
    x = "commercials",    #column plotted along the x axis
    y = "volume",         #column plotted along the y axis
    figsize = [6.4, 4.8], #DataFrame.plot.scatter creates a new Figure.
    color = "#1f77b4",    #hex red, green, blue
    grid = False,
    marker = "o",         #style of marker; also try "s"
    s = 25                #size of marker in points
)
figure = plt.gcf()

#FigureCanvasBase.set_window_title was deprecated in matplotlib 3.4 and removed in
#3.6; the supported call goes through the canvas's window manager instead.
figure.canvas.manager.set_window_title("matplotlib DataFrame.plot.scatter")
axes.set_title("Scatter Plot")
axes.set_xlabel("number of commercials")
axes.set_ylabel("sales volume in hundreds of dollars")
plt.show() #blocks here until the user closes the window
commercials volume 0 2 24 1 5 28 2 1 22 3 3 26 4 4 25 5 1 24 6 5 26
"""Create a Series with a pd.DatetimeIndex. Compute the percent change in each row."""
import sys
import pandas as pd

#Seven consecutive calendar days; pd.date_range fills in every day between the endpoints.
first_day = pd.Timestamp(year = 2020, month = 12, day = 25) #or pd.Timestamp("2020-12-25")
last_day = pd.Timestamp(year = 2020, month = 12, day = 31)
dates = pd.date_range(start = first_day, end = last_day, freq = "1D", name = "date")
#or dates = pd.date_range("2020-12-25", "2020-12-31", name = "date")

series = pd.Series(data = [25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0], index = dates, name = "Prices")
text = series.to_string(dtype = True, float_format = lambda price: f"${price:.2f}", length = True, name = True)
print(text)
print()

#Examine the index in greater detail.
print(f"{series.index = }")
print()
print(f"{type(series.index) = }")
print(f"{series.index.dtype.name = }")
print(f"{series.index.freqstr = }")
print()

#Each row's change relative to the previous row; the first row has no predecessor.
changes = series.pct_change()
changes.name = "Percent Change"
text = changes.to_string(dtype = True, float_format = lambda change: f"{change:.4f} %", length = True, name = True)
print(text)
sys.exit(0)
date
2020-12-25 $25.00
2020-12-26 $26.00
2020-12-27 $27.00
2020-12-28 $28.00
2020-12-29 $29.00
2020-12-30 $30.00
2020-12-31 $31.00
Freq: D, Name: Prices, Length: 7, dtype: float64
series.index = DatetimeIndex(['2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28',
'2020-12-29', '2020-12-30', '2020-12-31'],
dtype='datetime64[ns]', name='date', freq='D')
type(series.index) = <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
series.index.dtype.name = 'datetime64[ns]'
series.index.freqstr = 'D'
date
2020-12-25 nan %
2020-12-26 0.0400 %
2020-12-27 0.0385 %
2020-12-28 0.0370 %
2020-12-29 0.0357 %
2020-12-30 0.0345 %
2020-12-31 0.0333 %
Freq: D, Name: Percent Change, Length: 7, dtype: float64
pip3 install pandas-datareader pip3 show pandas-datareader Name: pandas-datareader Version: 0.8.1 Summary: Data readers extracted from the pandas codebase, should be compatible with recent pandas versions Home-page: https://github.com/pydata/pandas-datareader Author: The PyData Development Team Author-email: pydata@googlegroups.com License: BSD License Location: /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages Requires: lxml, pandas, requests Required-by:
"""
How closely correlated is Apple with Google?
closeAAPL, closeGOOG, changeAAPL, changeGOOG are Serieses.

NOTE(review): pandas-datareader's get_data_yahoo relies on an unofficial Yahoo
Finance endpoint and has broken whenever Yahoo changed it — confirm it still
works with the installed pandas-datareader version before relying on this script.
"""
import sys
import pandas as pd
import pandas_datareader

#Download daily price history; get_data_yahoo returns a pd.DataFrame.
df = pandas_datareader.data.get_data_yahoo(symbols = "AAPL")
closeAAPL = df["Adj Close"] #Get the Adjusted Close column of the DataFrame.
closeAAPL.name = "AAPL Adj Close"

df = pandas_datareader.data.get_data_yahoo(symbols = "GOOG")
closeGOOG = df["Adj Close"]
closeGOOG.name = "GOOG Adj Close"

#Use the full option name: the "max_rows" shorthand is ambiguous in newer pandas
#(it matches several display options) and raises an OptionError there.
pd.set_option("display.max_rows", 6)
print(closeAAPL)
print()
print(closeGOOG)
print()

#Day-over-day percent change; the first row is NaN because it has no predecessor.
changeAAPL = closeAAPL.pct_change() #percent change
changeGOOG = closeGOOG.pct_change()
changeAAPL.name = "Percent Change AAPL"
changeGOOG.name = "Percent Change GOOG"
print(changeAAPL)
print()
print(changeGOOG)
print()

#corr and cov align the two Serieses on their DatetimeIndex and exclude NaN pairs.
print(f"{changeAAPL.corr(changeGOOG) = }")
print(f"{changeAAPL.cov(changeGOOG) = }")
sys.exit(0)
Unfortunately, the first row of the output of Series.pct_change is always np.nan, because there is no previous row to compare the first row with.
Date
2014-12-12 100.821831
2014-12-15 99.443619
2014-12-16 98.083740
...
2019-12-09 266.920013
2019-12-10 268.480011
2019-12-11 268.850006
Name: AAPL Adj Close, Length: 1258, dtype: float64
Date
2014-12-12 517.239929
2014-12-15 512.393250
2014-12-16 494.033630
...
2019-12-09 1343.560059
2019-12-10 1344.660034
2019-12-11 1345.785034
Name: GOOG Adj Close, Length: 1258, dtype: float64
Date
2014-12-12 NaN
2014-12-15 -0.013670
2014-12-16 -0.013675
...
2019-12-09 -0.014000
2019-12-10 0.005844
2019-12-11 0.001378
Name: Percent Change AAPL, Length: 1258, dtype: float64
Date
2014-12-12 NaN
2014-12-15 -0.009370
2014-12-16 -0.035831
...
2019-12-09 0.002193
2019-12-10 0.000819
2019-12-11 0.000837
Name: Percent Change GOOG, Length: 1258, dtype: float64
changeAAPL.corr(changeGOOG) = 0.5233805904933649
changeAAPL.cov(changeGOOG) = 0.00012464409418096498
Here’s how I found out that there is a "Volume" column in addition to the "Adj Close" column.
df = pandas_datareader.data.get_data_yahoo("AAPL") #df is a pd.DataFrame of daily price history
print(f"{df.columns = }") #show every column name the DataFrame provides
df.columns = Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')

Is there any correlation between the volume and the adjusted close?