Data exploration and visualization

Author

Michał Janiga

1 Analysis of stock price volatility

For data from the last 15 months concerning two selected listed companies:
- create charts of the percentage changes of closing prices against the date,
- plot and compare histograms of the percentage changes of closing prices, and add density lines for normal distributions with parameters derived from the data to the charts,
- create one joint figure with box plots of the closing price changes.

1.1 Data preparation

show/hide
ticker1 <- "CDR"
ticker2 <- "PZU"
cutoff_date <- seq(Sys.Date(), length = 2, by = "-15 months")[2]

if (file.exists(".env")) readRenviron(".env")
apikey <- Sys.getenv("STOOQ_API_KEY")

download_data <- function(ticker) {
  url <- paste0('https://stooq.pl/q/d/l/?s=', ticker, '&i=d', '&apikey=', apikey)

  file_path <- paste0("data/", ticker, ".csv")
  if (!file.exists(file_path)) {
    download.file(url, file_path, quiet = TRUE)
  }
  
  data <- read.csv(file_path)
  data$Data <- as.Date(data$Data)

  data <- data[data$Data >= cutoff_date, ]
  data <- data[order(data$Data), ]
  
  n <- nrow(data)
  data$PctChange <- c(NA, diff(data$Zamkniecie) / data$Zamkniecie[-n] * 100)
  
  data <- data[!is.na(data$PctChange), c("Data", "PctChange")]
  data$Ticker <- ticker
  return(data)
}

df1 <- download_data(ticker1)
df2 <- download_data(ticker2)

1.2 Chart 1a: Price over time: CDR

show/hide
plot(df1$Data, df1$PctChange, type = "l", col = "blue", 
     main = paste("Percentage changes:", ticker1),
     xlab = "Date: last 15 months", 
     ylab = "Closing price change (%)")

1.3 Chart 1b: Price over time: PZU

show/hide
plot(df2$Data, df2$PctChange, type = "l", col = "red", 
     main = paste("Percentage changes:", ticker2),
     xlab = "Date: last 15 months", 
     ylab = "Closing price change (%)")

1.4 Chart 2a: Histograms and normal distribution CDR

show/hide
hist(df1$PctChange, breaks = 30, freq = FALSE, 
     col = "lightblue",
     main = paste("Histogram:", ticker1), 
     xlab = "Closing price change (%)", ylab = "Frequency")
curve(dnorm(x, mean = mean(df1$PctChange), sd = sd(df1$PctChange)), 
      add = TRUE, col = "darkred", lwd = 2)

1.5 Chart 2b: Histograms and normal distribution PZU

show/hide
hist(df2$PctChange, breaks = 30, freq = FALSE,
     col = "mistyrose",
     main = paste("Histogram:", ticker2),
     xlab = "Closing price change (%)", ylab = "Frequency")
curve(dnorm(x, mean = mean(df2$PctChange), sd = sd(df2$PctChange)), 
      add = TRUE, col = "darkred", lwd = 2)

1.6 Chart 3: Volatility comparison (box plot)

show/hide
boxplot(df1$PctChange, df2$PctChange, 
        names = c(ticker1, ticker2),
        col = c("lightblue", "mistyrose"),
        main = "Daily volatility comparison",
        ylab = "Closing price change (%)")

Analyzing the data of the daily percentage change of the closing price, differences in the range of these changes can be observed. CDR shares proved to be more volatile in the studied period than PZU shares, which is a fairly obvious observation considering the nature of both companies. The normal distribution, in the case of analyzing daily volatility, provides useful information which is the general level of volatility of a given instrument in the studied period, but it does not reflect “unrealized” risk by underestimating extreme events.

2 Analysis of air crash statistics

  1. Create a chart of the number of air crashes in individual:
  • months of the year (January - December),
  • days of the month (1-31),
  • days of the week (weekdays()).
  1. Plot how the following changed over subsequent years:
  • the number of people who survived the crashes,
  • the percentage of people (in percent) who survived the crashes.
show/hide
crashes = read.csv("data/crashes.csv")

2.1 Conversion: date

show/hide
crashes$DateObj = as.Date(crashes$Date, format='%m/%d/%Y')

2.2 Conversion: date components

show/hide
crashes$Year = as.numeric(format(crashes$DateObj, '%Y'))
crashes$Month = factor(format(crashes$DateObj, '%m'))
crashes$Day = as.numeric(format(crashes$DateObj, '%d'))

crashes$WeekdayNum = as.numeric(format(crashes$DateObj, "%u"))

crashes$Weekday = factor(crashes$WeekdayNum, 
                         levels = 1:7, 
                         labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))

2.3 Aggregation: month

show/hide
month_table = table(crashes$Month)

2.4 Chart 1: Number of crashes in individual months of the year

Months that have 31 days will be burdened with an additional error resulting from the fact that there are more accidents the more days there are in a given time interval.

This could “more” emphasize the “danger” of a month with 31 days (and underestimate it in the case of 28 days).

show/hide
barplot(month_table, 
        main = "Number of air crashes in individual months",
        col = "blue", 
        xlab = "Month", 
        ylab = "Number of crashes")

2.5 Aggregation: day of the month

show/hide
day_table = table(crashes$Day)

2.6 Chart 2: Number of crashes on individual days of the month (1-31)

In this step, it should be emphasized that the 31st day of the month is unrepresentative because there are ~ half as many 31st days of the month as other days on an annual scale.

show/hide
barplot(day_table, 
        main = "Number of crashes by day of the month (1-31)",
        col = "yellow", 
        xlab = "Day of the month", 
        ylab = "Number of crashes")

2.7 Aggregation: day of the week

show/hide
day_table = table(crashes$Weekday)

2.8 Chart 3: Number of crashes on individual days of the week (1-7)

show/hide
barplot(day_table, 
        main = "Number of crashes by day of the week (1-7)",
        col = "red", 
        xlab = "Day of the week", 
        ylab = "Number of crashes")

2.9 Calculations

show/hide
crashes$Survivors = crashes$Aboard - crashes$Fatalities

data_per_year = aggregate(cbind(Survivors, Aboard) ~ Year, data = crashes, sum)

data_per_year$SurvivalRate = (data_per_year$Survivors / data_per_year$Aboard) * 100

2.10 Chart 1: Number of survivors

show/hide
plot(data_per_year$Year, data_per_year$Survivors,
     type = "h",
     col = "blue",
     main = "Number of survivors of crashes (subsequent years)",
     xlab = "Year",
     ylab = "Number of survivors")

2.11 Chart 2: Percentage of survivors

show/hide
plot(data_per_year$Year, data_per_year$SurvivalRate,
     type = "h",
     col = "darkgreen",
     main = "Percentage of survivors (%)",
     xlab = "Year",
     ylab = "Percentage (%)")

3 Binomial distribution simulation

  1. For two different sets of binomial distribution parameters (rbinom):
  • Binom(25,0.2)
  • Binom(25,0.8)

generate random samples consisting of M = 1000 samples and plot the values of the generated data.

  1. For each of the distributions, plot the empirical and theoretical (use dbinom function) probability functions on one chart, and empirical and theoretical (use pbinom function) cumulative distribution functions on the second chart. In both cases, scale the x-axis from 0 to 25.

3.1 Data generation

show/hide
n <- 25
M <- 1000
x_range <- 0:25

set.seed(42)
sample1 <- rbinom(M, size = n, prob = 0.2)
sample2 <- rbinom(M, size = n, prob = 0.8)

3.2 Distribution: Binom(25, 0.2)

3.2.1 Chart for generated values

show/hide
plot(sample1,
     main = "Values of Binom(25, 0.2)", 
     xlab = "Index", ylab = "Value",
     col = "blue")

3.2.2 Probability function (empirical vs theoretical)

show/hide
empirical_sample1 <- table(factor(sample1, levels = x_range)) / M
theoretical_sample1 <- dbinom(x_range, size = n, prob = 0.2)
show/hide
plot(x_range, theoretical_sample1, type = "h", lwd = 2, 
     col = "blue", 
     main = "Probability Binom(25, 0.2)", 
     xlab = "k", ylab = "P(X=k)")
points(x_range, empirical_sample1, col = "red", pch = 16)
legend("topright", 
       legend = c("Theoretical", "Empirical"), 
       col = c("blue", "red"), 
       lwd = c(2, NA), 
       pch = c(NA, 16))

3.2.3 Cumulative distribution function (empirical vs theoretical)

show/hide
plot(x_range, pbinom(x_range, size = n, prob = 0.2), 
     type = "s", lwd = 2, col = "blue",
     main = "Cumulative distribution function Binom(25, 0.2)", 
     xlab = "x", ylab = "F(x)")
lines(ecdf(sample1), col = "red", verticals = TRUE, do.points = FALSE)
legend("bottomright", 
       legend = c("Theoretical", "Empirical"), 
       col = c("blue", "red"), lwd = 2)

3.3 Distribution: Binom(25, 0.8)

3.3.1 Chart for generated values

show/hide
plot(sample2, main = "Values of Binom(25, 0.8)",
     xlab = "Index", 
     ylab = "Value", 
     col = "darkgreen")

3.3.2 Probability function (empirical vs theoretical)

show/hide
empirical_sample2 <- table(factor(sample2, levels = x_range)) / M
theoretical_sample2 <- dbinom(x_range, size = n, prob = 0.8)
show/hide
plot(x_range, theoretical_sample2, type = "h", lwd = 2, 
     col = "darkgreen", 
     main = "Probability Binom(25, 0.8)", 
     xlab = "k", ylab = "P(X=k)")
points(x_range, empirical_sample2, col = "red", pch = 16)
legend("topleft", 
       legend = c("Theoretical", "Empirical"), 
       col = c("darkgreen", "red"), 
       lwd = c(2, NA), pch = c(NA, 16))

3.3.3 Cumulative distribution function (empirical vs theoretical)

show/hide
plot(x_range, pbinom(x_range, size = n, prob = 0.8), 
     type = "s", lwd = 2, col = "darkgreen", 
     main = "Cumulative distribution function Binom(25, 0.8)",
     xlab = "x", ylab = "F(x)")
lines(ecdf(sample2), col = "red",
      verticals = TRUE, do.points = FALSE)
legend("bottomright", 
       legend = c("Theoretical", "Empirical"), 
       col = c("darkgreen", "red"), lwd = 2)

4 Impact of sample size on statistical distribution

  1. For the binomial distribution Binom(25, 0.2), generate three random samples consisting of M = 100, 1000 and 10000 samples.
  2. For individual samples, plot empirical and theoretical probability functions, as well as empirical and theoretical cumulative distribution functions. Scale the x-axes from 0 to 25.
  3. In all cases, calculate empirical means and variances. Compare them with each other and with theoretical values for the Binom(25, 0.2) distribution.
show/hide
set.seed(42)
M_1 <- 100
M_2 <- 1000
M_3 <- 10000
n <- 25
p <- 0.2
x_range <- 0:25

4.1 Generating random samples

show/hide
sample_100 <- rbinom(M_1, n, p)
sample_1000 <- rbinom(M_2, n, p)
sample_10000 <- rbinom(M_3, n, p)

4.2 Function charts: probability and cumulative distribution (empirical and theoretical)

show/hide
theoretical_prob_mass_func <- dbinom(x_range, n, p)
theoretical_cum_dist_func <- pbinom(x_range, n, p)

4.2.1 For M = 100

show/hide
emp_pmf_100 <- table(factor(sample_100, levels = x_range)) / M_1
show/hide
plot(x_range, theoretical_prob_mass_func, 
     type = "h", lwd = 2, col = "blue", 
     main = "Probability function (M=100)", 
     xlab = "k", ylab = "P")
points(x_range, as.numeric(emp_pmf_100), 
       col = "red", pch = 16)
legend("topright", 
       legend = c("Theoretical", "Empirical"),
       col = c("blue", "red"), lwd = 2)

show/hide
plot(x_range, theoretical_cum_dist_func, type = "s", 
     lwd = 2, col = "blue", 
     main = "Cumulative distribution function (M=100)", 
     xlab = "x", ylab = "F")
lines(ecdf(sample_100), col = "red", 
      verticals = TRUE, do.points = FALSE)
legend("bottomright", 
       legend = c("Theoretical", "Empirical"), 
       col = c("blue", "red"), lwd = 2)

4.2.2 For M = 1000

show/hide
emp_pmf_1000 <- table(factor(sample_1000, levels = x_range)) / M_2
show/hide
plot(x_range, theoretical_prob_mass_func, type = "h", 
     lwd = 2, col = "blue", 
     main = "Probability function (M=1000)", 
     xlab = "k", ylab = "P")
points(x_range, as.numeric(emp_pmf_1000), col = "red", pch = 16)
legend("topright", legend = c("Theoretical", "Empirical"), 
       col = c("blue", "red"), lwd = 2)

show/hide
plot(x_range, theoretical_cum_dist_func, type = "s", 
     lwd = 2, col = "blue", 
     main = "Cumulative distribution function (M=1000)", 
     xlab = "x", ylab = "F")
lines(ecdf(sample_1000), col = "red", 
      verticals = TRUE, do.points = FALSE)
legend("bottomright", 
       legend = c("Theoretical", "Empirical"), 
       col = c("blue", "red"), lwd = 2)

4.2.3 For M = 10000

show/hide
emp_pmf_10000 <- table(factor(sample_10000, levels = x_range)) / M_3
show/hide
plot(x_range, theoretical_prob_mass_func, type = "h", 
     lwd = 2, col = "blue",
     main = "Probability function (M=10000)", 
     xlab = "k", ylab = "P")
points(x_range, as.numeric(emp_pmf_10000), 
       col = "red", pch = 16)
legend("topright", legend = c("Theoretical", "Empirical"), 
       col = c("blue", "red"), lwd = 2)

show/hide
plot(x_range, theoretical_cum_dist_func, type = "s", 
     lwd = 2, col = "blue",
     main = "Cumulative distribution function (M=10000)", 
     xlab = "x", ylab = "F")
lines(ecdf(sample_10000), col = "red", 
      verticals = TRUE, do.points = FALSE)
legend("bottomright", legend = c("Theoretical", "Empirical"), 
       col = c("blue", "red"), lwd = 2)

4.3 Calculating and comparing parameters

show/hide
# Theoretical values
theo_mean <- n * p
theo_var  <- n * p * (1 - p)

# Empirical values
mean_100 <- mean(sample_100)
var_100  <- var(sample_100)

mean_1000 <- mean(sample_1000)
var_1000  <- var(sample_1000)

mean_10000 <- mean(sample_10000)
var_10000  <- var(sample_10000)

result_table <- data.frame(
  "Type" = c("Theory", "M = 100", "M = 1000", "M = 10000"),
  "Mean" = c(theo_mean, mean_100, mean_1000, mean_10000),
  "Variance" = c(theo_var, var_100, var_1000, var_10000)
)

result_table
       Type   Mean Variance
1    Theory 5.0000 4.000000
2   M = 100 5.1400 4.747879
3  M = 1000 4.8790 4.030389
4 M = 10000 5.0092 4.084924

Based on the table above, it can be observed that as the sample size increases, the empirical mean and variance tend toward the theoretical mean and variance. It should be assumed that a larger sample size will allow for more accurate calculations.