Department of Statistics, Faculty of Applied Sciences, University of Sri Jayewardenepura, Sri Lanka
2025-06-12
(Based on 2019 data)
Favorable climate that speed up the mosquito life cycle
Warm temperatures year-round
High humidity and frequent rainfall
No harsh winters
Many equatorial countries have rapidly growing urban areas with poor waste and water management.
People often live and work in open environments with limited protection, increasing exposure.
Sri Lanka
library(dplyr)
library(ggplot2)
library(lubridate)
library(tsibble)
library(tidyverse)
library(lubridate)
library(plotly)
# Step 1: Aggregate weekly cases across all districts
data("srilanka_weekly_data")
srilanka_weekly_data <- srilanka_weekly_data[-which(srilanka_weekly_data$year == 2023 & srilanka_weekly_data$week == 52), ]
srilanka_weekly_data[which(srilanka_weekly_data$start.date == "12/26/2020"), ]$cases <- c(35, 17, 18, 20, 2, 0, 3, 2, 7, 6, 1, 1, 0, 0, 208, 0, 0, 12, 8, 4, 0, 0, 0, 2, 3, 2)
country_weekly <- srilanka_weekly_data |>
group_by(year, week, start.date) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop') |>
arrange(start.date)
# Step 2: Plot the time series
country_weekly <- country_weekly |>
mutate(
yearweek = yearweek(start.date)) |>
distinct(yearweek, .keep_all = TRUE)
country_weekly_tsibble <- country_weekly |>
as_tsibble(index = yearweek)
p1 <- ggplot(country_weekly_tsibble, aes(x = yearweek, y = total_cases)) +
geom_line() +
scale_x_yearweek(date_breaks = "1 year", date_labels = "%Y") +
labs(
# title = "Weekly Dengue Cases in Sri Lanka",
# subtitle = "From National Aggregated Data",
x = "Year",
y = "Weekly Dengue Cases"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold")
)
ggplotly(p1)Red Segment: Interrupted Period Due to the COVID-19 Pandemic
denguedatahub R packagedenguedatahub R packagelink: https://denguedatahub.netlify.app/
Approach 1
Excluding the interrupted period from model training
Approach 2
Forecasting the interrupted period first and then using it for modeling
Approach 3
Down-weighting interrupted period
Benchmark
#data("srilanka_weekly_data")
df <- srilanka_weekly_data |>
mutate(yearweek = yearweek(start.date))
duplicaterws <- df |>
mutate(
yearweek = yearweek(start.date)) |>
duplicates(key = district, index = yearweek)
df_tsibble <- df |>
distinct(district, yearweek, .keep_all = TRUE) |>
as_tsibble(index = yearweek, key = district)df_tsibble |> head()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2006 52 12/23/2006 12/29/2006 Ampara 0 2006 W51
2 2007 1 12/30/2006 1/5/2007 Ampara 0 2006 W52
3 2007 2 1/6/2007 1/12/2007 Ampara 0 2007 W01
4 2007 3 1/13/2007 1/19/2007 Ampara 0 2007 W02
5 2007 4 1/20/2007 1/26/2007 Ampara 0 2007 W03
6 2007 5 1/27/2007 2/2/2007 Ampara 0 2007 W04df_tsibble |> tail()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2025 6 1/25/2025 1/31/2025 Vavuniya 3 2025 W04
2 2025 7 2/1/2025 2/7/2025 Vavuniya 2 2025 W05
3 2025 8 2/8/2025 2/14/2025 Vavuniya 2 2025 W06
4 2025 9 2/15/2025 2/21/2025 Vavuniya 0 2025 W07
5 2025 10 2/22/2025 2/28/2025 Vavuniya 5 2025 W08
6 2025 11 3/1/2025 3/7/2025 Vavuniya 0 2025 W09Training set
train_tsibble <- df_tsibble |>
filter(year(yearweek) < 2025)
train_tsibble
# A tsibble: 24,466 x 7 [1W]
# Key: district [26]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2006 52 12/23/2006 12/29/2006 Ampara 0 2006 W51
2 2007 1 12/30/2006 1/5/2007 Ampara 0 2006 W52
3 2007 2 1/6/2007 1/12/2007 Ampara 0 2007 W01
4 2007 3 1/13/2007 1/19/2007 Ampara 0 2007 W02
5 2007 4 1/20/2007 1/26/2007 Ampara 0 2007 W03
6 2007 5 1/27/2007 2/2/2007 Ampara 0 2007 W04
7 2007 6 2/3/2007 2/9/2007 Ampara 0 2007 W05
8 2007 7 2/10/2007 2/16/2007 Ampara 0 2007 W06
9 2007 8 2/17/2007 2/23/2007 Ampara 0 2007 W07
10 2007 9 2/24/2007 3/2/2007 Ampara 1 2007 W08
# ℹ 24,456 more rowsTest set
test_tsibble <- df_tsibble |>
filter(year(yearweek) == 2025)
test_tsibble
# A tsibble: 234 x 7 [1W]
# Key: district [26]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2025 3 1/4/2025 1/10/2025 Ampara 6 2025 W01
2 2025 4 1/11/2025 1/17/2025 Ampara 7 2025 W02
3 2025 5 1/18/2025 1/24/2025 Ampara 3 2025 W03
4 2025 6 1/25/2025 1/31/2025 Ampara 4 2025 W04
5 2025 7 2/1/2025 2/7/2025 Ampara 3 2025 W05
6 2025 8 2/8/2025 2/14/2025 Ampara 3 2025 W06
7 2025 9 2/15/2025 2/21/2025 Ampara 5 2025 W07
8 2025 10 2/22/2025 2/28/2025 Ampara 4 2025 W08
9 2025 11 3/1/2025 3/7/2025 Ampara 0 2025 W09
10 2025 3 1/4/2025 1/10/2025 Anuradhapura 20 2025 W01
# ℹ 224 more rows# A mable: 26 x 2
# Key: district [26]
district arima
<chr> <model>
1 Ampara <ARIMA(0,1,2)>
2 Anuradhapura <ARIMA(0,1,4)>
3 Badulla <ARIMA(2,1,3)(0,0,1)[52]>
4 Batticaloa <ARIMA(2,1,3)(0,0,1)[52]>
5 Colombo <ARIMA(0,1,2)(0,0,1)[52]>
6 Galle <ARIMA(0,1,3)(0,0,1)[52]>
7 Gampaha <ARIMA(0,1,2)>
8 Hambanthota <ARIMA(0,1,1)(0,0,1)[52]>
9 Jaffna <ARIMA(0,1,5)>
10 Kalmune <ARIMA(0,1,0)>
# ℹ 16 more rows
train_tsibble_excludecovid |> head()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2023 2 1/7/2023 1/13/2023 Ampara 7 2023 W01
2 2023 3 1/14/2023 1/20/2023 Ampara 8 2023 W02
3 2023 4 1/21/2023 1/27/2023 Ampara 1 2023 W03
4 2023 5 1/28/2023 2/3/2023 Ampara 1 2023 W04
5 2023 6 2/4/2023 2/10/2023 Ampara 7 2023 W05
6 2023 7 2/11/2023 2/17/2023 Ampara 1 2023 W06
train_tsibble_excludecovid |> tail()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2024 49 11/23/2024 11/29/2024 Vavuniya 3 2024 W47
2 2024 50 11/30/2024 12/6/2024 Vavuniya 2 2024 W48
3 2024 51 12/7/2024 12/13/2024 Vavuniya 4 2024 W49
4 2024 52 12/14/2024 12/20/2024 Vavuniya 4 2024 W50
5 2025 1 12/21/2024 12/27/2024 Vavuniya 2 2024 W51
6 2025 2 12/28/2024 1/3/2025 Vavuniya 4 2024 W52# A mable: 26 x 2
# Key: district [26]
district arima
<chr> <model>
1 Ampara <ARIMA(1,0,3) w/ mean>
2 Anuradhapura <ARIMA(0,0,3) w/ mean>
3 Badulla <ARIMA(1,0,1) w/ mean>
4 Batticaloa <ARIMA(0,1,0)>
5 Colombo <ARIMA(3,0,1) w/ mean>
6 Galle <ARIMA(0,1,1)>
7 Gampaha <ARIMA(0,1,1)>
8 Hambanthota <ARIMA(0,1,1)>
9 Jaffna <ARIMA(1,0,2)>
10 Kalmune <ARIMA(3,1,0)>
# ℹ 16 more rows
Step 1: Forecasting interrupted period: training and test sets
Step 2: Fit models
Step 3: Generate forecasts for interrupted period
Step 4: Replace interrupted period with forecasts
train_tsibble_updated
# A tsibble: 24,466 x 8 [1W]
# Key: district [26]
year week start.date end.date district yearweek .model cases
<dbl> <dbl> <chr> <chr> <chr> <week> <chr> <dbl>
1 2006 52 12/23/2006 12/29/2006 Ampara 2006 W51 <NA> 0
2 2007 1 12/30/2006 1/5/2007 Ampara 2006 W52 <NA> 0
3 2007 2 1/6/2007 1/12/2007 Ampara 2007 W01 <NA> 0
4 2007 3 1/13/2007 1/19/2007 Ampara 2007 W02 <NA> 0
5 2007 4 1/20/2007 1/26/2007 Ampara 2007 W03 <NA> 0
6 2007 5 1/27/2007 2/2/2007 Ampara 2007 W04 <NA> 0
7 2007 6 2/3/2007 2/9/2007 Ampara 2007 W05 <NA> 0
8 2007 7 2/10/2007 2/16/2007 Ampara 2007 W06 <NA> 0
9 2007 8 2/17/2007 2/23/2007 Ampara 2007 W07 <NA> 0
10 2007 9 2/24/2007 3/2/2007 Ampara 2007 W08 <NA> 1
# ℹ 24,456 more rowsStep 5: Use updated training set for forecasting
tb_all_ARIMA_updatedcovid
# A mable: 26 x 2
# Key: district [26]
district arima
<chr> <model>
1 Ampara <ARIMA(0,1,2)(0,0,2)[52]>
2 Anuradhapura <ARIMA(3,1,3)>
3 Badulla <ARIMA(1,1,4)>
4 Batticaloa <ARIMA(0,1,1)(0,0,1)[52]>
5 Colombo <ARIMA(0,1,3)(0,0,1)[52]>
6 Galle <ARIMA(0,1,3)(0,0,1)[52]>
7 Gampaha <ARIMA(0,1,2)>
8 Hambanthota <ARIMA(0,1,1)(0,0,1)[52]>
9 Jaffna <ARIMA(1,1,2)>
10 Kalmune <ARIMA(0,1,0)>
# ℹ 16 more rowsBenchmark: Without considering the interruption effect
Approach 1: Excluding the interrupted period from model training
Approach 2: Forecasting the interrupted period first and then using it for modeling
\[forecast_{Approach 3} = 0.7 \times forecast_{\text{excluded}} + 0.3 \times forecast_{\text{included without extimating}}\]
Output only for fcb_accuracy for illustration
fcb_accuracy
# A tibble: 26 × 11
.model district .type ME RMSE MAE MPE MAPE MASE RMSSE
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 arima Ampara Test 0.419 1.95 1.50 -Inf Inf NaN NaN
2 arima Anuradhapura Test 4.98 9.37 6.78 12.0 22.8 NaN NaN
3 arima Badulla Test 12.9 15.2 12.9 64.5 66.1 NaN NaN
4 arima Batticaloa Test -0.00937 7.56 5.14 -1.35 7.95 NaN NaN
5 arima Colombo Test 38.4 43.3 38.4 15.2 15.2 NaN NaN
6 arima Galle Test 21.1 25.8 22.1 38.6 43.3 NaN NaN
7 arima Gampaha Test -62.5 68.1 62.5 -45.4 45.4 NaN NaN
8 arima Hambanthota Test -7.47 10.8 9.77 -59.8 66.7 NaN NaN
9 arima Jaffna Test -20.4 22.6 20.4 -76.0 76.0 NaN NaN
10 arima Kalmune Test -41 41.2 41 -355. 355. NaN NaN
# ℹ 16 more rows
# ℹ 1 more variable: ACF1 <dbl>Benchmark <- fcb_accuracy$RMSE
Excluded <- fc_a1_accuracy$RMSE
UpdatedCovid <- fc_a2_accuracy$RMSE
rmse_by_district <- fc_a3 |>
group_by(district) |>
summarise(RMSE = sqrt(mean((actual - predicted)^2)), .groups = "drop")
Downweight <- rmse_by_district$RMSE
rmse_tbl <- tibble(Benchmark,Excluded, UpdatedCovid, Downweight)# A tibble: 26 × 4
Benchmark Excluded UpdatedCovid Downweight
<dbl> <dbl> <dbl> <dbl>
1 1.95 2.13 2.12 2.12
2 9.37 12.4 8.73 11.6
3 15.2 9.97 11.6 9.68
4 7.56 7.87 8.57 7.72
5 43.3 42.4 43.9 26.4
6 25.8 23.8 24.6 24.2
7 68.1 32.6 70.0 42.2
8 10.8 11.1 10.8 11.2
9 22.6 8.64 22.3 8.56
10 41.2 29.6 41.2 33.1
# ℹ 16 more rows
| Approach | mean_RMSE | median_RMSE | sd_RMSE | min_RMSE | max_RMSE |
|---|---|---|---|---|---|
| Benchmark | 15.4 | 10.3 | 15.6 | 1.95 | 68.1 |
| Downweight | 12.4 | 10.1 | 10.2 | 1.66 | 42.2 |
| Excluded | 12.6 | 10.0 | 10.4 | 1.56 | 42.4 |
| UpdatedCovid | 15.6 | 10.4 | 15.8 | 2.12 | 70.0 |
| Approach | mean_MAE | median_MAE | sd_MAE | min_MAE | max_MAE |
|---|---|---|---|---|---|
| Benchmark | 13.7 | 8.14 | 14.65 | 1.50 | 62.5 |
| Downweight | 10.6 | 8.62 | 9.31 | 1.46 | 36.7 |
| Excluded | 10.8 | 8.72 | 9.33 | 1.15 | 35.5 |
| UpdatedCovid | 13.9 | 8.35 | 14.87 | 1.74 | 64.5 |
| Approach | mean_ME | median_ME | sd_ME | min_ME | max_ME |
|---|---|---|---|---|---|
| Benchmark | -4.33 | -1.82 | 19.2 | -62.5 | 38.4 |
| Downweight | -4.29 | -2.69 | 11.6 | -34.2 | 19.1 |
| Excluded | -4.21 | -1.76 | 11.3 | -29.3 | 18.5 |
| UpdatedCovid | -5.00 | -2.73 | 19.2 | -64.5 | 39.2 |
# A tibble: 26 × 6
Benchmark Excluded UpdatedCovid Downweight Best District
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 1.95 2.13 2.12 2.12 Benchmark Ampara
2 9.37 12.4 8.73 11.6 UpdatedCovid Anuradhapura
3 15.2 9.97 11.6 9.68 Downweight Badulla
4 7.56 7.87 8.57 7.72 Benchmark Batticaloa
5 43.3 42.4 43.9 26.4 Downweight Colombo
6 25.8 23.8 24.6 24.2 Excluded Galle
7 68.1 32.6 70.0 42.2 Excluded Gampaha
8 10.8 11.1 10.8 11.2 Benchmark Hambanthota
9 22.6 8.64 22.3 8.56 Downweight Jaffna
10 41.2 29.6 41.2 33.1 Excluded Kalmune
# ℹ 16 more rows
library(sf)
rmse_tbl <- rmse_tbl |>
mutate(join_key = substr(District, 1, 5))
lka_adm2 <- lka_adm2 |>
mutate(join_key = substr(NAME, 1, 5))
# left join to preserve sf geometry from lka_adm2
combined_sf <- lka_adm2 %>%
left_join(rmse_tbl, by = "join_key")
p1 <- ggplot(combined_sf ) +
geom_sf(aes(fill=Best)) + scale_fill_brewer(palette = "Dark2")
p1 district trend_strength seasonal_strength_year seasonal_peak_year
1 Ampara 0.468 0.215 33
2 Anuradhapura 0.455 0.226 30
3 Badulla 0.410 0.297 31
4 Batticaloa 0.502 0.369 5
5 Colombo 0.588 0.284 32
6 Galle 0.670 0.271 33
7 Gampaha 0.502 0.231 31
8 Hambanthota 0.619 0.233 32
9 Jaffna 0.407 0.361 6
10 Kalmune 0.427 0.226 27
11 Kalutara 0.560 0.241 31
12 Kandy 0.555 0.277 33
13 Kegalle 0.522 0.228 32
14 Kilinochchi 0.430 0.321 6
15 Kurunegala 0.579 0.252 31
16 Mannar 0.240 0.264 5
17 Matale 0.453 0.224 50
18 Matara 0.558 0.293 32
19 Monaragala 0.670 0.211 3
20 Mullaitivu 0.362 0.242 7
21 NuwaraEliya 0.372 0.274 33
22 Polonnaruwa 0.434 0.213 30
23 Puttalam 0.631 0.303 6
24 Ratnapura 0.349 0.279 30
25 Trincomalee 0.372 0.228 6
26 Vavuniya 0.328 0.194 2
seasonal_trough_year spikiness linearity curvature stl_e_acf1 stl_e_acf10
1 19 1.33e-02 36.74 -31.12 0.570 0.880
2 43 1.16e+00 92.47 -65.46 0.611 1.269
3 18 6.86e+00 207.97 -110.05 0.722 1.111
4 41 1.44e+01 511.68 -279.78 0.892 2.890
5 18 5.79e+03 1453.53 -1743.49 0.872 2.636
6 18 4.89e+00 486.11 -173.42 0.686 1.573
7 11 2.15e+04 1123.86 -939.40 0.893 2.236
8 18 1.75e-01 193.00 -80.41 0.617 1.352
9 39 5.36e+02 783.94 -268.39 0.889 2.411
10 42 3.22e+01 273.68 -167.21 0.835 1.572
11 11 1.69e+02 665.38 -368.87 0.641 1.169
12 18 5.73e+02 952.37 -311.29 0.870 2.601
13 18 1.50e+02 301.47 -244.02 0.801 2.906
14 41 4.14e-03 46.04 -15.71 0.624 1.288
15 18 6.58e+01 296.07 -393.57 0.768 2.381
16 38 5.16e-02 30.20 -17.08 0.599 0.778
17 18 3.52e+00 199.38 -5.76 0.837 2.206
18 13 2.56e+01 282.07 -213.84 0.745 1.716
19 18 5.34e-01 126.58 -49.38 0.530 0.960
20 39 6.92e-04 26.02 -16.52 0.532 0.874
21 15 2.49e-02 36.13 -35.56 0.637 1.270
22 41 9.12e-02 60.36 -42.67 0.510 0.602
23 18 8.88e+00 328.41 -206.74 0.752 1.664
24 18 3.24e+02 370.55 -256.71 0.744 1.381
25 38 5.02e+01 301.66 -173.85 0.906 2.668
26 22 1.64e+00 -6.68 -47.94 0.858 1.739
No single method performs best across all districts.
The effectiveness of each approach depends not only on the model architecture but also on the nature of interruption and historical patterns unique to each district.
Beyond improving model performance, this can provide insight into the stability and nature of the time series signal.
Hyndman, R. J., & Rostami-Tabar, B. (2025). Forecasting interrupted time series. Journal of the Operational Research Society, 76(4), 790-803.
Slides are available at: https://thiyangt.github.io/RMedicine2025/#/title-slide
Link to denguedatahub: https://denguedatahub.netlify.app/
Contact: ttalagala@sjp.ac.lk