#!/usr/bin/perl
use strict;
use warnings;
#use Text::CSV;
use DBI;
use Scalar::Util qw(looks_like_number);
use List::Util qw(min);
use List::Util qw(max);
use List::Util qw(any);
use List::MoreUtils qw(uniq);
use Date::Calc qw(Add_Delta_YM);
use DateTime;
use DateTime::Duration;
use Statistics::LineFit;
#
#
#
my $debug = 0;
#
# Statistical processing for noisy data
#
my $adj_confidence = 0; # Adjust trendline slope according to R2 confidence
my $smooth_baseline = 1; # Smooth the deaths in the trend baseline
my $smooth_auto = 2; # Automaticaly smooth the raw data in certain demographics (SCHOOL, CHILDREN), certain diseases (influenza) and if the pop is small 0 = Never. 1 = Auto 2 = All
## Noise reduction parameters
#
my $signal_proc_raw = "raw"; # No signal processing
#my $signal_proc_cooked = "ma"; # Can be "ma" or "loess" currently
my $signal_proc_cooked = "loess"; # Can be "ma" (moving average) or "loess" (my loess algorithm)
#
# Moving Average Smoother parameters
#
my $smooth_window_trends = 2; # Size of the moving average window for smoothing trends
#(na if $smooth_baseline is 0 or the base death data has already been smoothed
my $smooth_window_deaths = 2; # Size of the smoothing window to apply to the base death data
#
# Loess Smoother parameters
my $loess_span = 5; # Adjust this value to change the span of smoothing
my $loess_degree = 1; # Adjust this value to change the degree of smoothing
my $loess_span_trends = 3; # Adjust this value to change the span of smoothing
my $loess_degree_trends = 1; # Adjust this value to change the degree of smoothing
#
####################################
#
#
# Allocate 999s and C99s
#
my $allocate_99s = 1;
#
# Basic reporting lag -- what percentage of death certificates are received from the states to the CDC
# my month of report. I.e. deaths that are only 1 month old are 50% complete. 2 months are 70% etc.
#
my $est_window = 18; # Mortality numbers within 18 months are flagged as estimates
my @reporting_lag = (0.50,0.70,0.80,0.85,0.90,0.95); # Beyond end of reporting_lag assumed to be 100% reported
#
# Trend points. These set the minimium number of trend points necessary to establish a trend
# As well as a warning threshold
#
my $minTrendPoints = 3;
my $minTrendPointsWARN = 4;
#
# Main internal data structures. Death data is read from disk (mysql database)
# and then stored here in these hashes. Each hash has several dimensions
#
#
my %deaths;
my %population;
# DEMO START
# Demographics (age ranges only at this point)
# These hashes map age demographic codes as they come from WONDER (@age_codes) into normalized
# codes that are used internally (@display_codes)
#
my @display_codes = ("< 1","1-4","0-17","5-14","5-17","15-24","18-24","25-34","35-44","45-54","55-64","65-74","75-84","85+","ALL AGES");
my @nondistinct_codes = ("< 1","1-4","5-14","5-17","15-24","ALL AGES");
my @age_codes = ("< 1 year","1-4 years","0-17 years","5-14 years","5-17 years","15-24 years","18-24 years","25-34 years","35-44 years","45-54 years",
"55-64 years","65-74 years","75-84 years","85+ years","ALL AGES");
my %code_map;
my %code_map_r;
@code_map{@display_codes} = @age_codes;
@code_map_r{@age_codes} = @display_codes;
#
# Internal demographic categories
# ALL -- All ages
# OAF -- OAF (Old And Frail or Old As Fuck) 75+ years
# MAL -- MAL (Middle Age Living or Middle Age Loser) 45-74 years
# YAH -- YAH (Young And Healthy or Young And Horrible)) 18-44 years
# SCHOOL -- SCHOOL aged children. 5-17 years
# CHILD -- Children. 0-17 years
#
my @ALLDemo = all_distinct_ages();
my @CHILDDemo = ("0-17");
my @SCHOOLDemo = ("5-17");
my @YAHDemo = ("18-24","25-34","35-44");
my @MALDemo = ("45-54","55-64","65-74");
my @OAFDemo = ("75-84","85+");
#
# First two characters are sort order for output and not returned by demoNames
#
my %DEMOSets = ("01SCHOOL" => \@SCHOOLDemo,
"02CHILD" => \@CHILDDemo,
"03YAH" => \@YAHDemo,
"04MAL" => \@MALDemo,
"05OAF" => \@OAFDemo,
"99ALL" => \@ALLDemo);
#
#
#
my $dsn = "DBI:mysql:PHI_DEATH";
my $username = "xxxx";
my $password = "xxxxx";
my %db_attr = (PrintError=>0,RaiseError=>1);
my $dbh = DBI->connect($dsn,$username,$password,\%db_attr);
#my $csv = Text::CSV->new({
#sep_char => ',',
# sep_char => "\t",
# binary => 1,
# auto_diag => 1
#});
#
sub demoNames {
my @names;
for my $name (keys %DEMOSets) {
push(@names,$name);
}
@names = sort(@names);
s/^\d\d// for (@names);
return(@names);
}
sub demoSet {
my ($demo) = @_;
my @set;
my $demomatch;
die "NO DEMO IN demoSet" if(not defined($demo));
for my $demoset (keys %DEMOSets) {
$demomatch = $demoset;
$demoset =~ s/^\d\d//;
last if($demo eq $demoset);
undef $demomatch;
}
die "No demoSet named $demo" if(not defined($demomatch));
my $demoSets = $DEMOSets{$demomatch};
@set = @{$demoSets};
return(@set);
}
sub all_distinct_ages {
my @ages;
for my $display_code (@display_codes) {
if (! ( grep( /^$display_code$/, @nondistinct_codes))) {
push(@ages,$display_code);
}
}
if($debug > 2) {
foreach my $distinct_age (@ages) {
print "Distinct Age: $distinct_age\n";
}
}
return(@ages);
}
#
# DEMO END
#
#
#
my @monthnames = ("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec","YEAR");
my @monthseason = (6,6,6,2,2,2,5,5,5,4,4,4,8);
my $be = 2019; # end of base period
my $nbyears = 5; # Default base span. Should be odd
#
# Names of all the external and internal tables
# External tables are read from Mysql into the hashes mentioned above
#
# Real tables come from the database itself
# Synthetic tables are constructed internally from real tables -- usually by merging two or more tables into one synthetic
# or by subtracting a real table from another real table
# (example subtract COVID deaths from deaths from disease to
# get deaths from all diseases OTHER than COVID
# Canttrend_tables are tables for which we cannot compute a trend. Usually because they are a disease that did not exist in
# the baseline years (e.g. COVID)
#
my %real_tables; # Tables read out of the database
my %canttrend_tables; # Tables read out of the database that can't be trended
my %synthetic_tables; # Tables we make here (i.e. difference tables)
if(1==1) {
%canttrend_tables = ("01deaths_covid_mcd" => 2,
"90deaths_999_ucd" => 0);
%real_tables = ("00deaths_all" => 1, # Should be first as it is used to fill in missing pop info for other tables
"01deaths_influenza_mcd" => 2,
"01deaths_disease_mcd" => 2,
"01deaths_disease_ucd" => 2,
"01deaths_nondisease_ucd" => 2,
"02deaths_cancer_ucd" => 2,
"02deaths_circulatory_ucd" => 2,
"02deaths_respiratory_ucd" => 2,
"02deaths_infectparasite_ucd" => 2,
"02deaths_blood_ucd" => 2,
"02deaths_drugalcoholod_ucd" => 2,
"02deaths_drugod_ucd" => 2,
"02deaths_alcoholod_ucd" => 2,
"02deaths_suicide_ucd" => 2,
"90deaths_r99_ucd",0);
%synthetic_tables = ("99deaths_nondisease_mdrugalcohol" => 4,
"99deaths_all_mdrugalcohol" => 4,
"99deaths_disease_mcovid" => 2);
} else {
%real_tables = ("01deaths_disease_ucd" => 2,
"01deaths_nondisease_ucd" => 2,
"90deaths_r99_ucd",0);
%canttrend_tables = ("01deaths_covid_mcd" => 2,
"90deaths_999_ucd" => 0);
}
#if(($#ARGV < 1) || ($#ARGV > 4)) {
# die "usage: compute_trend.pl [--shellvar| [location] [baseline year end] [number of years comprising baseline]]\n";
#}
#
# Location (i.e. USA or individual state)
#
my $ofile = "USA";
my $syear = $ARGV[0];
sub tablenames {
my %hash = @_;
my @tkeys = sort keys(%hash);
for (@tkeys) { s/^\d\d// };
return(@tkeys);
}
if($syear eq "--shellvar") {
print "REAL_TABLES=\"";
my $d1 = 0;
foreach my $t (tablenames(%real_tables)) {
if($d1 > 0) {
print " ";
}
$d1 = 1;
print "$t";
}
print "\"\n";
print "CANTTREND_TABLES=\"";
$d1 = 0;
foreach my $t (tablenames(%canttrend_tables)) {
if($d1 > 0) {
print " ";
}
$d1 = 1;
print "$t";
}
print "\"\n";
print "SYNTHETIC_TABLES=\"";
$d1 = 0;
foreach my $t (tablenames(%synthetic_tables)) {
if($d1 > 0) {
print " ";
}
$d1 = 1;
print "$t";
}
print "\"\n";
exit;
}
my $eyear = $ARGV[1];
if($#ARGV >= 2) {
$ofile = $ARGV[2];
}
my $location = $ofile;
$location =~ s/_/ /g;
#
# Baseline setup
#
if($#ARGV >= 3) {
$be = $ARGV[3];
}
if($#ARGV >= 4) {
$nbyears = $ARGV[4];
}
my $bs = $be - ($nbyears-1);
my $launchyr = $bs;
if(($syear < 1999) || ($syear > 2024) || ($eyear < 1999) || ($eyear > 2024) || ($eyear < $syear)) {
die "years must be between 1999 and 2024 and start year must be less than or equal to end year";
}
#
# Get months
# Return the number of months between the current date and a given year and month
#
sub get_months_ago {
my ($target_year,$target_month) = @_;
if($target_month == 13) {
$target_month = 1;
}
my $current_date = DateTime->now;
my $target_dt = DateTime->new(
year => $target_year,
month => $target_month,
day => 1, # Set day to 1 to represent the 1st day of the target month
);
# my $ddifference = $current_date->subtract_datetime($target_dt);
my $ddifference = $current_date - $target_dt;
my $md = $ddifference->in_units('months');
return($md);
}
#
#
# Merge 99s
#
# This routine attempts to control for some of the reporting lag by pre-emptively merging both 999 reports and
# ICD-10 R99 category death certificates (as indicated on the underlying cause of death)
# It allocates them to either the category of natural death (i.e. death from disease) or
# to unnatural death (accidents, homicides, etc.) by perentage according to the age of the R99s/999s
#
#
sub allocate_99s {
my ($receiver_table, $receiver_table_type) = @_;
my @c999schedule = (99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99); # 999 should not extend beyond six months
my @cR99schedule = (85, 80, 75, 65, 50, 30, 20, 10, 0, 0, 0, 0, 0, 0, 0, 0); # R99 are 100% allocated to disease beyond 7 months
my $t1 = "deaths_999_ucd";
my $t2 = "deaths_r99_ucd";
#
# First let's do the 999s
#
foreach my $year (sort { $a <=> $b } keys %{$deaths{$receiver_table}}) {
foreach my $month (sort { $a <=> $b } keys %{$deaths{$receiver_table}{$year}}) {
my $avg_c999 = 0;
my $avg_cR99 = 0;
if($month == 13) {
foreach my $val (@c999schedule) {
$avg_c999 = $avg_c999 + $val;
}
$avg_c999 = $avg_c999 / scalar(@c999schedule);
foreach my $val (@cR99schedule) {
$avg_cR99 = $avg_cR99 + $val;
}
$avg_cR99 = $avg_cR99 / scalar(@cR99schedule);
}
foreach my $age_code (sort keys %{$deaths{$receiver_table}{$year}{$month}}) {
next if($age_code =~ /Not Stated/);
my $t999deaths;
my $t999pop;
if($year >= minyear($t1)) {
$t999deaths = $deaths{$t1}{$year}{$month}{$age_code}{$signal_proc_raw};
$t999pop = $population{$t1}{$year}{$month}{$age_code};
}
my $tc99deaths;
my $tc99pop;
if($year >= minyear($t2)) {
$tc99deaths = $deaths{$t2}{$year}{$month}{$age_code}{$signal_proc_raw};
$tc99pop = $population{$t2}{$year}{$month}{$age_code};
}
my $multipliert999;
my $multipliertc99;
my $months_ago;
if($month == 13) {
$months_ago = get_months_ago($year,1);
if($months_ago >= 24) {
$multipliert999 = 100;
$multipliertc99 = 0;
} else {
$multipliert999 = $avg_c999;
$multipliertc99 = $avg_cR99;
}
} else {
$months_ago = get_months_ago($year,$month);
$multipliert999 = $c999schedule[min($months_ago,$#c999schedule)];
$multipliertc99 = $cR99schedule[min($months_ago,$#cR99schedule)];
}
if($receiver_table_type eq "DISEASE") {
$multipliert999 = 100-$multipliert999;
$multipliertc99 = 100-$multipliertc99;
}
my $rdeaths = $deaths{$receiver_table}{$year}{$month}{$age_code}{$signal_proc_raw};
my $rpop = $population{$receiver_table}{$year}{$month}{$age_code};
if(not defined $rpop) {
die "rpop not defined for $receiver_table year $year month $month age code $age_code. Rdeaths is $rdeaths";
}
if($debug > 1) {
if(($receiver_table eq "deaths_disease_ucd") && ($year == 2022) && ($month == 13) && ($age_code =~ /0-17 years/)) {
print "$receiver_table Year $year Month $month age code $age_code deaths $rdeaths";
}
}
if(defined $t999deaths) {
die "allocate_99s Receiver $receiver_table population ne $t1 population ($rpop/$t999pop)" if($rpop != $t999pop);
my $givto = int(($t999deaths * ($multipliert999/100)));
if($debug > 1) {
if(($receiver_table eq "deaths_disease_ucd") && ($year == 2022) && ($month == 13) && ($age_code =~ /0-17 years/)) {
print " adding $givto (out of $t999deaths) from 999";
}
}
if($givto > 0) {
$rdeaths = $rdeaths + $givto;
}
}
if(defined $tc99deaths) {
die "allocate_99s Receiver $receiver_table population ne $t2 population ($rpop/$tc99pop)" if($rpop != $tc99pop);
my $givto = int(($tc99deaths * ($multipliertc99/100)));
if($debug > 1) {
if(($receiver_table eq "deaths_disease_ucd") && ($year == 2022) && ($month == 13) && ($age_code =~ /0-17 years/)) {
print " adding $givto (out of $tc99deaths) from c99";
}
}
if($givto > 0) {
$rdeaths = $rdeaths + $givto;
}
}
if(defined($rdeaths)) {
if($debug > 1) {
if(($receiver_table eq "deaths_disease_ucd") && ($year == 2022) && ($month == 13) && ($age_code =~ /0-17 years/)) {
print " total $rdeaths\n";
}
}
$deaths{$receiver_table}{$year}{$month}{$age_code}{$signal_proc_raw} = $rdeaths;
}
}
}
}
}
sub greg_max {
my ($a,$b) = @_;
if($a >= $b) {
return($a);
}
return($b);
}
sub greg_min {
my ($a,$b) = @_;
if($a < $b) {
return($a);
}
return($b);
}
sub greg_uniq {
my @ilist = sort @_;
my @olist;
my $oname = "";
foreach my $name (@ilist) {
if($name ne $oname) {
push(@olist,$name);
}
$oname = $name;
}
return(@olist);
}
#
#
#
sub distinct_age {
my ($age_code) = @_;
if($age_code =~ /ALL AGES/) {
return(0);
}
foreach my $check_code (@nondistinct_codes) {
if($code_map{$check_code} eq $age_code) {
return(0);
}
}
return(1);
}
#
#
#
sub nicemonth {
my ($year,$month) = @_;
my $monthname = $monthnames[$month-1];
$monthname =~ s/^(.).*/$1/;
my $niceout = "\"$monthname ";
if(($month == 2)) {
$niceout .= "\\n$year";
} else {
if(($month > 3) && ($month < 9999)) {
$niceout .= "\\n -- ";
}
}
$niceout .= "\"";
return($niceout);
}
#
#
#
sub table_is_canttrend_table {
my ($table) = @_;
foreach my $ctable (tablenames(%canttrend_tables)) {
if($table =~ /$ctable/) {
return(1);
}
}
return(0);
}
sub insert_into_db {
my ($date,$location,$pop,$demo, $table,$month,$per100K,$eper100K,$rmdeaths,$mdeaths,$medeaths,$season) = @_;
my $sql = "INSERT INTO EXCESS VALUES (?,?,?,?,?,?,?,?,?,?,?,?);";
my $stmt = $dbh->prepare($sql);
print "INSERTING INTO EXCESS $date/$location/$pop/$demo/$table/$month/$rmdeaths/$mdeaths/$medeaths/\n" if($debug > 1);
$stmt->execute($date, $location, $pop, $demo, $table, $month, $per100K, $eper100K, $rmdeaths, $mdeaths, $medeaths, $season);
$stmt->finish();
}
sub signal_process {
my ($table,$year,$month,$demo) = @_;
if($smooth_auto == 0) {
return($signal_proc_raw);
}
if($smooth_auto == 2) {
return($signal_proc_cooked);
}
# Auto Determine
if($table =~ /_covid_/) {
return($signal_proc_raw);
}
if($demo =~ /CHILD|SCHOOL/) {
return($signal_proc_cooked);
}
my $demo_pop = pop_in_demo($table,$year,$month,$demo);
if(defined($demo_pop)) {
if(($demo eq "OAF") && ($demo_pop >= 2000000)) {
return($signal_proc_raw);
}
if(($demo eq "MAL") && ($demo_pop >= 6000000)) {
return($signal_proc_raw);
}
if(($demo eq "YAH") && ($demo_pop >= 10000000)) {
return($signal_proc_raw);
}
}
return($signal_proc_cooked);
}
sub display_ages_for_sql {
my @ages = @_;
my $sql;
my $first = 0;
for my $display_code (@ages) {
if($first > 0) {
$sql .= ","
}
$first = 1;
if($display_code ne "ALL AGES") {
$sql .= "\"$code_map{$display_code}\"";
}
}
return($sql);
}
#
#
# Read the base data in
#
#
sub read_data {
my ($table,$shortstop) = @_;
my $sql = "select \"$location\",year(date_taken),month(date_taken), age_code,round(sum(population),0),sum(deaths_actual)\nfrom ${table}_month\nwhere year(date_taken) <= $eyear";
if($location ne "USA_FROM_STATES") {
$sql .= "\nand location=\"$location\"";
}
$sql .= "\ngroup by 1,2,3,4";
$sql .= "\nUNION\n";
$sql .= "select \"$location\",year(date_taken),month(date_taken), \"ALL AGES\",round(sum(population),0),sum(deaths_actual)\nfrom ${table}_month\nwhere year(date_taken) <= $eyear";
$sql .= "\nand age_code in (" . display_ages_for_sql(@ALLDemo) . ")";
if($location ne "USA_FROM_STATES") {
$sql .= "\nand location=\"$location\"";
}
$sql .= "\ngroup by 1,2,3,4";
$sql .= "\nUNION\n";
$sql .= "select \"$location\",year(date_taken),13, age_code, round(sum(population),0),sum(deaths_actual)\nfrom ${table}_annual\nwhere year(date_taken) <= $eyear";
if($location ne "USA_FROM_STATES") {
$sql .= "\nand location=\"$location\"";
}
$sql .= "\ngroup by 1,2,3,4";
$sql .= "\nUNION\n";
$sql .= "select \"$location\",year(date_taken),13, \"ALL AGES\",round(sum(population),0),sum(deaths_actual)\nfrom ${table}_annual\nwhere year(date_taken) <= $eyear";
$sql .= "\nand age_code in (" . display_ages_for_sql(@ALLDemo) . ")";
if($location ne "USA_FROM_STATES") {
$sql .= "\nand location=\"$location\"";
}
$sql .= "\ngroup by 1,2,3,4 order by 1,2,3,4;";
print "***Reading TABLE: $table.\n" if($debug > 0);
my $stmt = $dbh->prepare($sql);
$stmt->execute();
#
#
#
while(my @row = $stmt->fetchrow_array()) {
my ($location,$year,$month,$age_code,$population,$deaths) = @row;
my $months_ago = get_months_ago($year,$month);
if(defined($population)) {
$population{$table}{$year}{$month}{$age_code} = $population;
}
if(defined($deaths)) {
#
# Adjust for lag
#
if($month == 13) {
#
# Adjust annual totals upwards if needed
#
# Months ago is since the beginning of $year
my $have_months;
$have_months = $months_ago + 1;
my $short_magic = 0.5;
$have_months = $have_months - ($shortstop * $short_magic);
if(($have_months > 0) && ($have_months <= 12)) {
$deaths = int($deaths * (12/$have_months));
}
} else {
#
# Adjust month totals upwards if needed
#
}
#
#
#
$deaths{$table}{$year}{$month}{$age_code}{$signal_proc_raw} = $deaths;
}
}
$stmt->finish();
}
#
# Now compute the trends
#
my %trend_P100K;
#############################################################################
#
# Noise reduction section
#
# Moving Average Smoothing algorithm
#
sub moving_average {
my ($array_ref, $window_size) = @_;
my @smoothed_values;
my $half_window = int($window_size / 2);
for my $i (0 .. $#{$array_ref}) {
my $sum = 0;
my $count = 0;
for my $j (-$half_window .. $half_window) {
my $index = $i + $j;
if ($index >= 0 && $index <= $#{$array_ref}) {
if(defined($array_ref->[$index])) {
$sum += $array_ref->[$index];
} else {
die "Undefined deaths in moving average"; # Theoretically because we assign one death in every suppressed case (in read_data), this should never happen. We shall see
}
$count++;
}
}
my $average = $count ? $sum / $count : 0;
push @smoothed_values, $average;
}
return \@smoothed_values;
}
#
# LOESS smoothing/noise reduction function
#
# LOESS function
# LOESS function
use strict;
use warnings;
sub loess {
my ($xref, $yref, $span, $degree) = @_;
my @x = @$xref;
my @y = @$yref;
my $n = scalar @x;
my @fitted_values;
for (my $i = 0; $i < $n; $i++) {
my $x0 = $x[$i];
my $sum_weights = 0;
my $weighted_sum_x = 0;
my $weighted_sum_y = 0;
for (my $j = 0; $j < $n; $j++) {
my $xj = $x[$j];
my $distance = abs($xj - $x0);
my $weight = tricube_weight($distance / $span);
$sum_weights += $weight;
$weighted_sum_x += $weight * $xj;
$weighted_sum_y += $weight * $y[$j];
}
my $fitted_value = $weighted_sum_y / $sum_weights;
push @fitted_values, $fitted_value;
}
return \@fitted_values;
}
sub tricube_weight {
my $x = shift;
return ($x >= 0 && $x <= 1) ? (1 - $x ** 3) ** 3 : 0;
}
#
#
###################################################################
#
#
#
#
sub compute_trendline_Statistics {
my ($years,$deaths,$table,$month,$demo) = @_;
my $sdeaths;
if($smooth_baseline && (signal_process($table,$bs,1,$demo) eq $signal_proc_raw)) {
# We only signal process the trendline if the entire demo isn't already being signal processed
#
# First, smooth the data if necessary
#
print "CALLING MOVING_AVERAGE FROM COMPUTE TRENDLINE STATISTICS FOR TABLE $table Years $years, Month $month Demo $demo\n" if($debug > 1);
if($signal_proc_cooked eq "ma") {
print "compute_trendline_statistics: CALLING MOVING_AVERAGE on Table $table\n" if($debug > 1);
$sdeaths = moving_average($deaths,$smooth_window_trends);
} elsif ($signal_proc_cooked eq "loess") {
print "compute_trendline_statistics: CALLING LOESS on Table $table\n" if($debug > 1);
$sdeaths = loess($years,$deaths,$loess_span_trends, $loess_degree_trends);
} else {
die "Unknown signal processing algorithm $signal_proc_cooked";
}
} else {
$sdeaths = $deaths;
}
#
# Now extract only the baseline years and deaths from the death vectors
#
#
# Repair the baseline by filling in any missing years
#
my @byears;
my @bdeaths;
my $lastDeaths;
my $loopYear = @{$years}[0];
for (my $i = 0; $i <= $#{$years} ; $i++) {
my $nowYear = @{$years}[$i];
if(defined($nowYear)) {
if(($nowYear >= 0) && ($nowYear <= $nbyears)) {
my $nowDeaths = ${$sdeaths}[$i];
if($nowYear > $loopYear) { # There were years missing!
if(defined($lastDeaths)) {
my $lostYears = $nowYear - $loopYear;
while($loopYear < $nowYear) {
push(@byears,$loopYear);
push(@bdeaths, ($nowDeaths + $lastDeaths) / (($lostYears)+1));
$loopYear++;
}
}
}
push(@byears,$nowYear);
push(@bdeaths,$nowDeaths);
$lastDeaths = $nowDeaths;
}
$loopYear = $nowYear + 1;
} else {
die "NOWYEAR NOT DEFINED";
}
}
#
#
#
my $trendPoints = scalar(@{$years});
die "byears ne bdeaths" if(scalar(@byears) != scalar(@bdeaths));
die "trendpoints ne bdeaths" if($trendPoints != scalar(@bdeaths));
die "trendpoints ne sdeaths" if($trendPoints != scalar(@{$sdeaths}));
if($trendPoints < $minTrendPoints) {
return(undef,undef,undef,$trendPoints);
}
#
#
#
if($debug > 0) {
if(($table eq "deaths_disease_ucd") && ($demo eq "YAH") && ($month==4)) {
my $bmax = $#byears;
my $omax = $#{$deaths};
my $smax = $#{$sdeaths};
printf "INPUT TO LineFit. Omax $omax Smax $smax Bmax $bmax\n";
for (my $i = 0; $i <= $omax;$i++) {
my $odeaths = @{$deaths}[$i];
my $oyear = @{$years}[$i];
my $sdeaths = @{$sdeaths}[$i];
my $bdeaths = $bdeaths[$i];
my $byear = $byears[$i];
if(not defined($byear)) {
$byear = "";
}
if(not defined($bdeaths)) {
$bdeaths = "";
}
print "------ $table YAH Apr Year $oyear ODeaths $odeaths SDeaths $sdeaths BYear $byear BDeaths $bdeaths\n";
}
}
}
#
#
#
# Do the Linefit
my $line_fit = Statistics::LineFit->new();
$line_fit->setData($years, $sdeaths) or die "Invalid data points";
#
#
#
# Perform linear regression
my ($intercept, $slope) = $line_fit->coefficients();
# Calculate R-squared (R2)
my $r_squared = $line_fit->rSquared();
# if(($table =~ /deaths_disease_ucd/) && ($demo =~ /0-17/) && ($month == 7)) {
# print "$table: Linear Regression: Y = $slope * X + $intercept\n";
# }
# print "R-squared (R2): $r_squared\n";
return($slope,$intercept,$r_squared,$trendPoints);
}
#
# Logs of troubles creating trends
#
my %trend_trouble;
my %multiplier_trouble;
my %multi_sub_trouble;
#
sub print_trend_troubles {
foreach my $trouble_table (keys(%trend_trouble)) {
print "ALERT: $location could not compute trends for table $trouble_table for demos: ";
my $numentries = scalar(@{$trend_trouble{$trouble_table}});
if($numentries > 10) {
print " $numentries entries";
} else {
foreach my $trouble_demo (uniq(@{$trend_trouble{$trouble_table}})) {
print "$trouble_demo ";
}
}
print("\n");
}
foreach my $trouble_table (keys(%multiplier_trouble)) {
print "WARNING: $location could not provide multipliers for for table $trouble_table: ";
my $numentries = scalar(@{$multiplier_trouble{$trouble_table}});
if($numentries > 10) {
print " $numentries entries";
} else {
foreach my $trouble_text (uniq(@{$multiplier_trouble{$trouble_table}})) {
print "$trouble_text ";
}
}
print("\n");
}
foreach my $trouble_table (keys(%multi_sub_trouble)) {
print "WARNING: $location annual trend substitition for $trouble_table: ";
my $numentries = scalar(@{$multi_sub_trouble{$trouble_table}});
if($numentries > 10) {
print " $numentries entries";
} else {
foreach my $trouble_text (uniq(@{$multi_sub_trouble{$trouble_table}})) {
print "$trouble_text ";
}
}
print("\n");
}
}
sub compute_trends {
my ($table) = @_;
#
#
# For every demographic
# For every month
# For every year
# 1. Calculate the percent difference between the per-capita deaths at that point compared to the per-capita deaths in $byear
# 2. Add to the total difference
# End
# Divide the total difference by the number of years in the baseline (give average)
# Store the difference as the slope of the trend line with x & y intercept at $byear
# End
# End
# print "Doing $table....\n";
foreach my $demo (demoNames()) {
for (my $month = 1;$month <= 13;$month++) {
my @deathVecP100K;
my @yearVec;
# for(my $year = minyear($table); $year <= $be;$year++) { # We let the moving average smoother "sniff" at the years prior to the baseline but not see any years of the pandemic
for(my $year = $bs; $year <= $be;$year++) {
my $sp;
if($smooth_baseline == 1) {
$sp = $signal_proc_cooked;
} else {
$sp = $signal_proc_raw;
}
my $demoDeaths = deaths_in_demo($table,$year,$month,$demo,"P100K",$sp);
if(defined($demoDeaths)) {
push(@deathVecP100K,$demoDeaths);
push(@yearVec,$year-$bs); # $bs is year zero
} else {
}
}
my ($m,$b,$r_squared);
my $trendpoints;
($m,$b,$r_squared,$trendpoints) = compute_trendline_Statistics(\@yearVec,\@deathVecP100K,$table,$month,$demo);
if($trendpoints >= $minTrendPoints) {
if($trendpoints <= $minTrendPointsWARN) {
print "WARNING: Only $trendpoints trendpoints for $table Demo $demo Month $monthnames[$month-1] baseline in $location\n" if($debug > 0);
}
# print "My m is $m, theirs is $ms, my b is $b, theirs is $bs. R_Squared is $r_squared for theirs\n";
$trend_P100K{$table}{$month}{$demo} = [$m,$b,$r_squared];
if($r_squared < 0.5) {
if(! ($table =~ /_999_|_r99_/)) {
print "$location: Low quality trend warning. Table $table month $monthnames[$month-1] demo $demo. R2 is $r_squared\n" if($debug > 1);
}
}
} else {
push(@{$trend_trouble{$table}},$demo);
}
}
}
}
#
# How confident are we?
#
sub confidence {
my ($rsquared) = @_;
my $mmult = 1.0;
my $bmult = 1.0;
my @bmults = (1.0,1.0,1.0,1.0);
# my @mmults = (1.0,0.95,0.9,0.85); # Smaller the number, the less of a slope
# my @mmults = (1.0,0.98,0.96,0.94); # Smaller the number, the less of a slope
my @mmults = (1.0,1.0,1.0,1.0);
$mmult = $mmults[3];
$bmult = $bmults[3];
if($rsquared > 0.70) {
$mmult = $mmults[0];
$bmult = $bmults[0];
}
if($rsquared > 0.50) {
$mmult = $mmults[1];
$bmult = $bmults[1];
}
if($rsquared > 0.30) {
$mmult = $mmults[2];
$bmult = $bmults[2];
}
return($mmult,$bmult);
}
#
# Linear trendline for starters
#
sub calc_trendmultiplier_demo {
my ($table,$year,$month,$demo,$mode) = @_;
my $deaths; # Can be either P100K or RAW
if(table_is_canttrend_table($table)) {
return(deaths_in_demo($table,$year,$month,$demo,$mode));
}
if(not defined $trend_P100K{$table}{$month}{$demo}) {
print "Warning: $location table $table no trend multiplier for year $year month $monthnames[$month-1] for demo $demo. Will try and substitute annual\n" if(0==0);
push(@{$multi_sub_trouble{$table}},"$demo $year/$monthnames[$month-1]");
if(defined ($trend_P100K{$table}{13}{$demo})) {
my $madj = 12;
my $badj = 12;
my ($m,$b,$r_squared) = @{$trend_P100K{$table}{13}{$demo}};
$trend_P100K{$table}{$month}{$demo} = [$m / $madj,$b / $badj,$r_squared];
}
}
if(defined $trend_P100K{$table}{$month}{$demo}) {
my ($m, $b, $r_squared) = @{$trend_P100K{$table}{$month}{$demo}};
die "table $table year $year month $month demo $demo trend values not defined" if(not (defined($m)));
if($adj_confidence) {
my ($ms,$bs) = confidence($r_squared);
$m = $m * $ms;
$b = $b * $bs;
}
$deaths = (($m) * ($year-$bs)) + ($b);
if($mode eq "P100K") {
# Nothing
} elsif($mode eq "RAW") {
my $pop = pop_in_demo($table,$year,$month,$demo);
if(defined($pop)) {
$deaths = $deaths * ($pop / 100000);
} else {
$deaths = undef;
}
} else {
die "Unknown mode in calc_trendmultiplier";
}
} else {
print "Warning: $location table $table could not provide trend multiplier for $location year $year month $monthnames[$month-1] for demo $demo\n" if($debug > 0);
push(@{$multiplier_trouble{$table}},"$demo $year/$month NM");
}
if(defined($deaths) && ($deaths < 0)) {
$deaths = 0;
}
return($deaths);
}
sub r2value_demo {
my ($table,$month,$demo) = @_;
if(not defined($trend_P100K{$table}{$month}{$demo})) {
return(0);
}
my ($d1,$d2,$r_squared) = @{$trend_P100K{$table}{$month}{$demo}};
return($r_squared);
}
#
# Output the trend
#
sub deaths_in_demo {
my ($table,$year,$month,$demo,$mode,$signalproc) = @_;
my $deaths;
my $population;
die "NO DEMO IN deaths_in_demo" if(not defined($demo));
print "Table $table year $year month $month deaths in demo $demo " if($debug > 1);
if(not defined($signalproc)) {
$signalproc = signal_process($table,$year,$month,$demo);
} else {
# print "\tDEATHS IN DEMO SIGNAL PROC DEFINED AS $signalproc\n";
}
foreach my $display_code (demoSet($demo)) {
my $d = $deaths{$table}{$year}{$month}{$code_map{$display_code}}{$signalproc};
my $p = $population{$table}{$year}{$month}{$code_map{$display_code}};
if(defined($d)) {
$deaths += $d;
}
if(defined($p)) {
$population += $p;
}
}
if(not defined($deaths)) {
print "WARNING deaths_in_demo:Deaths not defined for demo $demo in $table/$year/$month\n" if($debug > 1);
print "UNDEF\n" if($debug > 1);
return(undef);
}
if($mode eq "RAW") {
print "$deaths\n" if($debug > 1);
return($deaths);
} elsif($mode eq "P100K") {
if(defined($population)) {
print "100K $deaths\n" if($debug > 1);
return($deaths / ($population / 100000));
} else {
print "WARNING deaths_in_demo:Population not defined for demo $demo in $table/$year/$month" if($debug > 1);
print "UNDEF POP\n" if($debug > 1);
return(undef);
}
}
die "Unknown mode $mode in deaths_in_demo";
}
sub pop_in_demo {
my ($table,$year,$month,$demo) = @_;
my $pop_table = "deaths_all";
my $population;
die "NO DEMO in pop_in_demo" if(not defined($demo));
foreach my $display_code (demoSet($demo)) {
my $p = $population{$table}{$year}{$month}{$code_map{$display_code}};
if(not defined($p)) {
$p = $population{$pop_table}{$year}{$month}{$code_map{$display_code}};
}
if(defined($p)) {
$population += $p;
}
}
print "Table $table year $year month $month population in demo $demo $population" if($debug > 1);
return($population);
}
sub output_trends {
my ($table,$filename) = @_;
open(FH, '>', $filename) or die "$filename : $!";
print FH "Location,$table,Year";
for (my $month = 1;$month <= 13;$month++) {
foreach my $demo (demoNames()) {
my $r_squared = sprintf("\"R2 %3.0f%%\"",r2value_demo($table,$month,$demo) * 100);
print FH ",$monthnames[$month-1] $demo RAW,$monthnames[$month-1] $demo SMOOTH, $monthnames[$month-1] $demo TREND,$r_squared";
}
}
print FH "\n";
for(my $year = $bs; $year <= $be;$year++) {
print FH "$location,$table,";
my $date = sprintf("%04d-01-01",$year);
print FH "$date";
for (my $month = 1;$month <= 13;$month++) {
foreach my $demo (demoNames()) {
my $bpop = pop_in_demo($table,$bs,$month,$demo) / 100000;
my $npop = pop_in_demo($table,$year,$month,$demo) / 100000;
my $byear_r = deaths_in_demo($table,$bs,$month,$demo,"P100K",$signal_proc_raw);
my $nyear_r = deaths_in_demo($table,$year,$month,$demo,"P100K",$signal_proc_raw);
my $byear_s = deaths_in_demo($table,$bs,$month,$demo,"P100K",$signal_proc_cooked);
my $nyear_s = deaths_in_demo($table,$year,$month,$demo,"P100K",$signal_proc_cooked);
my $byear_t = calc_trendmultiplier_demo($table,$bs,$month,$demo,"P100K");
my $nyear_t = calc_trendmultiplier_demo($table,$year,$month,$demo,"P100K");
if($table =~ /deaths_all_|deaths_disease_|deaths_nondisease_/) {
if((not defined($byear_r)) || ($byear_r == 0)) {
print "INFO: no raw base year ($bs-$month) for table $table demo $demo\n" if($debug > 0);
} else {
if((not defined($byear_s)) || ($byear_s == 0)) {
print "WARNING: no smoothed base year for table $table demo $demo\n";
}
}
if((not defined($byear_t)) || ($byear_t == 0)) {
print "ALERT: no trended base year for table $table demo $demo\n";
}
}
if(defined($byear_r) && defined($nyear_r)) {
my $rPct;
if(($byear_r == 0)||($nyear_r == 0)) {
$rPct = 0;
} else {
$rPct = ($nyear_r / $npop) / ($byear_r / $bpop);
$rPct -= 1;
}
print FH ",$rPct";
} else {
print FH ",NaN"; # Can't be blank or Gnuplot freaks out
}
if(defined $byear_s && defined($nyear_s)) {
my $sPct;
if(($byear_s == 0)||($nyear_s == 0)) {
$sPct = 0;
} else {
$sPct = ($nyear_s / $npop) / ($byear_s / $bpop);
$sPct -= 1;
}
print FH ",$sPct";
} else {
print FH ",NaN"; # Can't be blank or Gnuplot freaks out
}
if(defined $byear_t && defined($nyear_t)) {
my $tPct;
if(($byear_t == 0)||($nyear_t == 0)) {
$tPct = 0;
} else {
$tPct = ($nyear_t / $npop) / ($byear_t / $bpop);
$tPct -= 1;
}
print FH ",$tPct";
} else {
print FH ",NaN"; # Can't be blank or Gnuplot freaks out
}
my $r_squared = sprintf("%3.0f",r2value_demo($table,$month,$demo) * 100);
print FH ",$r_squared";
}
}
print FH "\n";
}
print FH "\n";
close(FH);
}
#
# Finally, output the data
#
sub output_data {
my ($table,$filename,$incraw) = @_;
open(FH, '>', $filename) or die $!;
print FH "Location,Table,Year";
foreach my $monthname (@monthnames) {
foreach my $demo (demoNames()) {
if($incraw) {
print FH ",$monthname $demo total deaths,$monthname $demo population";
} else {
print FH ",$monthname $demo Per 100K";
}
}
}
print FH "\n";
for (my $year = $syear;$year <= maxyear($table);$year++) {
print FH "$location,$table,";
# my $date = sprintf("%04d-01-01",$year);
my $date = sprintf("\\n%04d",$year);
print FH "$date";
if(get_months_ago($year,1) < $est_window) {
print FH "(*EST)";
}
for(my $month = 1; $month <= 13; $month += 1) {
foreach my $demo (demoNames()) {
my $demoDeath;
my $demoDeathP100k;
my $demoPop;
foreach my $display_code (demoSet($demo)) {
# my $death = $deaths{$table}{$year}{$month}{$code_map{$display_code}}{signal_process($table,$year,$month,$demo)};
my $death = $deaths{$table}{$year}{$month}{$code_map{$display_code}}{$signal_proc_raw};
my $pop = $population{$table}{$year}{$month}{$code_map{$display_code}};
if(defined($death) && defined($pop)) {
$demoDeath += $death;
$demoPop += $pop;
}
}
if(defined($demoDeath) && defined($demoPop)) {
$demoDeathP100k = $demoDeath / ($demoPop / 100000);
if($incraw) {
print FH ",$demoDeath,$demoPop";
} else {
print FH ",$demoDeathP100k";
}
} else {
if($incraw) {
print FH ",NaN,NaN"; # Can't be blank or Gnuplot freaks out
} else {
print FH ",NaN"; # Can't be blank or Gnuplot freaks out
}
}
}
}
print FH "\n";
}
close(FH);
}
sub get_shortstop {
my ($table) = @_;
my %tables = (%real_tables,%canttrend_tables,%synthetic_tables);
foreach my $tab (keys(%tables)) {
my $nz = $tab;
$nz =~ s/^\d\d//;
if($table eq $nz) {
return($tables{$tab});
}
}
die "Can't find shortstop for $table";
}
sub output_months1 {
my ($table,$filename,$period,$demo) = @_;
open(FH, '>', $filename) or die $!;
print FH "Location,Table,Month,YAH Per 100K\n";
my %monthtot;
my %poptot;
my $minyear;
my $maxyear;
my $shortstop;
$shortstop = get_shortstop($table);
my $cur_year = 1900 + (localtime)[5];
for(my $month = 1; $month <= 13; $month += 1) {
foreach my $year (sort { $a <=> $b } keys %{$deaths{$table}}) {
if($period eq "BASELINE") {
next if($year < $bs);
next if($year > $be);
} else {
next if($year <= $be);
# next if(get_months_ago($year,$month) < $shortstop);
next if($year >= $cur_year);
}
my $d = deaths_in_demo($table,$year,$month,$demo,"RAW");
if(defined($d)) {
$monthtot{$month} += $d;
}
my $p = pop_in_demo($table,$year,$month,$demo);
if(defined($p)) {
$poptot{$month} += $p;
}
}
}
for(my $month = 1; $month <= 13; $month += 1) {
my $deaths = $monthtot{$month};
next if(! $deaths);
print FH "$location,$table";
if($month == 13) {
print FH ",AVERAGE";
} else {
print FH ",$monthnames[$month-1]";
}
if(defined $deaths) {
my $pop = $poptot{$month};
die "POP ZERO $pop deaths $deaths demo $demo table $table month $month" if((not defined($pop)) || ($pop == 0));
my $per100K = $deaths/($pop/100000);
if($month == 13) {
$per100K = $per100K/12;
}
print FH ",$per100K";
} else {
print FH ",NaN";
}
my $season = $monthseason[$month-1];
print FH ",$season\n";
}
close(FH);
}
sub output_months2 {
my ($table,$filename,$demo) = @_;
open(FH, '>', $filename) or die $!;
print FH "Date,Location,Table,Month,Expected $demo Per 100K,Actual $demo Per 100K,Season\n";
my $shortstop;
$shortstop = get_shortstop($table);
for(my $year = $syear; $year <= maxyear($table); $year += 1) {
print "****Months2: location $location table $table year $year\n" if($debug > 2);
for(my $month = 1;$month <= 13;$month += 1) {
if(($month <= 12) && (get_months_ago($year, $month) < $shortstop)) {
$month = 13; # Cause it to barf out the annual stats and quit
}
my $mpop = pop_in_demo($table,$year,$month,$demo);
my $per100K = deaths_in_demo($table,$year,$month,$demo,"P100K");
my $rmdeaths = deaths_in_demo($table,$year,$month,$demo,"RAW",$signal_proc_raw);
my $mdeaths = deaths_in_demo($table,$year,$month,$demo,"RAW");
my $medeaths = calc_trendmultiplier_demo($table,$year,$month,$demo,"RAW");
my $eper100K = calc_trendmultiplier_demo($table,$year,$month,$demo,"P100K");
if(not defined($per100K)) {
$per100K = "NaN";
}
if(not defined($eper100K)) {
$eper100K = "NaN";
}
if(not defined($rmdeaths)) {
$rmdeaths = "NaN";
}
if(not defined($mdeaths)) {
$mdeaths = "NaN";
}
if(not defined($medeaths)) {
$medeaths = "NaN";
}
print "OUTPUT: table $table demo $demo year $year month $month mpop $mpop rmdeaths $rmdeaths mdeaths $mdeaths medeaths $medeaths eper100K $eper100K, per100K $per100K\n" if($debug > 1);
my $date;
if($month <= 12) {
$date = sprintf("%04d-%02d-01",$year,$month);
my $season = $monthseason[$month-1];
my $nicemonth = nicemonth($year,$month);
$date = sprintf("%04d-%02d-01",$year,$month);
print FH "$date,$location,$table,$nicemonth,$eper100K,$per100K,$season\n";
} else {
$date = sprintf("%04d-01-01",$year);
}
insert_into_db($date, $location, $mpop, $demo, $table, $month,
($per100K eq "NaN" ? undef : $per100K), ($eper100K eq "NaN" ? undef : $eper100K),
($rmdeaths eq "NaN" ? undef : $rmdeaths), ($mdeaths eq "NaN" ? undef : $mdeaths), ($medeaths eq "NaN" ? undef : $medeaths),
0);
}
print "\n" if($debug > 2);
}
close(FH);
}
sub generate_difference {
my ($dest,$t1,$t2) = @_;
foreach my $year (sort { $a <=> $b } keys %{$population{$t1}}) {
foreach my $month (sort { $a <=> $b } keys %{$population{$t1}{$year}}) {
foreach my $age_code (sort keys %{$population{$t1}{$year}{$month}}) {
my $t1deaths = $deaths{$t1}{$year}{$month}{$age_code}{$signal_proc_raw};
my $t2deaths = $deaths{$t2}{$year}{$month}{$age_code}{$signal_proc_raw};
my $t1pop = $population{$t1}{$year}{$month}{$age_code};
my $t2pop = $population{$t2}{$year}{$month}{$age_code};
if((! ($age_code =~ /Not Stated/)) && ((! looks_like_number($t1pop)) || ($t1pop <= 0))) {
die "FATAL: Generate_Difference: Population $year/$month/$age_code not defined or zero ($t1pop) in $t1!\n";
}
if(defined($t2deaths)) {
if(!($age_code =~ /Not Stated/) && (not defined($t2pop))) {
die "FATAL: Generate_Difference: Population $year/$month/$age_code not defined in $t2!\n";
}
if(! ($age_code =~ /Not Stated/)) {
die "FATAL: Generate_Difference: Population $t1 $year/$month/$age_code does not match $t2 !\n" if($t1pop != $t2pop);
}
if(($age_code =~ /Not Stated/) || defined($t1pop)) {
$population{$dest}{$year}{$month}{$age_code} = $t1pop;
if(defined($t1deaths)) {
$deaths{$dest}{$year}{$month}{$age_code}{$signal_proc_raw} = $t1deaths - $t2deaths;
}
}
} else { # Else nothing to do (nothing to subtract)
$population{$dest}{$year}{$month}{$age_code} = $t1pop;
$deaths{$dest}{$year}{$month}{$age_code}{$signal_proc_raw} = $t1deaths;
}
}
}
}
}
sub check_data {
foreach my $table (keys %deaths) {
foreach my $year (sort { $a <=> $b } keys %{$deaths{$table}}) {
my $tpop = 0;
my $tdeath = 0;
my $months = 0;
my $popAnnual = $population{$table}{$year}{13}{"ALL AGES"};
next if(not defined($popAnnual)); # Some years have no deaths from a given disease (ex COVID)
my $mPop;
foreach my $month (sort { $a <=> $b } keys %{$deaths{$table}{$year}}) {
print "---- Table $table Year: $year Month: $month\n" if($debug > 1);
$mPop = 0;
# foreach my $age_code (sort keys %{$deaths{$table}{$year}{$month}}) {
foreach my $age_code (sort keys %{$population{$table}{$year}{$month}}) {
my $deaths = $deaths{$table}{$year}{$month}{$age_code}{$signal_proc_raw};
my $pop = $population{$table}{$year}{$month}{$age_code};
my $aPopAnnual = $population{$table}{$year}{13}{$age_code};
if(defined $deaths) {
if(!($age_code =~ /Not Stated/)) {
die "check_data:$table $year $age_code annual population does not look like a number ($aPopAnnual)" if(not looks_like_number($aPopAnnual));
die "check_data:$table $year $month $age_code population is undefined for $age_code" if(not looks_like_number($pop));
die "check_data:$table $year $month $age_code month population $pop not equal to annual population $aPopAnnual" if($pop != $aPopAnnual);
die "check_data: Table $table Year $year Month $month Demo $age_code population not defined" if(! defined($pop));
my $tp100K = $deaths/($pop/100000);
}
if(distinct_age($age_code) && $month != 13) {
$tdeath += $deaths;
}
}
if(distinct_age($age_code) && $month != 13) {
# print "Age code $age_code is distinct\n";
if(!($age_code =~ /Not Stated/)) {
die "check_data: Table $table Pop not defined for age code $age_code year $year month $month" if(not (defined $pop));
# print "Adding population of $age_code ($pop) to total population\n" if($table eq "deaths_all");
$tpop += $pop;
$mPop += $pop;
}
}
if($debug > 1 && ($month != 13)) {
print "\t++++Table $table Year $year Month $month Age Code $age_code Pop $pop Month Pop $mPop Population Annual $popAnnual Total pop $tpop\n";
}
}
$months += 1;
}
# print "\*** MPOP $mPop. TPOP $tpop\n";
if($months > 1) {
$tpop /= ($months-1);
}
print "+++++++++++++++Table $table Year $year ($months months) Total pop $tpop Popannual $popAnnual Total death $tdeath\n" if($debug > 1);
die "***** Check Data: Table $table Year $year ($months months) PopAnnual not set\n" if(not defined($popAnnual));
die "***** Check Data: Table $table Year $year ($months months) PopAnnual is $popAnnual\n" if(not looks_like_number($popAnnual));
if(($tpop != $popAnnual) && ($months == 13)) {
print "WARNING Check Data: Table $table Year $year $months months) Total Pop $tpop Not equal to PopAnnual $popAnnual\n";
}
}
print "Table $table checks in check_data\n" if($debug > 0);
}
}
sub maxyear {
my ($table) = @_;
my @years = sort { $b <=> $a }(keys %{$deaths{$table}});
die "Table $table no years" if($#years < 0);
my $maxyear = $years[0];
my (@tm) = localtime(time());
if(! ($table =~ /deaths_all$|deaths_999_ucd|deaths_r99_ucd/)) {
my ($maxyear_d,$maxmonth_d,$maxday_d) = Add_Delta_YM($tm[5]+1900,$tm[4],$tm[3], 0, -3); # Three months ago
if($maxyear_d < $maxyear) {
$maxyear = $maxyear_d;
}
}
print "Returning Maximum Year $maxyear for $table\n" if($debug > 2);
return($maxyear);
}
sub minyear {
my ($table) = @_;
my @years = sort { $a <=> $b }(keys %{$deaths{$table}});
die "Table $table no years" if($#years < 0);
my $minyear = $years[0];
if($table eq "deaths_999_ucd") {
print "Returning Minimum Year $minyear for $table\n" if($debug > 2);
}
return($minyear);
}
sub maxmonth {
my ($table,$year) = @_;
my @months = sort { $b <=> $a } (keys %{$deaths{$table}{$year}});
die "Table $table no years" if($#months < 1); # Month 13 (the year) will always be there so this array should always have at least two entries
my $maxmonth = $months[1];
die "table $table year $year. months[0] !!= 13 or maxmonth ($maxmonth) > 12" if(($months[0] != 13)||($maxmonth > 12));
print "Returning Month $maxmonth for $table Year $year\n" if($debug > 2);
return($maxmonth);
}
sub noise_reduce {
my $numTables = 0;
foreach my $table (keys %deaths) {
my %acDeaths;
my %acMonths;
foreach my $age_code (@age_codes) {
my $monthCnt = 0;
for (my $year = minyear($table); $year <= maxyear($table);$year++) {
for (my $month = 1;$month <= maxmonth($table,$year);$month++) {
my $death = $deaths{$table}{$year}{$month}{$age_code}{$signal_proc_raw};
die "DEATH IS UNDEFINED FOR TABLE $table Year $year Month $month Age_Code $age_code Signal Proc: $signal_proc_raw\n" if(not defined($death));
push(@{$acMonths{$age_code}},$monthCnt);
push(@{$acDeaths{$age_code}},$death);
$monthCnt++;
}
}
}
foreach my $age_code (@age_codes) {
my $monthVec = $acMonths{$age_code};
my $deathVec = $acDeaths{$age_code};
my $deathVecCnt = scalar(@{$deathVec});
my $newDeathsS;
if($signal_proc_cooked eq "ma") {
print "CALLING MOVING_AVERAGE FROM noise_reduce on Table $table Age Code $age_code\n" if($debug > 1);
$newDeathsS = moving_average($deathVec,$smooth_window_deaths);
} elsif ($signal_proc_cooked eq "loess") {
print "CALLING LOESS FROM noise_reduce on Table $table Age Code $age_code\n" if($debug > 1);
$newDeathsS = loess($monthVec,$deathVec,$loess_span, $loess_degree);
# } elsif ($signal_proc_cooked eq "pdl_loess") {
# print "CALLING PDL_LOESS FROM noise_reduce on Table $table Age Code $age_code\n" if($debug > 1);
# $newDeathsS = pdl_loess($monthVec,$deathVec,$loess_pdl_span, $loess_pdl_degree);
} else {
die "Unknown signal processing algorithm $signal_proc_cooked";
}
for (my $year = minyear($table); $year <= maxyear($table);$year++) {
for (my $month = 1;$month <= maxmonth($table,$year);$month++) {
my $death = shift(@{$newDeathsS});
$deaths{$table}{$year}{$month}{$age_code}{$signal_proc_cooked} = $death;
}
$deaths{$table}{$year}{13}{$age_code}{$signal_proc_cooked} = $deaths{$table}{$year}{13}{$age_code}{$signal_proc_raw};
}
}
# print "COOKED Table $table $numYears/$numMonths/$numAc\n";
}
return;
}
sub fill_suppressed {
my ($table) = @_;
print "fill_suppressed: TABLE $table\n" if($debug > 0);
for (my $year=minyear($table);$year <= maxyear($table);$year++) {
foreach my $age_code (@age_codes) {
# first see if the year hasn't been suppressed
if(not defined($deaths{$table}{$year}{13}{$age_code}{$signal_proc_raw})) {
print "Table $table Year $year Age code $age_code NO ANNUAL FOR ADJUSTMENT\n" if($debug > 0);;
$deaths{$table}{$year}{13}{$age_code}{$signal_proc_raw} = int(max(1,rand(9)));
}
for (my $month=1;$month <= maxmonth($table,$year);$month++) {
if(not defined($deaths{$table}{$year}{$month}{$age_code}{$signal_proc_raw})) { # Death was "suppressed" (means it was between 1 and 9)
# print "Table $table deaths suppressed for $year/$month/$age_code\n";
my $fillin_deaths = $deaths{$table}{$year}{13}{$age_code}{$signal_proc_raw}; # Annual number of deaths
die "NO ANNUAL (???) in fill_suppressed" if(not defined($fillin_deaths));
if($fillin_deaths > 300) {
print("WARNING: Table $table Year $year Month $month Age Code $age_code suspcious annual deaths ($fillin_deaths)\n") if(! ($table =~ "deaths_r99_ucd|deaths_999_ucd|deaths_influenza_mcd"));
}
$fillin_deaths = int($fillin_deaths / 12); # make it per-month. This should probably also try and adjust for seasonality (UGH)
my $fillin = int(rand($fillin_deaths));
$fillin = min($fillin,9);
if($fillin == 0) {
$fillin = 1;
}
print("Adjusting $table $year/$month/$age_code supressed deaths to $fillin\n") if($debug > 1);
$deaths{$table}{$year}{$month}{$age_code}{$signal_proc_raw} = $fillin;
}
}
}
}
}
my %tk = (%real_tables,%canttrend_tables);
foreach my $table (tablenames(%tk)) {
read_data($table,get_shortstop($table));
fill_suppressed($table);
}
#
# Allocate 999 and R99 codes as necessary
print "ALLOCATE 99s" if($debug > 0);
#
if($allocate_99s) {
allocate_99s("deaths_disease_ucd","DISEASE");
allocate_99s("deaths_nondisease_ucd","NONDISEASE");
}
#
# Generate synthetic tables
#
print "GENERATE DIFFERENCE" if($debug > 0);
generate_difference("deaths_nondisease_mdrugalcohol","deaths_nondisease_ucd","deaths_drugalcoholod_ucd");
generate_difference("deaths_all_mdrugalcohol","deaths_all","deaths_drugalcoholod_ucd");
generate_difference("deaths_disease_mcovid","deaths_disease_mcd","deaths_covid_mcd");
#
#
# Basic data sanity checking
#
#
print "CHECK DATA" if($debug > 0);
check_data();
#
# Do noise reduction
#
print "NOISE REDUCE DATA\n" if($debug > 0);
noise_reduce();
my @tables = tablenames(%real_tables,%canttrend_tables,%synthetic_tables);
foreach my $table (@tables) {
if(!table_is_canttrend_table($table)) {
print "$table COMPUTE TRENDS\n" if($debug > 0);
compute_trends($table);
print "$table OUTPUT TRENDS\n" if($debug > 0);
output_trends($table,"Outputs/CSV/$table/$ofile-trends.csv");
} else {
print "INFO: Table $table is a cant-trend table.\n" if($debug > 0);
}
print "$table OUTPUT DATA\n" if($debug > 0);
output_data($table,"Outputs/CSV/$table/$ofile-data.csv",0);
output_data($table,"Outputs/CSV/$table/$ofile-data-raw.csv",1);
#
#
#
#
foreach my $demo (demoNames()) {
print "$table DEMO $demo output\n" if($debug > 0);
output_months1($table,"Outputs/CSV/$table/$ofile-$demo-BASELINE-months1.csv","BASELINE",$demo);
output_months1($table,"Outputs/CSV/$table/$ofile-$demo-PANDEMIC-months1.csv","PANDEMIC",$demo);
output_months2($table,"Outputs/CSV/$table/$ofile-$demo-months2.csv",$demo);
}
}
print_trend_troubles();