There are a couple of considerations:
- processing is easier when data is transposed into a tall & thin structure
- one row per id/disease flag
- in some cases the person may have a 'gap' in age range
- example: disease at 20-29 and next at 40-49
this to me is implicit zero count of diseases at age range 30-39. This implicit would be important (and should be considered) in computing 30-39 disease proportion over all ids.
- filling in 'gaps' is easily accomplished when a standardized value for age is used for an age in a range. The 'standard' value can be computed using a custom format and a custom informat.
- adding dummy records for gaps should only occur for ages between enroll date and last disease date
Approach
- transpose age and flag for each id and enroll age
- use
MEANS
to sum flags for each id age(standardized) to get count of concurrent diseases in an age range
- left join a cross join ids and ages with sums for gap fill in
- use
MEANS
again to count age and age*flag_count
- use
DATA
step to compute proportion of counts [age*flag_count]/[age]
Sample code
* custom format and informat for standardizing age;
proc format;
value agegrp
20-29 = '20-29'
30-39 = '30-39'
40-49 = '40-49'
50-59 = '50-59'
60-69 = '60-69'
70-79 = '70-79'
80-89 = '80-89'
90-high = '90 + '
;
invalue agegrp
'20-29' = 20
'30-39' = 30
'40-49' = 40
'50-59' = 50
'60-69' = 60
'70-79' = 70
'80-89' = 80
'90 + ' = 90
;
run;
* generate some sample data;
data have;
call streaminit(123);
do id = 1 to 10000;
enrolled = '01jan2000'd + rand('integer', 1, 3650);
age = 20 + rand('integer', 59);
flag1 = rand('uniform') < 0.25;
date1 = enrolled + rand('integer',2500);
flag2 = rand('uniform') < 0.25;
date2 = date1 + rand('integer',2500);
flag3 = rand('uniform') < 0.25;
date3 = date2 + rand('integer',2500);
output;
end;
format enrolled date: yymmdd10. flag: 1.;
run;
* compute age at each disease flag time point;
data stage1;
set have;
age1 = age + intck('year', enrolled, date1);
age2 = age + intck('year', enrolled, date2);
age3 = age + intck('year', enrolled, date3);
run;
* parallel array based transposition of ages and flags;
data stage2;
set stage1;
* map age in range to first value in range;
* need for later when creating zero counts for intermediate age ranges not in data;
* counts of zero counts needed to compute proportion;
enroll_age = input(put(age,agegrp.),agegrp.);
age = input(put(age1,agegrp.),agegrp.); flag = flag1; output;
age = input(put(age2,agegrp.),agegrp.); flag = flag2; output;
age = input(put(age3,agegrp.),agegrp.); flag = flag3; output;
keep id enroll_age age flag;
format enroll_age age 4. flag 1.;
run;
ods listing;
* compute number of simultaneous diseases in age range;
proc means noprint nway data=stage2;
class id enroll_age age;
format enroll_age age agegrp.;
var flag;
output out=stage3(drop= _type_ _freq_) sum=flag_count;
run;
* data for cross join;
data all_ages;
do age = 20 to 90 by 10;
output;
end;
format age agegrp.;
run;
* combine all_ages with each id so as to get a zero count
* for a range not present between enroll_date and max date;
proc sql;
create table stage4 as
select ids.id, ages.age, coalesce(stage3.flag_count,0) as flag_count
from
( select distinct id from stage3 ) as ids
cross join
all_ages as ages
left join
stage3
on ids.id = stage3.id and ages.age = stage3.age
group by
ids.id
having
ages.age between min(stage3.enroll_age) and max(stage3.age)
order by
ids.id, age
;
quit;
* compute counts;
proc means noprint data=stage4;
class age flag_count;
output out=stage5 N(id)=;
types age age*flag_count;
run;
* compute proportions;
data want;
merge
stage5 (where=(_type_=2) rename=_freq_=age_freq)
stage5 (where=(_type_=3) rename=_freq_=flag_count_freq)
;
by age;
age_flag_count_proportion = flag_count_freq / age_freq;
format age_flag_count_proportion percent5.;
keep age flag_count age_freq flag_count_freq age_flag_count_proportion;
format flag_count 1.;
run;
Example output data