
I have the following code that is being used to generate running totals of features for the past 1 day, 7 days, 1 month, 3 months, and 6 months.

LIBNAME A "C:\Users\James\Desktop\data\Base Data";
LIBNAME DATA "C:\Users\James\Desktop\data\Data1";

%MACRO HELPER(P);

data a1;
set data.final_master_&P. ;
QUERY = '%TEST('||STRIP(DATETIME)||','||STRIP(PARTICIPANT)||');';
CALL EXECUTE(QUERY);
run;

%MEND;

%MACRO TEST(TIME,PAR);
proc sql;
select SUM(APP_1), SUM(APP_2), sum(APP_3), SUM(APP_4), SUM(APP_5) INTO :APP_1_24, :APP_2_24, :APP_3_24, :APP_4_24, :APP_5_24
FROM A1
WHERE DATETIME BETWEEN INTNX('SECONDS',&TIME.,-60*60*24) AND &TIME.;

/* 7 Days */
select SUM(APP_1), SUM(APP_2), sum(APP_3), SUM(APP_4), SUM(APP_5) INTO :APP_1_7DAY, :APP_2_7DAY, :APP_3_7DAY, :APP_4_7DAY, :APP_5_7DAY
FROM A1
WHERE DATETIME BETWEEN INTNX('SECONDS',&TIME.,-60*60*24*7) AND &TIME.;

/* One Month */

select SUM(APP_1), SUM(APP_2), sum(APP_3), SUM(APP_4), SUM(APP_5) INTO :APP_1_1MONTH, :APP_2_1MONTH, :APP_3_1MONTH, :APP_4_1MONTH, :APP_5_1MONTH
FROM A1
WHERE DATETIME BETWEEN INTNX('SECONDS',&TIME.,-60*60*24*7*4) AND &TIME.;

/* Three Months */

select SUM(APP_1), SUM(APP_2), sum(APP_3), SUM(APP_4), SUM(APP_5) INTO :APP_1_3MONTH, :APP_2_3MONTH, :APP_3_3MONTH, :APP_4_3MONTH, :APP_5_3MONTH
FROM A1
WHERE DATETIME BETWEEN INTNX('SECONDS',&TIME.,-60*60*24*7*4*3) AND &TIME.;

/* Six Months */

select SUM(APP_1), SUM(APP_2), sum(APP_3), SUM(APP_4), SUM(APP_5) INTO :APP_1_6MONTH, :APP_2_6MONTH, :APP_3_6MONTH, :APP_4_6MONTH, :APP_5_6MONTH
FROM A1
WHERE DATETIME BETWEEN INTNX('SECONDS',&TIME.,-60*60*24*7*4*6) AND &TIME.;

quit;

DATA T;
PARTICIPANT = &PAR.;
DATETIME = &TIME;
APP_1_24 =  &APP_1_24.;
APP_2_24 =  &APP_2_24.;
APP_3_24 =  &APP_3_24.;
APP_4_24 =  &APP_4_24.;
APP_5_24 =  &APP_5_24.;
APP_1_7DAY =  &APP_1_7DAY.;
APP_2_7DAY =  &APP_2_7DAY.;
APP_3_7DAY =  &APP_3_7DAY.;
APP_4_7DAY =  &APP_4_7DAY.;
APP_5_7DAY =  &APP_5_7DAY.;
APP_1_1MONTH =  &APP_1_1MONTH.;
APP_2_1MONTH =  &APP_2_1MONTH.;
APP_3_1MONTH =  &APP_3_1MONTH.;
APP_4_1MONTH =  &APP_4_1MONTH.;
APP_5_1MONTH =  &APP_5_1MONTH.;
APP_1_3MONTH =  &APP_1_3MONTH.;
APP_2_3MONTH =  &APP_2_3MONTH.;
APP_3_3MONTH =  &APP_3_3MONTH.;
APP_4_3MONTH =  &APP_4_3MONTH.;
APP_5_3MONTH =  &APP_5_3MONTH.;
APP_1_6MONTH =  &APP_1_6MONTH.;
APP_2_6MONTH =  &APP_2_6MONTH.;
APP_3_6MONTH =  &APP_3_6MONTH.;
APP_4_6MONTH =  &APP_4_6MONTH.;
APP_5_6MONTH =  &APP_5_6MONTH.;
FORMAT DATETIME DATETIME.;
RUN;

PROC APPEND BASE=DATA.FLAGS_&par. DATA=T;
RUN;

%MEND;

%helper(1);

This code runs perfectly if I limit the number of observations in the %helper macro, using an (obs=) option when creating the a1 dataset. However, when I put no limit on the number of observations, i.e. execute the %test macro for every row in the dataset a1, I get errors. In SAS EG, I get a "server disconnected" popup after the status bar hangs at "running data step", and in Base SAS 9.4 I get errors that none of the macro variables created by the PROC SQL INTO clauses have been resolved.

I'm confused, as the code works fine for a limited number of observations, but when run on the whole dataset it hangs or gives errors. The dataset I'm doing this for has around 130,000 observations.

Comments:

  • You should add %nrstr() to the macro call on CALL EXECUTE. See my long answer to stackoverflow.com/questions/27749472/sas-macro-coding/27757820. That said, you're generating a lot of code, so you could still have problems with CALL EXECUTE. – Quentin
  • Just tried this; the amount of code generated is the problem, I think. Is there any way to get around this? – user2662468
  • See Joe's answer presenting alternatives: stackoverflow.com/questions/25545892/…. I would try the %include approach. – Quentin
  • I agree the quantity of macro code generated here is too much. I also think there are far better ways to do this - I'll answer later today with one. In general, if the answer is "make millions of rows of code via the macro language", the question is suspect... – Joe
  • Thanks Joe, much appreciated. I'm sure there are much more efficient ways; I'm using a lot of operations this way, but I lack the experience to optimise it! – user2662468
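
For reference, here is a minimal sketch of the %nrstr() wrapping suggested in the first comment, shown as it would appear inside the %HELPER macro from the question. It changes when the generated %TEST calls execute (after the DATA step rather than during it) but does not reduce the amount of generated code:

data a1;
set data.final_master_&P. ;
/* %NRSTR masks the macro trigger so that %TEST does not execute while the  */
/* DATA step is still running; it executes later, when the generated text   */
/* is pulled from the input stack, so each PROC SQL runs before its INTO    */
/* macro variables are referenced in the DATA T step.                       */
QUERY = '%NRSTR(%TEST('||STRIP(DATETIME)||','||STRIP(PARTICIPANT)||'));';
CALL EXECUTE(QUERY);
run;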

2 Answers


Here's a pure data step, non-hash version. On my machine it's actually faster than the hash solution; I suspect it wouldn't be on a machine with an HDD (I have an SSD, so point access is not substantially slower than hash access, and I avoid having to load the hash). I would recommend using it if you don't know hashes very well or at all, as it'll be easier to troubleshoot, and it scales similarly. For most rows it accesses 11 rows: the current row, plus two rows for each of the five windows (the row that falls out of the window, which gets subtracted, and the next row, which stays in), for a total of around a million and a half reads for 130k rows (130,000 × 11 ≈ 1.4 million). Compare that to the roughly 17 billion reads for the cartesian join...

I suffix the macros with "_2" to differentiate them from the macros in the hash solution.

data test_data;
  array app[5] app_1-app_5;
  do _i = 1 to 130000;
    dt_var = datetime() - _i*180;
    do _j = 1 to dim(app);
      *app[_j] = floor(rand('Uniform')*6); *generate 0 to 5 integer;
      app[_j]=1;
    end;
    output;
  end;
  format dt_var datetime17.;
run;

proc sort data=test_data;
  by dt_var;
run;

%macro add_2(array=);
      do _i = 1 to dim(app);
        &array.[_i] + app[_i];
      end;
%mend add_2;



%macro subtract_2(array=);
      do _i = 1 to dim(app);
        &array.[_i] + (-1*app[_i]);
      end;
%mend subtract_2;

%macro process_array_add_2(array=);

  array app_&array. app_&array._1-app_&array._5;   *define array;

  %add_2(array=app_&array.);                       *add current row to array;
%mend process_array_add_2;

%macro process_array_sub_2(array=, period=, number=);
  if _n_ eq 1 then do;                             *initialize point variable;
     point_&array. = 1;
  end;
  else do;                                         *skipped at _n_=1, since the only row so far is the current one;
    set test_data point=point_&array.;             *set the row that we may be subtracting;
  end;

  do while (intnx("&period.",dt_var,&number.,'s') lt curr_dt_var and point_&array. < _N_);  *until we hit a row that is within the period...;
    %subtract_2(array=app_&array.);                *subtract the rows values;
    point_&array. + 1;                             *increment the point to look at;
    set test_data point=point_&array.;             *set the new row;
  end;

%mend process_array_sub_2;


data want;
  set test_data;

  *if _n_ > 10000 then stop;                       *useful for testing if you want to check time to execute;
  curr_dt_var = dt_var;                            *save dt_var value from originally set record;
  array app[5] app_1-app_5;                        *base array;

  *start macro territory;  
  %process_array_add_2(array=24h);                 *have to do all of these adds before we start subtracting;
  %process_array_add_2(array=1wk);                 *otherwise we have the wrong record values;
  %process_array_add_2(array=1mo);
  %process_array_add_2(array=3mo);
  %process_array_add_2(array=6mo);

  %process_array_sub_2(array=24h,period=DTDay, number=1);   *now start checking to subtract what we need to;
  %process_array_sub_2(array=1wk,period=DTDay, number=7);
  %process_array_sub_2(array=1mo,period=DTMonth, number=1);
  %process_array_sub_2(array=3mo,period=DTMonth, number=3);
  %process_array_sub_2(array=6mo,period=DTMonth, number=6);

  *end macro territory;

  rename curr_dt_var=dt_var;
  format curr_dt_var datetime21.3;
  drop dt_var _:;

  output;                                          *unneeded in this version but left for comparison to hash;


run;

The answer to your actual question is that you're simply generating too much macro code, and perhaps also simply taking too much time. The way you are doing this operates at O(n^2), as you're basically doing a cartesian join of every record to every record, and then some. 130,000 * 130,000 is a pretty decent-sized number (about 17 billion pairs), and on top of that you're opening the SQL environment several times for each of the 130,000 rows. Ouch.

The solution is to do it either in a way that isn't so slow, or, if it has to be slow, in a way that at least doesn't carry so much overhead.

The fast solution is to not do the cartesian join, or to limit how much needs to be joined. One good option would be to restructure the problem so that not every record has to be compared: treat each calendar day, say, as a period, especially for the over-24h windows (the 24h window you might handle the way you do above, but not the other four). For 1 month, 3 months, etc., do you really need to resolve down to the time of day? It probably won't make much difference. If you can drop that requirement, you can use built-in PROCs to precompute all possible 1-month periods, all possible 3-month periods, etc., and then join on the appropriate one (see the day-level rollup sketch below). But that won't work with 130,000 of them; it would only work if you could limit it to one period per day, probably.
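
A minimal sketch of that day-level rollup, assuming the a1 dataset from the question (the view a1_daily and the output dataset daily_totals are names introduced here for illustration); the rolling windows can then be built over a few hundred day-level rows per participant rather than 130,000 second-level ones:

data a1_daily / view=a1_daily;
  set a1;
  date = datepart(datetime);    * collapse each datetime to its calendar day;
  format date date9.;
run;

proc summary data=a1_daily nway;
  class participant date;       * one output row per participant per day;
  var app_1-app_5;
  output out=daily_totals (drop=_type_ _freq_) sum=;
run;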


If you must do it at the second level (or finer), you'll want to avoid the cartesian join and instead keep track of the records you've already seen, along with their running sums. The short explanation of the algorithm is:

For each row:

  • Add this row's values to the rolling sums (at the end of the queue)
  • Check if the current item of the queue is outside of the period; if it is, subtract it from the rolling sums, and check the next item (repeat until not outside of the period), updating the current queue position
  • Return the sum at this point

This requires checking each row typically twice (except at odd boundaries, where no rows get popped off for several iterations because months have different numbers of days). It operates in O(n) time, much faster than the cartesian join, and on top of that requires far less memory/space (the cartesian join might need to hit disk).
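
To make the queue logic easier to follow in isolation, here is a single-window, single-variable sketch of the same idea (my simplification, using the test_data dataset generated below, which is sorted by dt_var; the full solutions generalize this to five variables and five windows via macros):

data rolling_24h;
  set test_data;                                   * head pointer: the current row;
  curr_dt = dt_var;
  retain tail 1;                                   * tail pointer: oldest row still in the window;
  sum_24h + app_1;                                 * add the current row (sum statement retains);
  do while (tail < _n_);
    set test_data (keep=dt_var app_1
                   rename=(dt_var=tail_dt app_1=tail_app)) point=tail;
    if intnx('dtday', tail_dt, 1, 's') ge curr_dt then leave;  * tail row still inside 24h;
    sum_24h + (-tail_app);                         * tail row fell out of the window - remove it;
    tail + 1;
  end;
  format curr_dt datetime21.3;
  keep curr_dt sum_24h;
run;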

The hash version of this solution is below. I think this will be the fastest solution that compares every row. Note that I intentionally make the test data have a value of 1 for every row and the same number of rows for every day; that lets you see how it works on a row-wise basis very easily. (For example, every 24h period has 481 rows, because I made exactly 480 rows per day, and the 481st is the row from the same time yesterday - if you change lt to le it will be 480, if you prefer not to include the same time yesterday.) You can also see that the 'month'-based periods will have slightly odd results at the boundaries where months change, because the '01FEB20xx' to '01MAY20xx' period has far fewer days (and thus rows) than, for example, the '01JUL20xx' to '01OCT20xx' period; 30/90/180-day periods would be better.

data test_data;
  array app[5] app_1-app_5;
  do _i = 1 to 130000;
    dt_var = datetime() - _i*180;
    do _j = 1 to dim(app);
      *app[_j] = floor(rand('Uniform')*6); *generate 0 to 5 integer;
      app[_j]=1;
    end;
    output;
  end;
  format dt_var datetime17.;
run;

proc sort data=test_data;
  by dt_var;
run;



%macro add(array=);
      do _i = 1 to dim(app);
        &array.[_i] + app[_i];
      end;
%mend add;



%macro subtract(array=);
      do _i = 1 to dim(app);
        &array.[_i] + (-1*app[_i]);
      end;
%mend subtract;

%macro process_array_add(array=);

  array app_&array. app_&array._1-app_&array._5;


  %add(array=app_&array.);

%mend process_array_add;

%macro process_array_subtract(array=, period=, number=);

  if _n_ eq 1 then do;
    declare hiter hi_&array.('td');
    rc_&array. = hi_&array..first();
  end;
  else do;
    rc_&array. = hi_&array..setcur(key:firstval_&array.);
  end;

  do while (intnx("&period.",dt_var,&number.,'s') lt curr_dt_var and rc_&array.=0);
    %subtract(array=app_&array.);
    rc_&array. = hi_&array..next();
  end;

  retain firstval_&array.;
  firstval_&array. = dt_var; 

%mend process_array_subtract;


data want;
  set test_data;

 * if _n_ > 10000 then stop;
  curr_dt_var = dt_var;
  array app[5] app_1-app_5;


  if _n_ eq 1 then do;
    declare hash td(ordered:'a');
    td.defineKey('dt_var');
    td.defineData('dt_var','app_1','app_2','app_3','app_4','app_5');
    td.defineDone();
  end;


  rc_a = td.add();

  *start macro territory;

  %process_array_add(array=24h);
  %process_array_add(array=1wk);
  %process_array_add(array=1mo);
  %process_array_add(array=3mo);
  %process_array_add(array=6mo);



  %process_array_subtract(array=24h,period=DTDay, number=1);
  %process_array_subtract(array=1wk,period=DTDay, number=7);
  %process_array_subtract(array=1mo,period=DTMonth, number=1);
  %process_array_subtract(array=3mo,period=DTMonth, number=3);
  %process_array_subtract(array=6mo,period=DTMonth, number=6);

  *end macro territory;

  rename curr_dt_var=dt_var;
  format curr_dt_var datetime21.3;
  drop dt_var rc: _:;

  output;


run;
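
If you prefer the 30/90/180-day windows suggested above, the only change (an adaptation, not code from the original) is to the three month-based subtract calls inside the data step:

  %process_array_subtract(array=1mo,period=DTDay, number=30);
  %process_array_subtract(array=3mo,period=DTDay, number=90);
  %process_array_subtract(array=6mo,period=DTDay, number=180);

And as a quick sanity check of either version's output: with every app value fixed at 1 and one row every 180 seconds, the 24-hour sums in want should top out at 481 (the row count mentioned above) once a full day of history has accumulated:

proc means data=want n min max;
  var app_24h_1-app_24h_5;
run;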