1
votes

I have a large sas dataset (1.5m obs, ~250 variables) that I need to split into several smaller sas datasets of equal size for batch processing. Each dataset needs to contain all the variables but only a fraction of the obs. What is the fastest way of doing this?

2

2 Answers

2
votes

You could do something like the following:

%macro splitds(inlib=,inds=,splitnum=,outid=);

  proc sql noprint;
    select nobs into :nobs
    from sashelp.vtable
    where libname=upcase("&inlib") and memname=upcase("&inds");
  quit;
  %put Number of observations in &inlib..&inds.: &nobs;

  data %do i=1 %to &splitnum.;
         &outid.&i
       %end;;
    set &inds.;
    %do j=1 %to (&splitnum.-1);
      %if &j.=1 %then %do;
        if
      %end;
      %else %do;
        else if
      %end;
                _n_<=((&nobs./&splitnum.)*&j.) then output &outid.&j.;
    %end;
    else output &outid.&splitnum.;
  run;
%mend;

An example call to split MYLIB.MYDATA into 10 data sets named NEWDATA1 - NEWDATA10 would be:

%splitds(inlib=mylib,inds=mydata,splitnum=10,outid=newdata);
1
votes

Try this. I haven't tested yet, so expect a bug somewhere. You will need to edit the macro call to BATCH_PROCESS to include the names of the datasets, number of new data sets, etc.

%macro nobs (dsn);
   %local nobs dsid rc;
   %let nobs=0;
   %let dsid = %sysfunc(open(&dsn));
   %if &dsid %then %do;
      %let nobs = %sysfunc(attrn(&dsid,NOBS));
   %end;
   %else %put Open for dataset &dsn failed - %sysfunc(sysmsg());
   %let rc   = %sysfunc(close(&dsid));
   &nobs
%mend nobs;

%macro batch_process(dsn_in,dsn_out_prefix,number_of_dsns);

  %let dsn_obs = &nobs(&dsn_in);
  %let obs_per_dsn = %sysevalf(&dsn_obs / &number_of_dsns);

  data
     %do i = 1 %to &number_of_dsns;
        &dsn_out_prefix.&i
     %end;
     ;
     set &dsn_in;
     drop _count;
     retain _count 0;
     _count = _count + 1;
     %do i = 1 %to &number_of_dsns;
        if (1 + ((&i - 1) * &obs_per_dsn)) <= _count <= (&i * &obs_per_dsn) then do;
           output &dsn_out_prefix.&i;
        end; 
     %end;
  run;

%mend batch_process;

%batch_process( dsn_in=DSN_NAME , dsn_out_prefix = PREFIX_ , number_of_dsns = 5 );