1
votes

I have two variables ID1 and ID2. They are both the same kinds of identifiers. When they appear in the same row of data it means they are in the same group. I want to make a group identifier for each ID. For example, I have

ID1   ID2
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3

Then I would want

ID   Group
1     1
2     1
3     2
4     1
5     1
6     1
7     2

Because 1,2,4,5,6 are paired by some combination in the original data they share a group. 3 and 7 are only paired with each other so they are a new group. I want to do this for ~20,000 rows. Every ID that is in ID1 is also in ID2 (more specifically if ID1=1 and ID2=2 for an observation, then there is another observation that is ID1=2 and ID2=1).

I've tried merging them back and forth but that doesn't work. I also tried call symput and trying to make a macro variable for each ID's group and then updating it as I move through rows, but I couldn't get that to work either.

3
Can you run Proc BOM, I forget the package name at the moment.Reeza
For 20k rows a hash-based approach should also be feasible, I expect.user667489
I'm having trouble understanding how proc bom works, could you give an example of how it would help me?DVL
@Reeza It's in SAS/OR.Joe
I've posted a hash-based answer - please confirm whether this behaves as you were expecting.user667489

3 Answers

0
votes

I have used Haikuo Bian's answer as a starting point to develop a slightly more complex algorithm that seems to work for all the test cases I have tried so far. It could probably be optimised further, but it copes with 20000 rows in under a second on my PC while using only a few MB of memory. The input dataset does not need to be sorted in any particular order, but as written it assumes that every row is present at least once with id1 < id2.

Test cases:

/* Original test case */
data have;
input id1 id2;
cards;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;
run;

/* Revised test case - all in one group with connecting row right at the end */
data have; 
input ID1 ID2; 
/*Make sure each row has id1 < id2*/
if id1 > id2 then do;
t_id2 = id2;
id2   = id1;
id1   = t_id2;
end;
drop t_id2;
cards; 
2 5 
4 8 
2 4 
2 6 
3 7 
4 1 
9 1 
3 2 
6 2 
7 3
;
run;

/*Full scale test case*/
data have;
    do _N_ = 1 to 20000;
        call streaminit(1);
        id1 = int(rand('uniform')*100000);
        id2 = int(rand('uniform')*100000);
        if id1 < id2 then output;
        t_id2 = id2;
        id2   = id1;
        id1   = t_id2;
        if id1 < id2 then output;
    end;
    drop t_id2; 
run;

Code:

option fullstimer;

data _null_;
    length id group 8;
    declare hash h();
    rc = h.definekey('id');
    rc = h.definedata('id');        
    rc = h.definedata('group');
    rc = h.definedone();

    array ids(2) id1 id2;
    array groups(2) group1 group2;

    /*Initial group guesses (greedy algorithm)*/
    do until (eof);
        set have(where = (id1 < id2)) end = eof;
        match = 0;
        call missing(min_group);
        do i = 1 to 2;
            rc = h.find(key:ids[i]);
            match + (rc=0);
            if rc = 0 then min_group = min(group,min_group);
        end;
        /*If neither id was in a previously matched group, create a new one*/
        if not(match) then do;
            max_group + 1;
            group = max_group;
        end;
        /*Otherwise, assign both to the matched group with the lowest number*/
        else group = min_group;
        do i = 1 to 2;
            id = ids[i];
            rc = h.replace();
        end;
    end;

    /*We now need to work through the whole dataset multiple times
      to deal with ids that were wrongly assigned to a separate group
      at the end of the initial pass, so load the table into a 
      hash object + iterator*/
    declare hash h2(dataset:'have(where = (id1 < id2))');
    rc = h2.definekey('id1','id2');
    rc = h2.definedata('id1','id2');
    rc = h2.definedone();
    declare hiter hi2('h2');

    change_count = 1;
    do while(change_count > 0);
        change_count = 0;
        rc = hi2.first();
        do while(rc = 0);
            /*Get the current group of each id from 
              the hash we made earlier*/
            do i = 1 to 2;
                rc = h.find(key:ids[i]);
                groups[i] = group;
            end;
            /*If we find a row where the two ids have different groups, 
              move the id in the higher group to the lower group*/
            if groups[1] < groups[2] then do;
                id = ids[2];
                group = groups[1];
                rc = h.replace();
                change_count + 1;           
            end;
            else if groups[2] < groups[1] then do;
                id = ids[1];
                group = groups[2];
                rc = h.replace();       
                change_count + 1;           
            end;
            rc = hi2.next();
        end;
        pass + 1;
        put pass= change_count=; /*For information only :)*/
    end;    

    rc = h.output(dataset:'want');

run;

/*Renumber the groups sequentially*/
proc sort data = want;
    by group id;
run;

data want;
    set want;
    by group;
    if first.group then new_group + 1;
    drop group;
    rename new_group = group;
run;

/*Summarise by # of ids per group*/
proc sql;
    select a.group, count(id) as FREQ 
        from want a
        group by a.group
        order by freq desc;
quit;   

Interestingly, the suggested optimisation of not checking the group of id2 during the initial pass if id1 is already matched actually slows things down a little in this extended algorithm, because it means that more work has to be done in the subsequent passes if id2 is in a lower numbered group. E.g. output from a trial run I did earlier:

With 'optimisation':

 pass=0 change_count=4696
 pass=1 change_count=204
 pass=2 change_count=23
 pass=3 change_count=9
 pass=4 change_count=2
 pass=5 change_count=1
 pass=6 change_count=0

 NOTE: DATA statement used (Total process time):
       real time           0.19 seconds
       user cpu time       0.17 seconds
       system cpu time     0.04 seconds
       memory              9088.76k
       OS Memory           35192.00k

Without:

 pass=0 change_count=4637
 pass=1 change_count=182
 pass=2 change_count=23
 pass=3 change_count=9
 pass=4 change_count=2
 pass=5 change_count=1
 pass=6 change_count=0

 NOTE: DATA statement used (Total process time):
       real time           0.18 seconds
       user cpu time       0.16 seconds
       system cpu time     0.04 seconds
0
votes

Please try the below code.

data have;
input ID1 ID2;
datalines;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;
run;

* Finding repeating in ID1;

proc sort data=have;by id1;run;

data want_1;

    set have;
    by id1;

    attrib flagrepeat length=8.;

    if not (first.id1 and last.id1) then flagrepeat=1;
    else flagrepeat=0;
run;

* Finding repeating in ID2;

proc sort data=want_1;by id2;run;

data want_2;
    set want_1;
    by id2;

    if not (first.id2 and last.id2) then flagrepeat=1;

run;

proc sort data=want_2 nodupkey;by id1 ;run;

data want(drop= ID2 flagrepeat rename=(ID1=ID));
    set want_2;
    attrib Group length=8.;

    if(flagrepeat eq 1) then Group=1;
    else Group=2;
run;

Hope this answer helps.

0
votes

Like one commentator mentioned, Hash does seem to be a viable approach. In the following code, 'id' and 'group' is maintained in the Hash table, new 'group' is added only when no 'id' match is found for the entire row. Please note, 'do over' is an undocumented feature, it can be easily replaced with a little bit more coding.

data have;
    input ID1   ID2;
    cards;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;

data _null_;
    if _n_=1 then
        do;
            declare hash h(ordered: 'a');
            h.definekey('id');
            h.definedata('id','group');
            h.definedone();
            call missing(id,group);
        end;

    set have end=last;
    array ids id1 id2;
    do over ids;
        rc=sum(rc,h.find(key:ids)=0);

        /*you can choose to 'leave' the loop here when first h.find(key:ids)=0 is met, for the sake of better efficiency*/
    end;

    if not rc > 0 then
        group+1;

    do over ids;
        id=ids;
        h.replace();
    end;
if last then rc=h.output(dataset:'want');
run;