1
votes

Relatively new to coding and using awk, so I apologise if this is a silly question! I need to compare $3 in file 1 to $3 in file 2 and, if they match, print the line from file 1 together with the corresponding entry from $10 in file 2. I have a command that does this:

# Two-file join on column 3.
#   Pass 1 (file2, while NR==FNR): remember $10 in a[], keyed by $3.
#   Pass 2 (file1): for each line whose $3 was seen in file2, print the line,
#   a tab, and the remembered value.
# `$3 in a` tests key membership rather than the truthiness of the stored
# value, so a stored $10 of "0" or "" still matches, and looking up an unseen
# key does not insert an empty element into a[].
awk 'NR==FNR{a[$3]=$10; next} ($3 in a) {print $0 "\t" a[$3]}' file2 file1

However, file2 has columns $10-$647, and I need to do the above for all 637 columns. Is there a way to loop this?

Example file 1:

 1  715348  rs3131984   T   G   100 PASS    AC=5008;AF=1;AN=5008;NS=2504;DP=16986;EAS_AF=1;AMR_AF=1;AFR_AF=1;EUR_AF=1;SAS_AF=1;AA=.|||;VT=SNP   GT  1|1 1|1 1|1
 1  723798  rs34882115  CAG C   100 PASS    AC=4012;AF=0.801118;AN=5008;NS=2504;DP=24752;EAS_AF=0.7946;AMR_AF=0.8775;AFR_AF=0.5416;EUR_AF=0.9602;SAS_AF=0.9407;VT=INDEL GT  1|1 1|1 1|1
 1  723891  rs2977670   G   C   100 PASS    AC=3906;AF=0.779952;AN=5008;NS=2504;DP=22718;EAS_AF=0.7917;AMR_AF=0.8689;AFR_AF=0.4849;EUR_AF=0.9483;SAS_AF=0.9305;AA=.|||;VT=SNP   GT  1|1 1|1 1|1
 1  729679  rs4951859   C   G   100 PASS    AC=3205;AF=0.639976;AN=5008;NS=2504;DP=18762;EAS_AF=0.6875;AMR_AF=0.7536;AFR_AF=0.2905;EUR_AF=0.841;SAS_AF=0.7761;AA=.|||;VT=SNP    GT  1|0 1|1 1|0
 1  752566  rs3094315   G   A   100 PASS    AC=3597;AF=0.718251;AN=5008;NS=2504;DP=21293;EAS_AF=0.8839;AMR_AF=0.804;AFR_AF=0.3873;EUR_AF=0.84;SAS_AF=0.8088;AA=.|||;VT=SNP  GT  0|1 1|1 0|1
 1  752721  rs3131972   A   G   100 PASS    AC=3272;AF=0.653355;AN=5008;NS=2504;DP=22729;EAS_AF=0.7659;AMR_AF=0.7363;AFR_AF=0.2905;EUR_AF=0.839;SAS_AF=0.7781;AA=.|||;VT=SNP    GT  0|1 1|1 0|1
 1  754182  rs3131969   A   G   100 PASS    AC=3398;AF=0.678514;AN=5008;NS=2504;DP=16315;EAS_AF=0.7331;AMR_AF=0.7565;AFR_AF=0.3525;EUR_AF=0.8718;SAS_AF=0.8088;AA=.|||;VT=SNP   GT  0|1 1|1 0|1
 1  754192  rs3131968   A   G   100 PASS    AC=3398;AF=0.678514;AN=5008;NS=2504;DP=16981;EAS_AF=0.7331;AMR_AF=0.7565;AFR_AF=0.3525;EUR_AF=0.8718;SAS_AF=0.8088;AA=.|||;VT=SNP   GT  0|1 1|1 0|1
 1  754334  rs3131967   T   C   100 PASS    AC=3427;AF=0.684305;AN=5008;NS=2504;DP=21917;EAS_AF=0.7629;AMR_AF=0.755;AFR_AF=0.3525;EUR_AF=0.8718;SAS_AF=0.8088;AA=.|||;VT=SNP    GT  0|1 1|1 0|1
 1  754503  rs3115859   G   A   100 PASS    AC=3325;AF=0.663938;AN=5008;NS=2504;DP=19944;EAS_AF=0.7629;AMR_AF=0.7378;AFR_AF=0.3374;EUR_AF=0.839;SAS_AF=0.771;AA=.|||;VT=SNP GT  0|1 1|1 0|1
 1  754964  rs3131966   C   T   100 PASS    AC=3322;AF=0.663339;AN=5008;NS=2504;DP=19476;EAS_AF=0.7629;AMR_AF=0.7378;AFR_AF=0.3366;EUR_AF=0.837;SAS_AF=0.771;AA=.|||;VT=SNP GT  0|1 1|1 0|1
 1  755887  rs3131964   C   G   100 PASS    AC=4905;AF=0.979433;AN=5008;NS=2504;DP=22796;EAS_AF=1;AMR_AF=0.9914;AFR_AF=0.9304;EUR_AF=0.995;SAS_AF=1;AA=.|||;VT=SNP  GT  1|1 1|1 1|1
 1  755890  rs3115858   A   T   100 PASS    AC=3763;AF=0.751398;AN=5008;NS=2504;DP=23185;EAS_AF=0.8839;AMR_AF=0.8242;AFR_AF=0.4539;EUR_AF=0.8728;SAS_AF=0.8405;AA=.|||;VT=SNP   GT  0|1 1|1 0|1
 1  756604  rs3131962   A   G   100 PASS    AC=3746;AF=0.748003;AN=5008;NS=2504;DP=28270;EAS_AF=0.8829;AMR_AF=0.8242;AFR_AF=0.4501;EUR_AF=0.8698;SAS_AF=0.8323;AA=.|||;VT=SNP   GT  0|1 1|1 0|1

Example file 2:

1   742429  rs3094315   A   G   .   .   .   GT  0/0 0/0
1   1011278 rs3737728   G   A   .   .   .   GT  0/0 0/1
1   1077546 rs9442380   C   T   .   .   .   GT  0/0 0/0
1   1084601 rs4970362   G   A   .   .   .   GT  0/0 0/1
1   1089205 rs9660710   C   A   .   .   .   GT  0/0 0/0
1   1300787 rs2765033   C   T   .   .   .   GT  0/0 0/1
1   756604  rs3131962   A   G   100 PASS    AC=3746;AF=0.748003;AN=5008;NS=2504;DP=28270;EAS_AF=0.8829;AMR_AF=0.8242;AFR_AF=0.4501;EUR_AF=0.8698;SAS_AF=0.8323;AA=.|||;VT=SNP   GT  0|1 1|1
1   1303878 rs2649588   T   C   .   .   .   GT  0/0 0/1
1   1695996 rs6603811   C   T   .   .   .   GT  0/0 0/0
1   1782971 rs10907192  G   A   .   .   .   GT  0/0 0/0
1   1878053 rs3820011   C   A   .   .   .   GT  0/1 0/1
1   1882185 rs2803291   C   T   .   .   .   GT  0/0 0/0

Is awk the best way to do this? I'm not really sure how to make loops of any sort. All help and explanations are much appreciated!

2
So you want to print 637 lines for each line matched? One for each field? – 123
I need to print 637 columns, based on a row match in columns across two files. – Hannah6746576
@Hannah6746576 - have you checked this Answer ? stackoverflow.com/questions/40523371/… – VIPIN KUMAR
It would be nice to get some feedback or something marked as an answer... – Riccardo Petraglia
Sorry for the delay. I ran the awk only command, and it has worked, but it has not tab separated the files. Thank you so much for your help and patience! – Hannah6746576

2 Answers

0
votes

I would do:

# Step 1: c1 = number of fields on the last line of file1.
# Step 2: paste glues line N of file1 to line N of file2, so after pasting,
#         field k of file2 is field (c1 + k). Lines are compared by position,
#         not looked up by key.
# The shell variable must be expanded ("$column_file1") when handed to awk;
# passing the bare name would give awk a non-numeric string, making $(3+c1)
# the same as $3 so that every line matches.
# The loop runs to NF so every pasted field is considered, whatever the width
# of file2.
# NOTE(review): i > c1+10 skips the first 10 fields of file2, so file2's own
# $10 is not printed; use i >= c1+10 if $10 should be kept.
$ column_file1=$(awk '{print NF}' file1 | tail -1)
$ paste file1 file2 | awk -v c1="$column_file1" '{if($3==$(3+c1)){for(i=1;i<=NF;i++)if(i<=c1 || i>c1+10){printf "%s ", $i}; printf "\n"}}'

paste just joins the two files line by line.

The first if in awk checks whether the third fields of the two files match (note that, after paste, the first field of the second file is $(1+c1)). If the condition is true, awk enters a loop that prints, on the same line (printf "%s ", $i), all the fields except the first 10 of the second file (if(i<=c1 || i>c1+10)). Once the loop has finished (so the whole line has been printed), a newline is printed. If the files are well structured (each field has the same width), you can use print $0 and pipe the output to colrm.

If you prefer to use only awk:

Copy the following into a file

#!/bin/awk -f
#
# Join two whitespace-separated files on column 3.
#
# Usage: awk -f <this_file> file2 file1
#
# Pass 1 (first file named, while NR == FNR): for each line, join fields 10
#   through the last field of that line with tabs and store the result in
#   a[], keyed by field 3. If a key occurs more than once, the last line wins.
# Pass 2 (second file named): for each line whose field 3 is a key in a[],
#   print the line unchanged, then a tab, then the stored fields.
#
# The loop stops at NF rather than a fixed column number, so the script adapts
# to however many columns the first file has and never appends empty padding
# fields.

NR == FNR {
    cols = $10
    for (i = 11; i <= NF; i++)
        cols = cols "\t" $i
    a[$3] = cols
    next
}

# Membership test: does not create a[$3] for keys that were never stored.
($3 in a) {
    print $0 "\t" a[$3]
}

then run it with

$ awk -f <name_of_file> file2 file1

Tell me if you have problems

EDIT:

I forgot a tail -1 in the first example. This works with the examples you provided.

-1
votes

try this -

File1.txt

    #cat file1.txt
    1 715348  rs3131984   T   G   100 PASS    AC=5008;AF=1;AN=5008;NS=2504;DP=16986;EAS_AF=1;AMR_AF=1;AFR_AF=1;EUR_AF=1;SAS_AF=1;AA=.|||;VT=SNP   GT  1|1 1|1 1|1
    1 723798  rs34882115  CAG C   100 PASS    AC=4012;AF=0.801118;AN=5008;NS=2504;DP=24752;EAS_AF=0.7946;AMR_AF=0.8775;AFR_AF=0.5416;EUR_AF=0.9602;SAS_AF=0.9407;VT=INDEL GT  1|1 1|1 1|1
    1 723891  rs2977670   G   C   100 PASS    AC=3906;AF=0.779952;AN=5008;NS=2504;DP=22718;EAS_AF=0.7917;AMR_AF=0.8689;AFR_AF=0.4849;EUR_AF=0.9483;SAS_AF=0.9305;AA=.|||;VT=SNP   GT  1|1 1|1 1|1
    1 729679  rs4951859   C   G   100 PASS    AC=3205;AF=0.639976;AN=5008;NS=2504;DP=18762;EAS_AF=0.6875;AMR_AF=0.7536;AFR_AF=0.2905;EUR_AF=0.841;SAS_AF=0.7761;AA=.|||;VT=SNP    GT  1|0 1|1 1|0
    1 752566  rs3094315   G   A   100 PASS    AC=3597;AF=0.718251;AN=5008;NS=2504;DP=21293;EAS_AF=0.8839;AMR_AF=0.804;AFR_AF=0.3873;EUR_AF=0.84;SAS_AF=0.8088;AA=.|||;VT=SNP  GT  0|1 1|1 0|1
    1 752721  rs3131972   A   G   100 PASS    AC=3272;AF=0.653355;AN=5008;NS=2504;DP=22729;EAS_AF=0.7659;AMR_AF=0.7363;AFR_AF=0.2905;EUR_AF=0.839;SAS_AF=0.7781;AA=.|||;VT=SNP    GT  0|1 1|1 0|1
    1 754182  rs3131969   A   G   100 PASS    AC=3398;AF=0.678514;AN=5008;NS=2504;DP=16315;EAS_AF=0.7331;AMR_AF=0.7565;AFR_AF=0.3525;EUR_AF=0.8718;SAS_AF=0.8088;AA=.|||;VT=SNP   GT  0|1 1|1 0|1
    1 754192  rs3131968   A   G   100 PASS    AC=3398;AF=0.678514;AN=5008;NS=2504;DP=16981;EAS_AF=0.7331;AMR_AF=0.7565;AFR_AF=0.3525;EUR_AF=0.8718;SAS_AF=0.8088;AA=.|||;VT=SNP   GT  0|1 1|1 0|1
    1 754334  rs3131967   T   C   100 PASS    AC=3427;AF=0.684305;AN=5008;NS=2504;DP=21917;EAS_AF=0.7629;AMR_AF=0.755;AFR_AF=0.3525;EUR_AF=0.8718;SAS_AF=0.8088;AA=.|||;VT=SNP    GT  0|1 1|1 0|1
    1 754503  rs3115859   G   A   100 PASS    AC=3325;AF=0.663938;AN=5008;NS=2504;DP=19944;EAS_AF=0.7629;AMR_AF=0.7378;AFR_AF=0.3374;EUR_AF=0.839;SAS_AF=0.771;AA=.|||;VT=SNP GT  0|1 1|1 0|1
    1 754964  rs3131966   C   T   100 PASS    AC=3322;AF=0.663339;AN=5008;NS=2504;DP=19476;EAS_AF=0.7629;AMR_AF=0.7378;AFR_AF=0.3366;EUR_AF=0.837;SAS_AF=0.771;AA=.|||;VT=SNP GT  0|1 1|1 0|1
    1 755887  rs3131964   C   G   100 PASS    AC=4905;AF=0.979433;AN=5008;NS=2504;DP=22796;EAS_AF=1;AMR_AF=0.9914;AFR_AF=0.9304;EUR_AF=0.995;SAS_AF=1;AA=.|||;VT=SNP  GT  1|1 1|1 1|1
    1 755890  rs3115858   A   T   100 PASS    AC=3763;AF=0.751398;AN=5008;NS=2504;DP=23185;EAS_AF=0.8839;AMR_AF=0.8242;AFR_AF=0.4539;EUR_AF=0.8728;SAS_AF=0.8405;AA=.|||;VT=SNP   GT  0|1 1|1 0|1
    1 756604  rs3131962   A   G   100 PASS    AC=3746;AF=0.748003;AN=5008;NS=2504;DP=28270;EAS_AF=0.8829;AMR_AF=0.8242;AFR_AF=0.4501;EUR_AF=0.8698;SAS_AF=0.8323;AA=.|||;VT=SNP   GT  0|1 1|1 0|1

file2.txt

#cat file2.txt
1   742429  rs3094315   A   G   .   .   .   GT  0/0 0/0
1   1011278 rs3737728   G   A   .   .   .   GT  0/0 0/1
1   1077546 rs9442380   C   T   .   .   .   GT  0/0 0/0
1   1084601 rs4970362   G   A   .   .   .   GT  0/0 0/1
1   1089205 rs9660710   C   A   .   .   .   GT  0/0 0/0
1   1300787 rs2765033   C   T   .   .   .   GT  0/0 0/1
1   756604  rs3131962   A   G   100 PASS    AC=3746;AF=0.748003;AN=5008;NS=2504;DP=28270;EAS_AF=0.8829;AMR_AF=0.8242;AFR_AF=0.4501;EUR_AF=0.8698;SAS_AF=0.8323;AA=.|||;VT=SNP   GT  0|1 1|1
1   1303878 rs2649588   T   C   .   .   .   GT  0/0 0/1
1   1695996 rs6603811   C   T   .   .   .   GT  0/0 0/0
1   1782971 rs10907192  G   A   .   .   .   GT  0/0 0/0
1   1878053 rs3820011   C   A   .   .   .   GT  0/1 0/1
1   1882185 rs2803291   C   T   .   .   .   GT  0/0 0/0

Join -

# Two-file join on column 3: while reading file2.txt, remember its column 10
# per key; then print each file1.txt row whose key was remembered, followed by
# that value (print's comma inserts OFS, a single space by default).
#awk 'FNR==NR {seen[$3]=$10; next} ($3 in seen) {print $0, seen[$3]}' file2.txt file1.txt
1 752566  rs3094315   G   A   100 PASS    AC=3597;AF=0.718251;AN=5008;NS=2504;DP=21293;EAS_AF=0.8839;AMR_AF=0.804;AFR_AF=0.3873;EUR_AF=0.84;SAS_AF=0.8088;AA=.|||;VT=SNP  GT  0|1 1|1 0|1 0/0
1 756604  rs3131962   A   G   100 PASS    AC=3746;AF=0.748003;AN=5008;NS=2504;DP=28270;EAS_AF=0.8829;AMR_AF=0.8242;AFR_AF=0.4501;EUR_AF=0.8698;SAS_AF=0.8323;AA=.|||;VT=SNP   GT  0|1 1|1 0|1 0|1