0
votes

I have two text files whose text is organized differently. Both files contain a few identical patterns (numbers) in the text. I'd like to find which patterns (numbers) are present in both files and write them to an output file.

file1.txt:

blablabla_25947.bkwjcnwelkcnwelckme

blablabla_111.bkwjcnwelkcnwelckme

blablabla_65155.bkwjcnwelkcnwelckme

blablabla_56412.bkwjcnwelkcnwelckme

file2.txt:

blablabla_647728.bkwjcnwelkcnwelck
kjwdhcwkejcwmekcjwhemckwejhcmwekch

blablabla_6387.bkwjcnwelkcnwelckme
wexkwhenqlciwuehnqweiugfnwekfiugew
wedhwnejchwenckhwqecmwequhcnkwjehc
owichjwmelcwqhemclekcelmkjcelkwejc

blablabla_59148.bkwjcnwelkcnwelckme
ecmwequhcnkwjehcowichjwmelcwqhemcle
kcelmkjcelkwejcwecawecwacewwAWWAXEG

blablabla_111.bkwjcnwelkcnwelckm
WESETRBRVSSCQEsfdveradassefwaefawecc

output_file.txt:

111
2
This is computationally a very hard problem, since you need to match "every possible length of string from every possible starting point". In the above, how many matches of b, bl, bla, blab, etc. would you expect to get? Is there a "minimum length of match" you would want? I think there is no built-in command that does exactly what you want; if you had the search strings on lines by themselves, it would be easy (using grep -F -f file1 file2), but you don't... Can you do anything to bound the problem better? – Floris
I had not appreciated that you only wanted to match the numbers... that does make it somewhat easier. – Floris

2 Answers

1
votes

How about:

$ egrep -o '_[0-9]+\.' file1 | grep -of - file2 | tr -d '_.'
111

# Redirect to new file
$ egrep -o '_[0-9]+\.' file1 | grep -of - file2 | tr -d '_.' > file3

The first grep extracts all the digit strings (preceded by _ and followed by .) from file1, and this list is then used as the set of patterns to grep for in file2. The _ and . are stripped using tr.

0
votes

I did in fact try to solve the "hard problem" that I thought you were posing. The following code looks for the longest string found in both file1 and file2. If there are multiple "longest" strings, it only reports the first one found. It may be helpful to someone at some point (although it may not be the solution you are looking for here):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>

/*
 * Return the size in bytes of the file named by file_name, as reported
 * by stat().
 *
 * This routine never reports an error to its caller: on any failure it
 * prints a diagnostic to stderr and terminates the process with
 * EXIT_FAILURE.  Besides a failing stat(), a size that does not fit in
 * the 'unsigned' return type is treated as a failure instead of being
 * silently truncated.
 */

static unsigned
get_file_size (const char * file_name)
{
    struct stat sb;
    if (stat (file_name, & sb) != 0) {
        fprintf (stderr, "'stat' failed for '%s': %s.\n",
                 file_name, strerror (errno));
        exit (EXIT_FAILURE);
    }
    /* st_size is an off_t, which may be wider than unsigned; refuse
       sizes that would not survive the conversion below. */
    if (sb.st_size < 0
        || (unsigned long long) sb.st_size
           > (unsigned long long) (unsigned) -1) {
        fprintf (stderr, "'%s' is too large to be handled.\n", file_name);
        exit (EXIT_FAILURE);
    }
    return (unsigned) sb.st_size;
}

/*
 * Read the entire file named by file_name into a freshly allocated
 * buffer and return it.
 *
 * The buffer holds the file's bytes followed by a terminating '\0', so
 * it may be handed to the str* functions (which will stop early if the
 * file itself contains a NUL byte).  The caller owns the buffer and
 * releases it with free().
 *
 * The expected length comes from get_file_size().  If allocating,
 * opening, reading or closing fails, a diagnostic is printed to stderr
 * and the process exits with EXIT_FAILURE; the routine never returns
 * NULL.
 */

static unsigned char *
read_whole_file (const char * file_name)
{
    unsigned s;
    size_t alloc_size;
    unsigned char * contents;
    FILE * f;
    size_t bytes_read;

    s = get_file_size (file_name);

    /* One extra byte for the terminator.  Do the addition in size_t so
       it cannot wrap in 'unsigned'; a result of 0 means it wrapped in
       size_t as well. */
    alloc_size = (size_t) s + 1;
    contents = NULL;
    if (alloc_size != 0) {
        contents = malloc (alloc_size);
    }
    if (! contents) {
        fprintf (stderr, "Not enough memory.\n");
        exit (EXIT_FAILURE);
    }

    /* Binary mode: the byte count from stat() only matches what fread()
       delivers when no line-ending translation takes place. */
    f = fopen (file_name, "rb");
    if (! f) {
        fprintf (stderr, "Could not open '%s': %s.\n", file_name,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    bytes_read = fread (contents, sizeof (unsigned char), s, f);
    if (bytes_read != s) {
        fprintf (stderr, "Short read of '%s': expected %u bytes "
                 "but got %zu: %s.\n", file_name, s, bytes_read,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    /* Terminate the data so callers can use strlen() and friends. */
    contents[bytes_read] = '\0';

    if (fclose (f) != 0) {
        fprintf (stderr, "Error closing '%s': %s.\n", file_name,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    return contents;
}

/*
 * Report the longest run of bytes that occurs in both input files.
 *
 * Usage: <program> file1 file2
 *
 * Both files are loaded with read_whole_file() and measured with
 * strlen(), so each is compared only up to its first NUL byte.
 * NOTE: this relies on read_whole_file() handing back a NUL-terminated
 * buffer.
 *
 * Every start position in file1 is compared against every start
 * position in file2.  When several common substrings share the maximum
 * length, the first one encountered is reported (lowest offset in file1,
 * then lowest offset in file2).  The match is remembered as an offset
 * and a length inside file1's buffer, so its length is not limited by
 * any fixed-size scratch array.
 *
 * Returns 0 on success and EXIT_FAILURE when the two file names are not
 * supplied.
 */
int main(int argc, char* argv[]){
    unsigned char *f1, *f2;   // buffers with entire file contents
    size_t l1, l2;            // number of bytes compared in each buffer
    size_t i1, i2;            // candidate start offsets in f1 / f2
    size_t lm;                // length of the current match
    size_t longestFound = 0;  // length of the best match so far
    size_t longestStart = 0;  // offset in f1 where the best match begins

    if (argc < 3) {
        fprintf(stderr, "Usage: %s file1 file2\n",
                argc > 0 ? argv[0] : "prog");
        return EXIT_FAILURE;
    }

    f1 = read_whole_file (argv[1]);
    f2 = read_whole_file (argv[2]);

    l1 = strlen((const char *) f1);
    l2 = strlen((const char *) f2);

    // A match starting at offset i can be at most (length - i) bytes
    // long, so stop scanning once the remainder cannot beat the best.
    for (i1 = 0; i1 < l1 && l1 - i1 > longestFound; i1++) {
        // Start at 0, not i1: the common text may sit earlier in f2
        // than it does in f1.
        for (i2 = 0; i2 < l2 && l2 - i2 > longestFound; i2++) {
            lm = 0;

            // Bounds are checked before the bytes are read.
            while (i1 + lm < l1 && i2 + lm < l2
                   && f1[i1 + lm] == f2[i2 + lm]) {
                lm++;
            }

            // Strictly greater keeps the first of equally long matches.
            if (lm > longestFound) {
                longestFound = lm;
                longestStart = i1;
            }
        }
    }

    printf("longest string found is %zu characters:\n", longestFound);
    fwrite(f1 + longestStart, 1, longestFound, stdout);
    putchar('\n');
    free(f1);
    free(f2);
    return 0;
}

The code for reading entire file contents was found at http://www.lemoda.net/c/read-whole-file/index.html