7
votes

Here is my toy dataframe.

df <- tibble::tribble(
  ~var1, ~var2, ~var3, ~var4, ~var5, ~var6, ~var7,
    "A",   "C",    1L,    5L,  "AA",  "AB",    1L,
    "A",   "C",    2L,    5L,  "BB",  "AC",    2L,
    "A",   "D",    1L,    7L,  "AA",  "BC",    2L,
    "A",   "D",    2L,    3L,  "BB",  "CC",    1L,
    "B",   "C",    1L,    8L,  "AA",  "AB",    1L,
    "B",   "C",    2L,    6L,  "BB",  "AC",    2L,
    "B",   "D",    1L,    9L,  "AA",  "BC",    2L,
    "B",   "D",    2L,    6L,  "BB",  "CC",    1L)

How can I get the combination of a minimum number of variables that uniquely identify the observations in the dataframe i.e which variables together can make the primary key?

The way I approached this problem is to find the combination of variables for which distinct values is equal to the number of observations of the data frame. So, those variable combinations that will give me 8 observation, in this case. I randomly tried that and found few:

df %>% distinct(var1, var2, var3)

df %>% distinct(var1, var2, var5)

df %>% distinct(var1, var3, var7)

So vars123, vars125, vars137 deserves to the Primary Key here. How can I find these variable combinations programmatically using R. Also, more preference should be given to character, factor, date, and (maybe) integer variables, if possible, as doubles should not make the Primary Key.

The output could be list or dataframe stating combinations "var1, var2, var3", "var1, var2, var5", "var1, var3, var7".

5

5 Answers

3
votes

A bit of a variation on the other answers, but here's the requested tabular output:

nms <- unlist(lapply(seq_len(length(df)), combn, x=names(df), simplify=FALSE), rec=FALSE)
out <- data.frame(
  vars = vapply(nms, paste, collapse=",", FUN.VALUE=character(1)),
  counts = vapply(nms, function(x) nrow(unique(df[x])), FUN.VALUE=numeric(1))
)

Then take the least number of variables required to be a primary key:

out[match(nrow(df), out$counts),]
#        vars counts
#12 var1,var6      8
3
votes

There may be a better way, but here's a brute-force method

combs <- lapply(seq(ncol(df)), function(x) combn(names(df), x, simplify = F))

keys <- list()
for(i in seq_along(combs)){
  keys[[i]] <- combs[[i]][sapply(combs[[i]], function(x) nrow(distinct(df[x])) == nrow(df))]
  if(length(keys[[i]])) stop(paste('Found key of', i, 'columns, stopping'))
}


keys

# [[1]]
# list()
# 
# [[2]]
# [[2]][[1]]
# [1] "var1" "var6"
# 
# [[2]][[2]]
# [1] "var4" "var6"
# 
# [[2]][[3]]
# [1] "var4" "var7"
3
votes

Here's a brute force method enumerating all possible combination of variables. It seems like there are 80 possible combinations that meets your criteria.

>df
  var1 var2 var3 var4 var5 var6 var7
1    A    C    1    5   AA   AB    1
2    A    C    2    5   BB   AC    2
3    A    D    1    7   AA   BC    2
4    A    D    2    3   BB   CC    1
5    B    C    1    8   AA   AB    1
6    B    C    2    6   BB   AC    2
7    B    D    1    9   AA   BC    2
8    B    D    2    6   BB   CC    1

>n<-ncol(df)
>combinations<-unlist(lapply(1:n, function(x) unlist(apply(combn(n,x),2,list), recursive=F) ), recursive=F)
>length(combinations)
[1] 127
>count_distinct<-sapply(combinations, function(x){ nrow(unique(df[,x,drop=F])) } )
>length(which(count_distinct==8))
[1] 80
>combinations[which(count_distinct==8)]
[[1]]
[1] 1 6

[[2]]
[1] 4 6

[[3]]
[1] 4 7

[[4]]
[1] 1 2 3

[[5]]
[1] 1 2 5

[[6]]
[1] 1 2 6

[[7]]
[1] 1 2 7

[[8]]
[1] 1 3 6

[[9]]
[1] 1 3 7

...
2
votes

Possible approach:

library(dplyr)

lst <- c()

for (i in 2:ncol(df)) {

  lst_combinations <- combn(names(df), i ,simplify=FALSE)

  lst <- c(lst, lst_combinations)

}

lst_results <- c()

for (i in 1:length(lst)) {

  nms <- lst[i][[1]]

  lgth_df <- df %>% .[, colnames(.) %in% nms] %>% distinct() %>% count()

  if (lgth_df$n == nrow(df)) {

    nms <- paste(nms, collapse = ", ")

    lst_results <- c(lst_results, nms)

  }

}

First few combinations (found 80 in total for your example):

[1] "var1, var6"                              
 [2] "var4, var6"                              
 [3] "var4, var7"                              
 [4] "var1, var2, var3"                        
 [5] "var1, var2, var5"                        
 [6] "var1, var2, var6"                        
 [7] "var1, var2, var7"                        
 [8] "var1, var3, var6"                        
 [9] "var1, var3, var7"                        
[10] "var1, var4, var6"
2
votes

Borrowing completely from thelatemail's answer and converting that to purrr:

library(tidyverse)

m_in_comb <- seq_len(length(df))

var_combs_listoflist <- map(m_in_comb, ~combn(x=names(df), m = .x, simplify=F)) %>% 
  unlist(recursive = F)

var_combs_listofchr  <-  map_chr(var_combs_listoflist, ~paste(.x, collapse = ","))

distinct_obs_per_var_comb = map_int(var_combs_listoflist, ~(select(df, .x) %>% n_distinct()))

keys <- tibble(var_combs = var_combs_listofchr, distinct_count = distinct_obs_per_var_comb)

primarykeys <- keys %>% 
   filter(distinct_count==nrow(df)) %>% 
   mutate(n_vars = str_count(var_combs, ",")+1) %>% 
   filter(n_vars==min(n_vars))