I have a database of family relations:
with example_data as(
SELECT 'abc' as relative_1, 'def' as relative_2
union all
SELECT 'abc' as relative_1, '123' as relative_2
union all
SELECT 'def' as relative_1, '334' as relative_2
union all
SELECT 'fdc' as relative_1, '123' as relative_2
union all
SELECT 'fgl' as relative_1, '342' as relative_2
)
How can I create base on this data full family clans, so that I receive this kind of result:
The code I wrote to create the desired output does not seem at all practical and indeed with a table of 100k rows of input my query hits the 6 hour limit after the 5th self-join. I'm not worried that might end up with the whole table connected in one long string -- I know there are only so many family members in a clan.
Also alternatively, it would be great, if it was possible to return the result as a nested table with list_all_relatives
as a repeated field with minimum numbers of steps between relative_1 and the respective distant relative.
My inefficient code that returns the result in the image:
SELECT 'abc' as relative_1, 'def' as relative_2, 'abc' as list_0
union all
SELECT 'abc' as relative_1, '123' as relative_2, 'abc' as list_0
union all
SELECT 'def' as relative_1, '334' as relative_2, 'def' as list_0
union all
SELECT 'fdc' as relative_1, '123' as relative_2, 'fdc' as list_0
union all
SELECT 'fgl' as relative_1, '342' as relative_2, 'fgl' as list_0
)
,
step_0 as (
SELECT relative_1, relative_2,
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(relative_1,',',relative_2,',',list_0), ',')) AS x ORDER BY x), ',') AS combined_list
from raw_data
)
,
step_1 as (
SELECT relative_1, relative_2, list_1,
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(relative_1,',',relative_2,',',list_1), ',')) AS x ORDER BY x), ',') AS combined_list
from
step_0
left join
(select relative_2,combined_list as list_1 from step_0)
using(relative_2)
)
,
step_2 as (
SELECT distinct * except (combined_list,list_1),
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(combined_list,',',list_2), ',')) AS x ORDER BY x), ',') AS combined_list,
from
step_1
left join
(select relative_1,combined_list as list_2 from step_1)
using(relative_1)
)
,
step_3 as (
SELECT distinct * except (combined_list,list_2),
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(combined_list,',',list_3), ',')) AS x ORDER BY x), ',') AS combined_list,
from
step_2
left join
(select relative_2,combined_list as list_3 from step_2)
using(relative_2)
)
,
step_4 as (
SELECT distinct * except (combined_list,list_3),
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(combined_list,',',list_4), ',')) AS x ORDER BY x), ',') AS combined_list,
from
step_3
left join
(select relative_1,combined_list as list_4 from step_3)
using(relative_1)
),
step_N as (
SELECT *
from step_4
)
,
step_prefinal as (
SELECT distinct
relative_1,list_4, combined_list,
1+length(REGEXP_REPLACE(REGEXP_REPLACE(LOWER(combined_list), '[a-z]', ''),'[0-9]','')) as n_elements_in_list,
max(1+length(REGEXP_REPLACE(REGEXP_REPLACE(LOWER(combined_list), '[a-z]', ''),'[0-9]',''))) over (partition by relative_1) as longest_list_relatives
from step_N
)
,
step_final as (
SELECT relative_1, combined_list, n_elements_in_list,
count(*) over (partition by relative_1) as cnt_lists_per_relative,
max(n_elements_in_list) over (partition by relative_1) as max_elements
from
step_prefinal
where true
and longest_list_relatives = n_elements_in_list
group by 1,2,3
)
,
stats as (
SELECT cnt_lists_per_relative, count(distinct relative_1) as cnt,
max(max_elements) as max_elements
from
step_final
group by 1
order by 1
)
SELECT relative_1, combined_list as list_all_relatives
from step_final
where true