I try to run a big query which involve joining 5 tables. The query need to run every half hour, most time it completes in matter of seconds, sometimes it takes hours, worst scenario is it fails at the end and error is "Connection error, please try again". I'm not using "join each" but "join" only due to performance reason and underlying dataset seems not too big for now.
The query is:
select
'2014-10-13' as date,
i.eventId as event_id,
e.event_name as event_name,
datediff('2014-10-13', e.event_start_date)+1 as event_day,
i.impressionCts as total_viewers,
round(i.top10Cts*100/i.impressionCts) as percent_pos_top10,
round(i.ct1020*100/i.impressionCts) as percent_pos_1020,
round(i.ct2030*100/i.impressionCts) as percent_pos_2030,
round(i.ct3050*100/i.impressionCts) as percent_pos_3050,
round(i.over50Cts*100/i.impressionCts) as percent_pos_50above,
p.purchasers as purchasers,
round(p.purchasers*100/i.impressionCts) as conversion_rate_view_to_purchase,
p.total_units as total_purchased_units,
round(p.total_demand,2) as total_purchased_demand,
ck.total_clickiers as total_clickers,
ck.total_clicks as total_clicks,
round(ck.total_clickiers*100/i.impressionCts) as conversion_rate_view_to_click,
round(ck.clickers_pos_top10*100/i.top10Cts) as conversion_rate_view_to_click_pos_top10,
round(ck.clickers_pos_1020*100/i.ct1020) as conversion_rate_view_to_click_pos_1020,
round(ck.clickers_pos_2030*100/i.ct2030) as conversion_rate_view_to_click_pos_2030,
round(ck.clickers_pos_3050*100/i.ct3050) as conversion_rate_view_to_click_pos_3050,
round(ck.clickers_pos_above50Cts/i.over50Cts) as conversion_rate_view_to_click_pos_50above,
ca.cartadders as total_cartadders,
ca.qty as total_qty_addedToCart,
round(ca.cartadders*100/ck.total_clickiers) as conversion_rate_click_to_cartadd,
round(p.purchasers*100/ca.cartadders) as conversion_rate_cartadd_to_purchase
from
(select integer(eventId) as eventId,count(distinct customerId) as impressionCts,
count(distinct if(position < 10, customerId,null) ) as top10Cts,
count(distinct if(position between 10 and 19, customerId,null)) as ct1020,
count(distinct if(position between 20 and 29, customerId,null)) as ct2030,
count(distinct if(position between 30 and 49, customerId,null)) as ct3050,
count(distinct if(position >= 50, customerId,null)) as over50Cts
from clickstream.event_impression_20141013 group each by eventId) i
join
(select magenta_event_id,event_name,event_start_date,event_end_date from zudw.event where date(event_start_date)<='2014-10-13' and date(event_end_date)>='2014-10-13') e
on e.magenta_event_id = i.eventId
left join
(select
integer(eventId) as eventId,count(distinct customerId) as total_clickiers, count(customerId) as total_clicks,
count(distinct if(position < 10, customerId,null) ) as clickers_pos_top10,
count(distinct if(position between 10 and 19, customerId,null)) as clickers_pos_1020,
count(distinct if(position between 20 and 29, customerId,null)) as clickers_pos_2030,
count(distinct if(position between 30 and 49, customerId,null)) as clickers_pos_3050,
count(distinct if(position >= 50, customerId,null)) as clickers_pos_above50Cts
from clickstream.click_20141013 where type='event'
and (refererUrl is null or refererUrl='http://www.zulily.com/' or instr(refererUrl,'.zulily.com/new-today/')>0 or instr(refererUrl,'.zulily.com/newToday')>0
or instr(refererUrl,'.zulily.com/endsSoon')>0 or instr(refererUrl,'.zulily.com/toys')>0 or instr(refererUrl,'.zulily.com/ready')>0
or instr(refererUrl,'.zulily.com/lastDay/')>0 or instr(refererUrl,'.zulily.com/shopByAge/')>0 or instr(refererUrl,'.zulily.com/shopByCategory/')>0
or instr(refererUrl,'.zulily.com/girls')>0 or instr(refererUrl,'.zulily.com/boys')>0 or instr(refererUrl,'.zulily.com/women')>0 or instr(refererUrl,'.zulily.com/men')>0
or instr(refererUrl,'.zulily.com/home')>0 or instr(refererUrl,'.zulily.com/shoes')>0 or instr(refererUrl,'.zulily.com/health')>0 or instr(refererUrl,'.zulily.com/baby')>0)
group each by eventId
) ck on i.eventId=ck.eventId
left join
(select integer(eventId) as eventId, count(distinct customerId) as cartadders, sum(qty) as qty
from clickstream.cartadd_20141013 group each by eventId
) ca on i.eventId=ca.eventId
left join
(select integer(eventId) as eventId,
count(distinct customerId) as purchasers,
sum(qty) as total_units,
sum(price) as total_demand
from clickstream.purchase_20141013
group each by eventId
) p on i.eventId = p.eventId
where date(e.event_start_date)<='2014-10-13'