2
votes

I'm using BigQuery to report on Google Analytics data. I'm trying to recreate landing page data using BigQuery.

The following query reports 18% fewer sessions than in the Google Analytics interface:

SELECT DISTINCT
  fullVisitorId,
  visitID,
  h.page.pagePath AS LandingPage
FROM
  `project-name.dataset.ga_sessions_*`, UNNEST(hits) AS h
WHERE 
  hitNumber = 1
AND h.type = 'PAGE'
AND _TABLE_SUFFIX BETWEEN '20170331' AND '20170331'
ORDER BY fullVisitorId DESC

Where am I going wrong with my approach? Why can't I get to within a small margin of the number in the GA interface's reported figure?

4

4 Answers

4
votes

Multiple reasons :

1.Big Query for equivalent landing page:

SELECT
  LandingPage,
  COUNT(sessionId) AS Sessions,
  100 * SUM(totals.bounces)/COUNT(sessionId) AS BounceRate,
  AVG(totals.pageviews) AS AvgPageviews,
  SUM(totals.timeOnSite)/COUNT(sessionId) AS AvgTimeOnSite,
from(
  SELECT
    CONCAT(fullVisitorId,STRING(visitId)) AS sessionID,
    totals.bounces,
    totals.pageviews,
    totals.timeOnSite,
    hits.page.pagePath AS landingPage
  FROM (
    SELECT
      fullVisitorId,
      visitId,
      hits.page.pagePath,
      totals.bounces,
      totals.pageviews,
      totals.timeOnSite,
      MIN(hits.hitNumber) WITHIN RECORD AS firstHit,
      hits.hitNumber AS hitNumber
    FROM (TABLE_DATE_RANGE ([XXXYYYZZZ.ga_sessions_],TIMESTAMP('2016-08-01'), TIMESTAMP ('2016-08-31')))
    WHERE
      hits.type = 'PAGE'
      AND hits.page.pagePath'')
  WHERE
    hitNumber = firstHit)
GROUP BY
  LandingPage
ORDER BY
  Sessions DESC,
  LandingPage

Next :

Pre-calculated data -- pre-aggregated tables

These are the precalculated data that Google uses to speed up the UI. Google does not specify when this is done but it can be at any point of the time. These are known as pre-aggregated tables


So if you compare the numbers from GA UI to your Big Query output, you will always see a discrepancy. Please go ahead and rely on your big query data .

1
votes

You can achieve the same thing by simply adding the below to your select statement:

,(SELECT page.pagePath FROM UNNEST(hits) WHERE hitnumber = (SELECT MIN(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE')) landingpage

I can get a 1 to 1 match with the GA UI on my end when I run something like below, which is a bit more concise than the original answer:

SELECT DISTINCT
   a.landingpage
  ,COUNT(DISTINCT(a.sessionId)) sessions
  ,SUM(a.bounces) bounces
  ,AVG(a.avg_pages) avg_pages
  ,(SUM(tos)/COUNT(DISTINCT(a.sessionId)))/60 session_duration
FROM
(
    SELECT DISTINCT 
       CONCAT(CAST(fullVisitorId AS STRING),CAST(visitStartTime AS STRING)) sessionId
      ,(SELECT page.pagePath FROM UNNEST(hits) WHERE hitnumber = (SELECT MIN(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE')) landingpage
      ,totals.bounces bounces
      ,totals.timeonsite tos
      ,(SELECT COUNT(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE') avg_pages
    FROM `tablename_*`
      WHERE _TABLE_SUFFIX >= '20180801'
       AND _TABLE_SUFFIX <= '20180808'
        AND totals.visits = 1   
) a
GROUP BY 1
0
votes

another way here! you can get the same number :

    SELECT 
LandingPage,
COUNT(DISTINCT(sessionID)) AS sessions
FROM(
SELECT    
    CONCAT(fullVisitorId,CAST(visitId AS STRING)) AS sessionID,
    FIRST_VALUE(hits.page.pagePath) OVER (PARTITION BY  CONCAT(fullVisitorId,CAST(visitId AS STRING)) ORDER BY hits.hitNumber ASC ) AS LandingPage
FROM
    `xxxxxxxx1.ga_sessions_*`,
    UNNEST(hits) AS hits
  WHERE
    _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
    AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
    AND hits.type ='PAGE'
GROUP BY fullVisitorId, visitId, sessionID,hits.page.pagePath,hits.hitNumber
)
GROUP BY LandingPage
ORDER BY sessions DESC
0
votes

There is a hit.isEntrance field in the schema that can be used for this purpose. The example below would show you yesterday's landing pages:

#standardSQL
select
  date,
  hits.page.pagePath as landingPage,
  sum(totals.visits) as visits,
  sum(totals.bounces) as bounces,
  sum(totals.transactions) as transactions
from
  `project.dataset.ga_sessions_*`,
  unnest(hits) as hits
where
  (_table_suffix
    between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
    and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
  and hits.isEntrance = True
  and totals.visits = 1 #avoid counting midnight-split sessions
group by
  1, 2
order by 3 desc

There is still one source of discrepancy though, which comes from the sessions without a landing page (if you check in GA in the landing pages report, there will sometimes be a (not set) value.

In order to include those as well, you can do:

with
landing_pages_set as (
  select
    concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
    hits.page.pagePath as virtualPagePath
  from
    `project.dataset.ga_sessions_*`,
    unnest(hits) as hits
  where
    (_table_suffix
      between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
      and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
    and totals.visits = 1 #avoid counting midnight-split sessions
    and hits.isEntrance = TRUE
  group by 1, 2
),

landing_pages_not_set as (
  select
    concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
    date,
    "(not set)" as virtualPagePath,
    count(distinct concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string))) as visits,
    sum(totals.bounces) as bounces,
    sum(totals.transactions) as transactions
  from
    `project.dataset.ga_sessions_*`
  where
    (_table_suffix
      between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
      and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
    and totals.visits = 1 #avoid counting midnight-split sessions
  group by 1, 2, 3
),

landing_pages as (
  select
    l.fullVisitId as fullVisitId,
    date,
    coalesce(r.virtualPagePath, l.virtualPagePath) as virtualPagePath,
    visits,
    bounces,
    transactions
  from
    landing_pages_not_set l left join landing_pages_set r on l.fullVisitId = r.fullVisitId
)

select virtualPagePath, sum(visits) from landing_pages group by 1 order by 2 desc