0
votes

I have a SQL table (in BigQuery) with possible duplicated rows. The table has over 20 columns, some of them are nested (data type "STRUCT)". I want to deduplicate the table.

I can't simply query SELECT DISTINCT * because I get an error

Query error: Column options of type STRUCT cannot be used in SELECT DISTINCT

So far, I tried to create a unique ID based on a hash of certain columns. I have now this unique ID (called sha256), but I can't figure out a way of selecting only rows with unique hash.

I tried to GROUP BY, but it doesn't work with STRUCT type, and I tried also to INNER JOIN with a table containing only unique hashed, but I get duplicates also.

For reference, here are 2 example rows of the dataset:

  {
    "sha256": "un2k3TUtzwzmQMvxfrjztsh/A/GW3WWzV4U4CezqceA=",
    "has_phone": true,
    "options": {
      "sub_toplist": true,
      "gallery": false,
      "urgent": false,
      "has_option": true,
      "photosup": true,
      "booster": false
    },
    "calendar": {
      "dates": null
    },
    "owner": {
      "siren": null,
      "pro_rates_link": null,
      "user_id": "f0d94687-1a24-4ed4-8adb-7faded053ca8",
      "type": "private",
      "no_salesmen": true,
      "name": "marius",
      "store_id": "5022456"
    },
    "location": {
      "feature": {
        "properties": null,
        "geometry": {
          "coordinates": [
            "9.41733",
            "42.54701"
          ],
          "type": "Point"
        },
        "type": "Feature"
      },
      "is_shape": true,
      "provider": "here",
      "lng": "9.41733",
      "lat": "42.54701",
      "zipcode": "20290",
      "city_label": "Lucciana 20290",
      "city": "Lucciana",
      "region_name": "Corse",
      "department_name": null,
      "source": "city",
      "department_id": "0",
      "region_id": "9"
    },
    "attributes": {
      "pro_rates_link": null,
      "immo_sell_type": "old",
      "ges": "a",
      "square": "92",
      "rooms": "4",
      "energy_rate": "b",
      "is_import": false,
      "custom_ref": null,
      "lease_type": "sell",
      "real_estate_type": "1",
      "fai_included": null,
      "type_real_estate_sale": null
    },
    "price_calendar": null,
    "price": [
      "270000"
    ],
    "body": "text",
    "url": "https://www.example.fr/ventes_immobilieres/1729537955.htm",
    "category_name": "Ventes immobilières",
    "category_id": "9",
    "images": {
      "urls_thumb": [
        "https://img3.example.fr/ad-thumb/d63e236ce3546906b3ce661640a7cf858d0a0593.jpg"
      ],
      "urls": [
        "https://img3.example.fr/ad-image/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",
      ],
      "nb_images": "7",
      "small_url": "https://img3.example.fr/ad-small/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",
      "thumb_url": "https://img3.example.fr/ad-thumb/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg"
    },
    "ad_type": "offer",
    "first_publication_date": "2020-01-02 15:00:46 UTC",
    "status": "active",
    "subject": "Villa à Lucciana",
    "index_date": "2020-01-16 15:00:45 UTC",
    "expiration_date": "2020-03-02 15:00:46 UTC",
    "list_id": "1729537955"
  },
  {
    "sha256": "wCMrggkqSJ3PgbkuWAgBpCMtFfkJDRlz6TOeO5Nngsg=",
    "has_phone": true,
    "options": {
      "sub_toplist": false,
      "gallery": false,
      "urgent": false,
      "has_option": false,
      "photosup": false,
      "booster": false
    },
    "calendar": {
      "dates": null
    },
    "owner": {
      "siren": null,
      "pro_rates_link": null,
      "user_id": "ae0f432d-0aa2-4828-a20b-3472255588b4",
      "type": "private",
      "no_salesmen": true,
      "name": "M.Milleliri",
      "store_id": "12132533"
    },
    "location": {
      "feature": {
        "properties": null,
        "geometry": {
          "coordinates": [
            "9.1917",
            "41.54506"
          ],
          "type": "Point"
        },
        "type": "Feature"
      },
      "is_shape": true,
      "provider": "here",
      "lng": "9.1917",
      "lat": "41.54506",
      "zipcode": "20146",
      "city_label": "Sotta 20146",
      "city": "Sotta",
      "region_name": "Corse",
      "department_name": null,
      "source": "city",
      "department_id": "0",
      "region_id": "9"
    },
    "attributes": {
      "pro_rates_link": null,
      "immo_sell_type": "old",
      "ges": "Non renseigné",
      "square": null,
      "rooms": null,
      "energy_rate": "Non renseigné",
      "is_import": false,
      "custom_ref": null,
      "lease_type": "sell",
      "real_estate_type": "3",
      "fai_included": null,
      "type_real_estate_sale": null
    },
    "price_calendar": null,
    "price": [
      "100000"
    ],
    "body": "text",
    "url": "https://www.example.fr/ventes_immobilieres/1736199673.htm",
    "category_name": "Ventes immobilières",
    "category_id": "9",
    "images": {
      "urls_thumb": [
        "https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
      ],
      "urls": [
        "https://img3.example.fr/ad-image/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
      ],
      "urls_large": [
        "https://img3.example.fr/ad-large/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
      ],
      "nb_images": "1",
      "small_url": "https://img3.example.fr/ad-small/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg",
      "thumb_url": "https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
    },
    "ad_type": "offer",
    "first_publication_date": "2020-01-16 14:21:05 UTC",
    "status": "active",
    "subject": "Terrain 1250 m2 Sotta",
    "index_date": "2020-01-16 14:21:05 UTC",
    "expiration_date": "2020-03-16 14:21:05 UTC",
    "list_id": "1736199673"
  }

and the query I'm working on so far:

WITH
  table_unique_hash AS (
  SELECT
    DISTINCT(SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING)))) AS sha256
  FROM
    `test_bucket_data.daily_table`),

  table_hashed AS (
  SELECT
    SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING))) AS sha256, *
  FROM
    `test_bucket_data.daily_table`)
SElECT * FROM table_hashed
limit 10;

A solution would be to find a way of inner join the table_hashed and the table_unique_hash on the sha256 column...

Thanks for your help!

1
. . My (and perhaps "the") general solution to limitation on structs is to convert the values to JSON. Strings are generally fine. That said, the conversion back to the nested structures may be a bit painful.Gordon Linoff

1 Answers

2
votes

I found a workaround based on this topic. The combination of GROUP BY and ANY function for all the STRUCT columns made it work!

SELECT
  has_phone,
  ANY_VALUE(options) as options,
  ANY_VALUE(calendar) as calendar,
  ANY_VALUE(owner) as owner,
  ANY_VALUE(location) as location,
  ANY_VALUE(attributes) as attributes,
  price_calendar,
  price,
  body,
  url,
  category_name,
  category_id,
  ANY_VALUE(images) as images,
  ad_type,
  first_publication_date,
  status,
  subject,
  index_date,
  expiration_date,
  list_id,
FROM
  `{table_name}`
Group by
  has_phone,
  price_calendar,
  price,
  body,
  url,
  category_name,
  category_id,
  ad_type,
  first_publication_date,
  status,
  subject,
  index_date,
  expiration_date,
  list_id

Note: my "price" field was previously an array; I transformed it in my source json to an int