I'm trying to load the json data from an API into bigquery table on GCP however I got an issue that the json data seem to miss a square bracket so it got an error '"Repeated record with name trip_update added outside of an array."}]'. I don't know how Here is the data sample:
{
"header": {
"gtfs_realtime_version": "1.0",
"timestamp": 1607630971
},
"entity": [
{
"id": "65.5.17-120-cm1-1.18.O",
"trip_update": {
"trip": {
"trip_id": "65.5.17-120-cm1-1.18.O",
"start_time": "18:00:00",
"start_date": "20201210",
"schedule_relationship": "SCHEDULED",
"route_id": "17-120-cm1-1"
},
"stop_time_update": [
{
"stop_sequence": 1,
"departure": {
"delay": 0
},
"stop_id": "8220B1351201",
"schedule_relationship": "SCHEDULED"
},
{
"stop_sequence": 23,
"arrival": {
"delay": 2340
},
"departure": {
"delay": 2340
},
"stop_id": "8260B1025301",
"schedule_relationship": "SCHEDULED"
}
]
}
}
]
}
Here is a schema and code: schema
[
{ "name":"header",
"type": "record",
"fields": [
{ "name":"gtfs_realtime_version",
"type": "string",
"description": "version of speed specification"
},
{ "name": "timestamp",
"type": "integer",
"description": "The moment where this dataset was generated on server e.g. 1593102976"
}
]
},
{"name":"entity",
"type": "record",
"mode": "REPEATED",
"description": "Multiple entities can be included in the feed",
"fields": [
{"name":"id",
"type": "string",
"description": "unique identifier for the entity"
},
{"name": "trip_update",
"type": "struct",
"mode": "REPEATED",
"description": "Data about the realtime departure delays of a trip. At least one of the fields trip_update, vehicle, or alert must be provided - all these fields cannot be empty.",
"fields": [
{ "name":"trip",
"type": "record",
"mode": "REPEATED",
"fields": [
{"name": "trip_id",
"type": "string",
"description": "selects which GTFS entity (trip) will be affected"
},
{ "name":"start_time",
"type": "string",
"description": "The initially scheduled start time of this trip instance 13:30:00"
},
{ "name":"start_date",
"type": "string",
"description": "The start date of this trip instance in YYYYMMDD format. Whether start_date is required depends on the type of trip: e.g. 20200625"
},
{ "name":"schedule_relationship",
"type": "string",
"description": "The relation between this trip and the static schedule e.g. SCHEDULED"
},
{ "name":"route_id",
"type": "string",
"description": "The route_id from the GTFS feed that this selector refers to e.g. 10-263-e16-1"
}
]
}
]
},
{ "name":"stop_time_update",
"type": "record",
"mode": "REPEATED",
"description": "Updates to StopTimes for the trip (both future, i.e., predictions, and in some cases, past ones, i.e., those that already happened). The updates must be sorted by stop_sequence, and apply for all the following stops of the trip up to the next specified stop_time_update. At least one stop_time_update must be provided for the trip unless the trip.schedule_relationship is CANCELED - if the trip is canceled, no stop_time_updates need to be provided.",
"fields": [
{"name":"stop_sequence",
"type": "string",
"description": "Must be the same as in stop_times.txt in the corresponding GTFS feed e.g 3"
},
{ "name":"arrival",
"type": "record",
"mode": "REPEATED",
"fields": [
{ "name":"delay",
"type": "string",
"description": "Delay (in seconds) can be positive (meaning that the vehicle is late) or negative (meaning that the vehicle is ahead of schedule). Delay of 0 means that the vehicle is exactly on time e.g 5"
}
]
},
{ "name": "departure",
"type": "record",
"mode": "REPEATED",
"fields": [
{ "name":"delay",
"type": "integer"
}
]
},
{ "name":"stop_id",
"type": "string",
"description": "Must be the same as in stops.txt in the corresponding GTFS feed e.g. 8430B2552301"
},
{"name":"schedule_relationship",
"type": "string",
"description": "The relation between this StopTime and the static schedule e.g. SCHEDULED , SKIPPED or NO_DATA"
}
]
}
]
}
]
function (following google guideline https://cloud.google.com/solutions/streaming-data-from-cloud-storage-into-bigquery-using-cloud-functions?authuser=2#before-you-begin)
def _insert_into_bigquery(bucket_name, file_name):
blob = CS.get_bucket(bucket_name).blob(file_name)
row = json.loads(blob.download_as_string())
table = BQ.dataset(BQ_DATASET).table(BQ_TABLE)
errors = BQ.insert_rows_json(table,
json_rows=row,
ignore_unknown_values=True,
retry=retry.Retry(deadline=30))
if errors != []:
raise BigQueryError(errors)