I have a GCS bucket from which I'm trying to read about 200k files and then write their contents to BigQuery. The problem is that I'm having trouble creating a PCollection of those files that works with the rest of the code. I'm following this tutorial for reference.
I have this code:
from __future__ import absolute_import
import argparse
import logging
import os
from past.builtins import unicode
import apache_beam as beam
from apache_beam.io import ReadFromText, ReadAllFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from google.cloud import storage
import regex as re
# storage_client = storage.Client()
# bucket = storage_client.get_bucket('mybucket')
#
# blobs = bucket.list_blobs()
# l=list(blobs)
# x=[y.name for y in l]
# c=x[1:]
# print(len(c))
files = ['gs://mybucket/_chunk1',
         'gs://mybucket/_chunk0']


class DataIngestion:
    """A helper class which contains the logic to translate the file into
    a format BigQuery will accept."""

    def parse_method(self, string_input):
        x = """{}""".format(string_input)
        rx = re.compile(r"""\{[^{}]+\}(*SKIP)(*FAIL)|,""")
        d = {}
        d['name'], d['date'], d['geometry'], d['value0'], d['value1'], d['value2'] = rx.split(x)
        d['geometry'] = d['geometry'].strip('"')
        return d


def run(argv=None):
    """Main entry point; defines and runs the pipeline."""
    data_ingestion = DataIngestion()
    p = beam.Pipeline(options=PipelineOptions())

    (p
     | 'Create PCollection' >> beam.Create(files)
     | 'Read from a File' >> beam.io.ReadAllFromText(skip_header_lines=1)
     | 'String To BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             'mytable',
             dataset='mydataset',
             schema=myschema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

    result = p.run()
    result.wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
The problem is that this code works perfectly if the files list has only one element. As soon as there is more than one element, the 'String To BigQuery Row' transform errors out with:
error: nothing to repeat [while running 'String To BigQuery Row']
This is probably related to the regex module, but I can't figure out what's wrong, because it works perfectly when given a single file.
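As far as I understand, the (*SKIP)(*FAIL) verbs are only supported by the third-party regex module; the stdlib re module rejects them with exactly this message. Here is a quick standalone check (the sample line below is made up, but has the same shape as my data):

import re       # stdlib
import regex    # the third-party module from my requirements.txt

PATTERN = r"""\{[^{}]+\}(*SKIP)(*FAIL)|,"""

# regex understands (*SKIP)(*FAIL): commas inside {...} are protected from the split.
sample = 'foo,2018-01-01,"{1 2, 3 4}",10,20,30'
print(regex.split(PATTERN, sample))
# ['foo', '2018-01-01', '"{1 2, 3 4}"', '10', '20', '30']

# The stdlib re module does not support those verbs and raises
# the same error I see on Dataflow:
try:
    re.compile(PATTERN)
except re.error as e:
    print(e)  # nothing to repeat at position ...

So the error message itself seems to come from the stdlib re engine rather than from regex.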
Edit: Strangely, it runs fine with the DirectRunner. I'm passing the requirements.txt file as described here.
This is how I'm executing the pipeline:
python streaming_inserts.py --runner=DataFlowRunner --project=my-project --temp_location=gs://temp/ --staging_location=gs://stage/ --requirements_file requirements.txt --disk_size_gb 1000 --region us-east1
My requirements.txt
looks like this:
regex
google-cloud-storage
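In case it's relevant: I'm not setting save_main_session anywhere, even though I import SetupOptions. The Beam examples build their pipeline options roughly like this (a sketch based on the wordcount example; I haven't confirmed whether it makes a difference for this error):

pipeline_options = PipelineOptions(argv)
# Pickle the __main__ session so that module-level state (such as the
# `import regex as re` alias) is recreated on the remote workers.
pipeline_options.view_as(SetupOptions).save_main_session = True
p = beam.Pipeline(options=pipeline_options)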