I have a simple pipeline that receives data from PubSub, prints it and then at every 10 seconds fires a window into a GroupByKey and prints that message again.
However this window seems to be delaying sometimes. Is this a google limitation or is there something wrong with my code:
with beam.Pipeline(options=pipeline_options) as pipe:
messages = (
pipe
| beam.io.ReadFromPubSub(subscription=known_args.input_subscription).with_output_types(bytes)
| 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
| 'Ex' >> beam.ParDo(ExtractorAndPrinter())
| beam.WindowInto(window.FixedWindows(10), allowed_lateness=0, accumulation_mode=AccumulationMode.DISCARDING, trigger=AfterProcessingTime(10) )
| 'group' >> beam.GroupByKey()
| 'PRINTER' >> beam.ParDo(PrinterWorker()))
Edit for the most recent code. I removed the triggers however the problem persists:
class ExtractorAndCounter(beam.DoFn):
def __init__(self):
beam.DoFn.__init__(self)
def process(self, element, *args, **kwargs):
import logging
logging.info(element)
return [("Message", json.loads(element)["Message"])]
class PrinterWorker(beam.DoFn):
def __init__(self):
beam.DoFn.__init__(self)
def process(self, element, *args, **kwargs):
import logging
logging.info(element)
return [str(element)]
class DefineTimestamp(beam.DoFn):
def process(self, element, *args, **kwargs):
from datetime import datetime
return [(str(datetime.now()), element)]
def run(argv=None, save_main_session=True):
"""Build and run the pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
'--output_topic',
required=True,
help=(
'Output PubSub topic of the form '
'"projects/<PROJECT>/topics/<TOPIC>".'))
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
'--input_topic',
help=(
'Input PubSub topic of the form '
'"projects/<PROJECT>/topics/<TOPIC>".'))
group.add_argument(
'--input_subscription',
help=(
'Input PubSub subscription of the form '
'"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
pipeline_options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=pipeline_options) as pipe:
messages = (
pipe
| beam.io.ReadFromPubSub(subscription=known_args.input_subscription).with_output_types(bytes)
| 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
| 'Ex' >> beam.ParDo(ExtractorAndCounter())
| beam.WindowInto(window.FixedWindows(10))
| 'group' >> beam.GroupByKey()
| 'PRINTER' >> beam.ParDo(PrinterWorker())
| 'encode' >> beam.Map(lambda x: x.encode('utf-8'))
| beam.io.WriteToPubSub(known_args.output_topic))
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()