Dataflow streaming pipeline using TextIO does not autoscale - streaming

I created a streaming pipeline which does the following actions:
get pubsub messages containing addresses of csv files
read the corresponding csv files
write the content of the csv files to a BigQuery table
My pipeline code looks like this:
def run():
    """Build the Pub/Sub -> CSV -> BigQuery pipeline and submit it to Dataflow."""
    # NOTE(review): streaming=False is passed although the source is Pub/Sub —
    # confirm this is intended.
    options = PipelineOptions(
        save_main_session=True,
        streaming=False,
        autoscaling_algorithm='THROUGHPUT_BASED',
        max_num_workers=500,
    )

    # All Google Cloud settings live on the same options view.
    gcp_options = options.view_as(GoogleCloudOptions)
    gcp_options.project = 'my_project'
    gcp_options.region = 'europe-west1'
    gcp_options.staging_location = 'staging_address'
    gcp_options.temp_location = 'temp_address'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = beam.Pipeline(options=options)
    # Each Pub/Sub message is parsed, its "fileAddress" entry is read as a
    # text file (first line skipped), and the formatted lines are appended
    # to the BigQuery destination table.
    road = (
        p
        | 'ReadFromPubSub' >> beam.io.ReadFromPubSub('projects/my_project/topics/my_topic')
        | 'ParseJson' >> beam.Map(parse_json)
        | 'GetFileAddress' >> beam.Map(lambda element: element["fileAddress"])
        | 'ReadCSVFiles' >> beam.io.ReadAllFromText(skip_header_lines=1)
        | 'FormatLines' >> beam.Map(read_line)
        | 'WriteAggToBQ1' >> beam.io.WriteToBigQuery(
            destination_table,
            schema=schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        )
    )
    p.run()


if __name__ == '__main__':
    run()
When running the pipeline on Dataflow and trying to read big CSV files (50 GB), the pipeline never scales up and stays stuck at one worker.
Why does Dataflow refuse to scale in this situation?

Related

unable to Write json to Pubsub topic using apache beam python

I am trying to read from a Pub/Sub topic, do some cleanup/transformation, and write the final result to another Pub/Sub topic. However, I end up with the following error. Please advise.
code:
# NOTE: in the last step the label and the transform are joined with '|'
# rather than '>>', so the bare string 'write to pubsub' is applied as if it
# were a transform.
Ingest = ( p
| 'Read from Topic' >> beam.io.ReadFromPubSub(topic=known_args.topic).with_output_types(bytes)
| 'Parse' >> beam.Map(parse_json)
| 'Cleanup' >> beam.Map(cleanup)
| 'write to pubsub' | beam.io.WriteToPubSub("projects/test/topics/cdp_aa_food" , with_attributes=False)
)
the error which i am getting is below:
raise TypeError("Expected a PTransform object, got %s" % transform)
TypeError: Expected a PTransform object, got write to pubsub
not sure what i am doing wrong..
# Read raw bytes from the input topic, parse and clean each message, then
# publish the result to the output topic. Every step is "label >> transform".
Ingest = (
    p
    | 'Read from Topic' >> beam.io.ReadFromPubSub(
        topic=known_args.topic).with_output_types(bytes)
    | 'Parse' >> beam.Map(parse_json)
    | 'Cleanup' >> beam.Map(cleanup)
    | 'write to pubsub' >> beam.io.WriteToPubSub(
        "projects/test/topics/cdp_aa_food", with_attributes=False)
)
There is a typo in your pipeline, you need >> instead of | for the write to pubsub step.

Dataflow job doesn't emit messages after GroupByKey()

I have a streaming Dataflow pipeline that writes to BigQuery, and I want to window all the failed rows and do some further analysis. The pipeline looks like this. I'm getting all the error messages in the second step, but all the messages get stuck at beam.GroupByKey(); nothing moves downstream after that. Does anyone have any idea how to fix this?
data = (
| "Read PubSub Messages" >> beam.io.ReadFromPubSub(subscription=options.input_subscription,
with_attributes=True)
...
| "write to BQ" >> beam.io.WriteToBigQuery(
table=f"{options.bq_dataset}.{options.bq_table}",
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
method='STREAMING_INSERTS',
insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_NEVER
)
)
(
data[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS]
| f"Window into: {options.window_size}m" >> GroupWindowsIntoBatches(options.window_size)
| f"Failed Rows for " >> beam.ParDo(BadRows(options.bq_dataset, 'table'))
)
and
class GroupWindowsIntoBatches(beam.PTransform):
    """Composite transform that batches the elements of each fixed window.

    Every element is assigned to a fixed window, paired with a single dummy
    key and grouped, so the elements of one window are emitted together.
    """

    def __init__(self, window_size):
        # window_size is given in minutes; it is stored in seconds.
        self.window_size = int(window_size * 60)

    def expand(self, pcoll):
        # NOTE(review): the window length below is the literal 10, not
        # self.window_size — confirm this is intended.
        windowed = pcoll | "Window into Fixed Intervals" >> beam.WindowInto(
            window.FixedWindows(10))
        # A single shared key lets GroupByKey gather everything in a window.
        # If the windowed elements do not fit into memory, consider
        # `beam.util.BatchElements` instead.
        keyed = windowed | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
        grouped = keyed | "Groupby" >> beam.GroupByKey()
        return grouped | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
Also, I don't know if it's relevant, but the beam.DoFn.TimestampParam inside my GroupWindowsIntoBatches has an invalid (negative) timestamp.
OK, so the issue was that the messages coming from BigQuery FAILED_ROWS were not timestamped. Adding | 'Add Timestamps' >> beam.Map(lambda x: beam.window.TimestampedValue(x, time.time())) seems to fix the GroupByKey.
class GroupWindowsIntoBatches(beam.PTransform):
    """Composite transform that batches the elements of each fixed window.

    Each incoming element is stamped with the current wall-clock time,
    assigned to a fixed window of ``window_size`` minutes, and grouped under
    a single dummy key so that all elements of a window are emitted together.

    Args:
        window_size: Window length in minutes.
    """

    def __init__(self, window_size):
        # Convert minutes into seconds.
        self.window_size = int(window_size * 60)

    def expand(self, pcoll):
        return (
            pcoll
            # The incoming elements carry no usable timestamp, so stamp them
            # with the processing time before windowing.
            | 'Add Timestamps' >> beam.Map(
                lambda x: beam.window.TimestampedValue(x, time.time()))
            # Use the configured length instead of a hard-coded constant.
            | "Window into Fixed Intervals" >> beam.WindowInto(
                window.FixedWindows(self.window_size))
            # A single shared key lets GroupByKey gather a whole window.
            | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
            | "Groupby" >> beam.GroupByKey()
            | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
        )

Trying to install a corpus for countVectorizer in sklearn package

I am trying to load a corpus from my local drive into python at one time with a for loop and then read each text file and save it for analysis with countVectorizer. But, I am only getting the last file. How do I get the results from all of the files to be stored for analysis with countVectorizer?
This code brings out the text from last file in folder.
# Directory that holds the corpus text files.
folder_path = "folder"
# Read every .txt file found directly under folder_path.
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
with open(filename, 'r') as f:
txt = f.read()
print(txt)
# NOTE: MyList holds only the current value of `txt` (one file's text); it is
# not an accumulation of every file read by the loop.
MyList= [txt]
## Create a CountVectorizer object that you can use
MyCV1 = CountVectorizer()
## Call your MyCV1 on the data
DTM1 = MyCV1.fit_transform(MyList)
## get col names
ColNames=MyCV1.get_feature_names()
print(ColNames)
## convert DTM to DF
MyDF1 = pd.DataFrame(DTM1.toarray(), columns=ColNames)
print(MyDF1)
This code works, but would not work for a huge corpus that I am preparing it for.
# Import and read the text files; each `with` block closes its file as soon
# as the contents have been read.
# NOTE(review): the first path is relative while the other two are absolute —
# confirm which form is intended.
with open("folder/animal_1.txt", 'r') as f1:
    f1r = f1.read()
with open("/folder/animal_2.txt", 'r') as f2:
    f2r = f2.read()
with open("/folder/animal_3.txt", 'r') as f3:
    f3r = f3.read()

# Reassemble the corpus in Python: one document per file.
MyCorpus = [f1r, f2r, f3r]

# Build the document-term matrix for the corpus.
MyCV1 = CountVectorizer()
DTM1 = MyCV1.fit_transform(MyCorpus)

# Column names are the vocabulary terms learned by the vectorizer.
ColNames = MyCV1.get_feature_names()
print(ColNames)

# Convert the document-term matrix to a DataFrame (one row per document).
MyDF2 = pd.DataFrame(DTM1.toarray(), columns=ColNames)
print(MyDF2)
I figured it out. Just gotta keep grinding.
# Collect the full text of every .txt file under folder_path, one list entry
# per file, so the whole corpus can be handed to the vectorizer at once.
txt_pattern = os.path.join(folder_path, '*.txt')
MyCorpus = []
for filename in glob.glob(txt_pattern):
    with open(filename, 'r') as f:
        MyCorpus.append(f.read())

Converting stream json files into parquet format in S3

Kinesis Firehose continuously streams JSON files to an S3 bucket. Using PySpark, I need to convert them into Parquet and write them to another bucket. I am planning to run this EMR job once every 30 minutes. I used the following code, but the EMR job runs far too long (more than 4 hours, after which I killed it) to convert the small files (around 400K small files, about 30 GB in total).
Can someone suggest how to improve the performance of the job?
Code:
# Loop through every key listed in the source bucket.
for k in sourceBucket.list():
# "<bucket name>/<key>" for the current entry.
source = k.bucket.name + '/' + k.key
srcsubfolder = source.split('/')
# Only proceed when the third '/'-separated component is non-empty
# (presumably this skips folder placeholder keys — TODO confirm).
if srcsubfolder[2] != '':
sourcePath = 's3://' + k.bucket.name + '/' + k.key
# Derive the target path: same key with its last component removed,
# placed under the target bucket.
removefilenamefromkey = k.key.split('/')[:-1]
targetPath = 's3://' + targetBucket.name +'/' + '/'.join(removefilenamefromkey)
# Convert JSON into Parquet files.
# NOTE(review): the read/write below uses the single sourcePath built above,
# so it appears to run once per listed key — confirm against the real nesting.
dfjson = spark.read.json(sourcePath)
# Lower-case every column name before writing.
dfjsoncollower = dfjson.toDF(*[x.lower() for x in dfjson.columns])
dfjsoncollower.write.mode("append").parquet(targetPath)

Collecting output from Apache Beam pipeline and displaying it to console

I have been working with Apache Beam for a couple of days. I wanted to quickly iterate on the application I am working on and make sure the pipeline I am building is error-free. In Spark we can use sc.parallelize, and when we apply some action we get a value that we can inspect.
Similarly when I was reading about Apache Beam, I found that we can create a PCollection and work with it using following syntax
# Word count over two in-memory lines: split into words, pair each word with
# 1, group by word and sum the ones.
with beam.Pipeline() as pipeline:
lines = pipeline | beam.Create(["this is test", "this is another test"])
word_count = (lines
| "Word" >> beam.ParDo(lambda line: line.split(" "))
| "Pair of One" >> beam.Map(lambda w: (w, 1))
| "Group" >> beam.GroupByKey()
# NOTE: `lambda (w, o): ...` is tuple-parameter unpacking, which is only
# valid in Python 2.
| "Count" >> beam.Map(lambda (w, o): (w, sum(o))))
# NOTE(review): if this call sits inside the `with` block, the pipeline is
# presumably run again when the block exits — confirm.
result = pipeline.run()
I actually wanted to print the result to console. But I couldn't find any documentation around it.
Is there a way to print the result to console instead of saving it to a file each time?
You don't need the temp list. In python 2.7 the following should be sufficient:
def print_row(row):
print row
(pipeline
| ...
| "print" >> beam.Map(print_row)
)
result = pipeline.run()
result.wait_until_finish()
In python 3.x, print is a function so the following is sufficient:
(pipeline
| ...
| "print" >> beam.Map(print)
)
result = pipeline.run()
result.wait_until_finish()
After exploring further and understanding how I can write test cases for my application, I figured out a way to print the result to the console. Please note that I am currently running everything on a single-node machine and trying to understand the functionality provided by Apache Beam and how I can adopt it without compromising industry best practices.
So, here is my solution. At the very last stage of our pipeline we can introduce a map function that will print result to the console or accumulate the result in a variable later we can print the variable to see the value
import apache_beam as beam
# Sample input lines.
data = ["this is sample data", "this is yet another sample data"]

# Build the pipeline: split each line into words, pair every word with 1 and
# sum the ones per word.
pipeline = beam.Pipeline()
counts = (
    pipeline
    | "create" >> beam.Create(data)
    | "split" >> beam.ParDo(lambda row: row.split(" "))
    | "pair" >> beam.Map(lambda w: (w, 1))
    | "group" >> beam.CombinePerKey(sum)
)

# Results are accumulated here by the final map step.
# NOTE(review): this relies on the map function running in this same process;
# presumably that only holds for a local runner — confirm.
output = []


def collect(row):
    """Append one pipeline element to the module-level ``output`` list.

    Returns True so the map step still emits a value for every element.
    """
    output.append(row)
    return True


counts | "print" >> beam.Map(collect)

# Run the pipeline and block until it has finished.
result = pipeline.run()
result.wait_until_finish()

# print() with a single argument works on both Python 2 and Python 3.
print(output)
Maybe logging info instead of print?
def _logging(elem):
logging.info(elem)
return elem
P | "logging info" >> beam.Map(_logging)
Following an example from PyCharm Edu:
import apache_beam as beam
class LogElements(beam.PTransform):
    """Print every element, with an optional prefix, and pass it through.

    Args:
        label: Optional transform label forwarded to ``beam.PTransform``.
        prefix: String prepended to each printed element.
    """

    class _LoggingFn(beam.DoFn):
        """DoFn that prints ``prefix + str(element)`` and re-emits the element."""

        def __init__(self, prefix=''):
            super(LogElements._LoggingFn, self).__init__()
            self.prefix = prefix

        def process(self, element, **kwargs):
            # print() with a single argument behaves the same on Python 2 and 3.
            print(self.prefix + str(element))
            yield element

    def __init__(self, label=None, prefix=''):
        super(LogElements, self).__init__(label)
        self.prefix = prefix

    def expand(self, input):
        # Return the applied PCollection so further transforms can be chained
        # after this one.
        return input | beam.ParDo(self._LoggingFn(self.prefix))
class MultiplyByTenDoFn(beam.DoFn):
    """DoFn that emits each input element multiplied by ten."""

    def process(self, element):
        yield element * 10


# Create five numbers, multiply each by ten, and log the results.
p = beam.Pipeline()
numbers = p | beam.Create([1, 2, 3, 4, 5])
scaled = numbers | beam.ParDo(MultiplyByTenDoFn())
scaled | LogElements()
p.run()
Output
10
20
30
40
50
Out[10]: <apache_beam.runners.portability.fn_api_runner.RunnerResult at 0x7ff41418a210>
I know it isn't what you asked for but why don't you store it to a text file? It's always better than printing it via stdout and it isn't volatile