Does GCP Dataflow support Kafka IO in Python?

I am trying to read data from a Kafka topic using the kafka.ReadFromKafka() method in Python. My code looks like this:
import apache_beam as beam
from apache_beam.io.external import kafka
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions()
with beam.Pipeline(options=options) as p:
    plants = (
        p
        | 'read' >> kafka.ReadFromKafka({'bootstrap.servers': 'public_ip:9092'}, ['topic1']))
But I get the error message below.
ERROR:apache_beam.runners.runner:Error while visiting read
Traceback (most recent call last):
  File "test_file.py", line 16, in <module>
    | 'read' >> kafka.ReadFromKafka({'bootstrap.servers': 'localhost:9092'}, ['topic1'])
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/pipeline.py", line 547, in __exit__
    self.run().wait_until_finish()
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/pipeline.py", line 526, in run
    return self.runner.run_pipeline(self, self._options)
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 565, in run_pipeline
    self.visit_transforms(pipeline, options)
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/runners/runner.py", line 224, in visit_transforms
    pipeline.visit(RunVisitor(self))
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/pipeline.py", line 572, in visit
    self._root_transform().visit(visitor, self, visited)
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/pipeline.py", line 1075, in visit
    part.visit(visitor, pipeline, visited)
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/pipeline.py", line 1078, in visit
    visitor.visit_transform(self)
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/runners/runner.py", line 219, in visit_transform
    self.runner.run_transform(transform_node, options)
  File "/usr/local/lib/python3.7/dist-packages/apache_beam/runners/runner.py", line 249, in run_transform
    (transform_node.transform, self))
NotImplementedError: Execution of [<ReadFromKafka(PTransform) label=[ReadFromKafka(beam:external:java:kafka:read:v1)]>] not implemented in runner <apache_beam.runners.dataflow.dataflow_runner.DataflowRunner object at 0x7f72463344a8>.
Is this because the Apache Beam Dataflow runner doesn't support KafkaIO?

The Python SDK for Beam does support connecting to Kafka. Below is a code snippet that uses the beam-nuggets library:
from __future__ import print_function
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import kafkaio

kafka_topic = "notifications"
kafka_config = {"topic": kafka_topic,
                "bootstrap_servers": "localhost:9092",
                "group_id": "notification_consumer_group"}

with beam.Pipeline(options=PipelineOptions()) as p:
    notifications = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(kafka_config)
    notifications | 'Writing to stdout' >> beam.Map(print)
bootstrap_servers is a comma-separated list of host:port pairs where your brokers are deployed. You will get this information from your Kafka cluster configuration.
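For example, a three-broker cluster could be configured like this (the hostnames are placeholders, not values from the question):

kafka_config = {
    "topic": "notifications",
    # one host:port entry per broker, comma separated (hypothetical hostnames)
    "bootstrap_servers": "broker1.example.com:9092,broker2.example.com:9092,broker3.example.com:9092",
    "group_id": "notification_consumer_group",
}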

Related

show() raises an error after applying a pandas UDF to a DataFrame

I am having problems making this trial code work. The final line, df.select(plus_one(col("x"))).show(), doesn't work. I also tried saving the result in a variable (vardf = df.select(plus_one(col("x")))) followed by vardf.show(), and that fails too.
import pyspark
import pandas as pd
from typing import Iterator
from pyspark.sql.functions import col, pandas_udf, struct

spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")

pdf = pd.DataFrame([1, 2, 3], columns=["x"])
df = spark.createDataFrame(pdf)
df.show()

@pandas_udf("long")
def plus_one(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for s in batch_iter:
        yield s + 1

df.select(plus_one(col("x"))).show()
Error message (parts of it):
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\bigdatasetup\dataanalysiswithpythonandpyspark-trunk\code\ch09\untitled0.py", line 24, in
df.select(plus_one(col("x"))).show()
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\dataframe.py", line 494, in show
print(self._jdf.showString(n, 20, vertical))
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\py4j\java_gateway.py", line 1321, in call
return_value = get_return_value(
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\utils.py", line 117, in deco
raise converted from None
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
...
...
ERROR 2022-04-21 09:48:24,423 7608 org.apache.spark.scheduler.TaskSetManager [task-result-getter-0] Task 0 in stage 3.0 failed 1 times; aborting job
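For comparison, here is a minimal sketch of the plain Series-to-Series pandas UDF form, assuming Spark >= 3.0 with compatible pandas/pyarrow versions installed (nothing here is taken from the original post beyond the plus-one logic). If this form also fails with a worker-side PythonException, a pandas/pyarrow version mismatch is a common culprit worth checking:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["x"]))

@pandas_udf("long")
def plus_one(s: pd.Series) -> pd.Series:
    # plain Series-to-Series variant; the iterator form is only needed
    # for expensive per-batch initialization
    return s + 1

df.select(plus_one(col("x"))).show()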

PyPDF2.PdfFileReader hangs indefinitely

I'm trying to read this PDF file (https://www.accessdata.fda.gov/cdrh_docs/pdf14/K141693.pdf) and am following these suggestions from SO:
Opening pdf urls with pyPdf
I have actually downloaded the file locally and am running the following code:
import PyPDF2
pdf_file = open("K141693.pdf")
pdf_read = PyPDF2.PdfFileReader(pdf_file)
but my code hangs indefinitely. I'm running Python 2.7, and here is the stack trace:
Traceback (most recent call last):
  File "", line 1, in <module>
    runfile('C:/PoC/pdf_reader.py', wdir='C:/PoC')
  File "C:\ProgramData\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
    execfile(filename, namespace)
  File "C:\ProgramData\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
    exec(compile(scripttext, filename, 'exec'), glob, loc)
  File "C:/PoC/pdf_reader.py", line 13, in <module>
    pdf_read = PyPDF2.PdfFileReader(pdf_file)
  File "C:\ProgramData\Anaconda2\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
    self.read(stream)
  File "C:\ProgramData\Anaconda2\lib\site-packages\PyPDF2\pdf.py", line 1697, in read
    line = self.readNextEndLine(stream)
  File "C:\ProgramData\Anaconda2\lib\site-packages\PyPDF2\pdf.py", line 1938, in readNextEndLine
    x = stream.read(1)
KeyboardInterrupt
I came across another post here, PyPDF2 hangs on processing, but that too doesn't have a response.
You need to open the file in binary ('rb') mode. (This works in Python 3:)
import PyPDF2
pdf_file = open("K141693.pdf", "rb")
read_pdf = PyPDF2.PdfFileReader(pdf_file)
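As a side note, recent releases of the library renamed the class. A sketch of the equivalent under PyPDF2 3.x (an assumption about the installed version, not part of the original answer):

import PyPDF2

# PdfReader replaces the deprecated PdfFileReader in PyPDF2 3.x;
# the context manager also guarantees the file handle is closed
with open("K141693.pdf", "rb") as pdf_file:
    read_pdf = PyPDF2.PdfReader(pdf_file)
    print(len(read_pdf.pages))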

TensorFlow: gfile.FastGFile() can't read a file whose path includes Chinese characters

I want to use gfile.FastGFile(image_path, 'rb').read() to read a picture and use it as the input to my project. I use the directory name as the label of the pictures inside that directory. When the directory name is in English my code works fine, but when the directory name is in Chinese it throws this error:
Traceback (most recent call last):
  File "F:/pythonWS/imageFilter/jpegFileJudge.py", line 27, in <module>
    image_data = gfile.FastGFile(image_path, 'rb').read()
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 106, in read
    self._preread_check()
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 73, in _preread_check
    compat.as_bytes(self.__name), 1024 * 512, status)
  File "C:\Program Files\Python35\lib\contextlib.py", line 66, in __exit__
    next(self.gen)
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 466, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.NotFoundError: NewRandomAccessFile failed to Create/Open: F:\vsWorkspace\pics\test\三宝鸟\0ff41bd5ad6eddc403fa02d13bdbb6fd526633fe.jpg :
ϵͳ\udcd5Ҳ\udcbb\udcb5\udcbdָ\udcb6\udca8\udcb5\udcc4\udcceļ\udcfe\udca1\udca3
(the garbled trailing text is the mojibake of the Chinese Windows message "The system cannot find the file specified.")
My test code is:
# -*- coding: utf-8 -*-
import glob
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.platform import gfile

image_folder = 'F:/vsWorkspace/pics/test'
os.chdir(image_folder)
count = 0
for each in os.listdir(image_folder):
    each = os.path.abspath(each)
    os.chdir(each)
    for image_path in os.listdir(each):
        image_path = os.path.abspath(image_path)
        print(image_path)
        image_data = gfile.FastGFile(image_path, 'rb').read()
        count += 1
    os.chdir(image_folder)
My environment is Windows 7 x64, Python 3.5.3, and TensorFlow 1.0. How can I solve this problem?
By the way, I have to use the Chinese directory names as my picture labels.
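One workaround worth trying, sketched below under the assumption that the failure comes from TensorFlow 1.0's byte-oriented path handling on Windows (Python 3's built-in open() handles Unicode Windows paths natively): read the bytes with plain open() instead of gfile.FastGFile, then feed them to the rest of the pipeline as before:

import os

image_folder = 'F:/vsWorkspace/pics/test'  # path from the question
count = 0
for label in os.listdir(image_folder):
    label_dir = os.path.join(image_folder, label)
    for name in os.listdir(label_dir):
        image_path = os.path.join(label_dir, name)
        # built-in open() accepts Unicode paths on Windows, unlike
        # gfile.FastGFile in this TensorFlow build
        with open(image_path, 'rb') as f:
            image_data = f.read()
        count += 1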

How does Spark Streaming deal with snappy-compressed data in Kafka?

The kafka producer sample code is:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import ConfigParser as configparser
from pykafka import KafkaClient
import time
import snappy

config = configparser.ConfigParser()
config.read("conf.ini")
app_name = "test_word_counter"
kafka_hosts = config.get(app_name, 'kafka_hosts')
kafka_topic = config.get(app_name, 'kafka_topic')
print("kafka client: %s" % kafka_hosts)
print("kafka topic: %s" % kafka_topic)

kafka_client = KafkaClient(hosts=kafka_hosts)  # create a Kafka client
topic = kafka_client.topics[kafka_topic]  # this will create the topic if it does not exist
with topic.get_producer() as producer:  # create a Kafka producer on the given topic
    while True:
        msg = "just a test for snappy compress with kafka and spark"
        msg = snappy.compress(msg)  # snappy-compress the payload
        producer.produce(msg)  # send the message to Kafka
        print("send data len(%d)" % len(msg))
        print(msg)
        time.sleep(5)
The code is very simple: it compresses the data with python-snappy, then sends it to Kafka.
The pyspark code is:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def word_counter(zk_host, topic):
    sc = SparkContext(appName="PythonStreamingKafkaWordCounter")
    ssc = StreamingContext(sc, 30)

    kvs = KafkaUtils.createStream(ssc, zk_host, "spark-streaming-consumer", {topic: 2})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
Then run spark-submit:
spark-submit --jars /usr/local/services/metrics-spark-analyser/external/spark-streaming-kafka-0-8-assembly_2.11-2.0.2.jar spark_word_counter_consumer.py
I got the following Spark error message:
UnicodeDecodeError: 'utf8' codec can't decode byte 0xcc in position 1: invalid continuation byte
More detailed error output below:
16/12/18 13:58:30 ERROR Executor: Exception in task 5.0 in stage 7.0 (TID 30)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in main
    process()
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/worker.py", line 167, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2371, in pipeline_func
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2371, in pipeline_func
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 317, in func
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1792, in combineLocally
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 236, in mergeValues
    for k, v in iterator:
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/streaming/kafka.py", line 73, in <lambda>
  File "/usr/local/services/spark/python/lib/pyspark.zip/pyspark/streaming/kafka.py", line 36, in utf8_decoder
    return s.decode('utf-8')
  File "/usr/local/services/python/lib/python2.7/encodings/utf_8.py", line 16, in decode
    return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xcc in position 1: invalid continuation byte
It seems that Spark Streaming can't decompress the snappy data from Kafka. Is there any configuration I should add in Spark?
Thanks!
Software details:
kafka 0.10.1.0
spark 2.0.2 per-build for hadoop 2.7
python-snappy 0.5
PS: I wrote a simple Kafka consumer to read the snappy data from Kafka, and the snappy decompression there succeeds.
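The traceback above fails inside pyspark's default utf8_decoder, which KafkaUtils applies to every message value before user code sees it. createStream accepts keyDecoder/valueDecoder arguments, so one avenue worth trying (a sketch, not a verified fix; the ZooKeeper host and topic name are placeholders) is a value decoder that snappy-decompresses first:

import snappy
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def snappy_decoder(payload):
    # decompress the raw message bytes before decoding as text
    if payload is None:
        return None
    return snappy.decompress(payload).decode('utf-8')

sc = SparkContext(appName="PythonStreamingKafkaWordCounter")
ssc = StreamingContext(sc, 30)
kvs = KafkaUtils.createStream(
    ssc, "zk-host:2181", "spark-streaming-consumer", {"some-topic": 2},
    valueDecoder=snappy_decoder)  # replaces the default utf-8 decoder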

using boto3 in a python3 virtual env in AWS Lambda

I am trying to use Python 3.4 and boto3 to walk an S3 bucket and publish some file locations to an RDS instance. The part of this effort I am having trouble with is using boto3. My Lambda function looks like the following:
import subprocess

def lambda_handler(event, context):
    args = ("venv/bin/python3.4", "run.py")
    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
    popen.wait()
    output = popen.stdout.read()
    print(output)
and, in my run.py file I have some lines:
import boto3
s3c = boto3.client('s3')
which causes an exception. The run.py file is not otherwise relevant to this question, so to keep this post concise, I've found that the error can be reproduced with the following Lambda function:
import subprocess

def lambda_handler(event, context):
    args = ("python3.4", "-c", "import boto3; print(boto3.client('s3'))")
    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
    popen.wait()
    output = popen.stdout.read()
    print(output)
My logstream reports the error:
Event Data
START RequestId: 2b65421a-664d-11e6-81db-974c7c09d283 Version: $LATEST
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/var/runtime/boto3/__init__.py", line 79, in client
    return _get_default_session().client(*args, **kwargs)
  File "/var/runtime/boto3/session.py", line 250, in client
    aws_session_token=aws_session_token, config=config)
  File "/var/runtime/botocore/session.py", line 818, in create_client
    client_config=config, api_version=api_version)
  File "/var/runtime/botocore/client.py", line 63, in create_client
    cls = self._create_client_class(service_name, service_model)
  File "/var/runtime/botocore/client.py", line 85, in _create_client_class
    base_classes=bases)
  File "/var/runtime/botocore/hooks.py", line 227, in emit
    return self._emit(event_name, kwargs)
  File "/var/runtime/botocore/hooks.py", line 210, in _emit
    response = handler(**kwargs)
  File "/var/runtime/boto3/utils.py", line 61, in _handler
    module = import_module(module)
  File "/var/runtime/boto3/utils.py", line 52, in import_module
    __import__(name)
  File "/var/runtime/boto3/s3/inject.py", line 13, in <module>
    from boto3.s3.transfer import S3Transfer
  File "/var/runtime/boto3/s3/transfer.py", line 135, in <module>
    from concurrent import futures
  File "/var/runtime/concurrent/futures/__init__.py", line 8, in <module>
    from concurrent.futures._base import (FIRST_COMPLETED,
  File "/var/runtime/concurrent/futures/_base.py", line 357
    raise type(self._exception), self._exception, self._traceback
                               ^
SyntaxError: invalid syntax
END RequestId: 2b65421a-664d-11e6-81db-974c7c09d283
REPORT RequestId: 2b65421a-664d-11e6-81db-974c7c09d283 Duration: 2673.45 ms Billed Duration: 2700 ms Memory Size: 1024 MB Max Memory Used: 61 MB
I need to use boto3 downstream of run.py. Any ideas on how to resolve this are much appreciated. Thanks!
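For what it's worth, the failing file (/var/runtime/concurrent/futures/_base.py) is the Python 2 futures backport bundled with the Python 2.7 Lambda runtime, and raise type(e), e, tb is Python 2-only syntax, so the python3.4 subprocess appears to be importing the parent runtime's libraries through an inherited PYTHONPATH. A sketch of one possible workaround (untested, and assuming boto3 is bundled inside the virtualenv) is to scrub PYTHONPATH from the child environment:

import os
import subprocess

def lambda_handler(event, context):
    env = os.environ.copy()
    # drop the inherited PYTHONPATH so the python3.4 child doesn't pick up
    # the Python 2-only modules under /var/runtime (hypothetical fix)
    env.pop('PYTHONPATH', None)
    args = ("venv/bin/python3.4", "run.py")  # venv must ship its own boto3
    popen = subprocess.Popen(args, stdout=subprocess.PIPE, env=env)
    popen.wait()
    print(popen.stdout.read())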