Error in running Apache Beam Python SplittableDoFn

Error encountered while trying Pub/Sub IO with a splittable DoFn:
RuntimeError: Transform node
AppliedPTransform(ParDo(TestDoFn)/ProcessKeyedElements/GroupByKey/GroupByKey,
_GroupByKeyOnly) was not replaced as expected.
Can someone help me review the code for anything I might be doing incorrectly here?
Code:
"""
python examples/test_restriction_unbounded.py --project mk2 --topic projects/mk2/topics/testing
"""
# pytype: skip-file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import csv
import logging
import sys
import time
from datetime import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.io.restriction_trackers import OffsetRestrictionTracker, OffsetRange
from apache_beam.transforms.core import RestrictionProvider
class TestProvider(RestrictionProvider):
    def initial_restriction(self, element):
        return OffsetRange(0, 1)

    def create_tracker(self, restriction):
        return OffsetRestrictionTracker(restriction)

    def restriction_size(self, element, restriction):
        return restriction.size()


class TestDoFn(beam.DoFn):
    def process(
            self,
            element,
            restriction_tracker=beam.DoFn.RestrictionParam(TestProvider())):
        import pdb; pdb.set_trace()
        cur = restriction_tracker.current_restriction().start
        while restriction_tracker.try_claim(cur):
            return element


def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(StandardOptions).streaming = True
    with beam.Pipeline(options=options) as p:
        # data = ['abc', 'defghijklmno', 'pqrstuv', 'wxyz']
        # actual = (p | beam.Create(data) | beam.ParDo(ExpandingStringsDoFn()))
        scores = p | beam.io.ReadFromPubSub(topic=args.topic) | beam.ParDo(TestDoFn())


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()

You are ingesting data from Pub/Sub in streaming mode, so you have to window the stream into batches before this kind of transform can be applied: (ParDo(TestDoFn)/ProcessKeyedElements/GroupByKey/GroupByKey, _GroupByKeyOnly).
Pub/Sub with windowing example: https://cloud.google.com/pubsub/docs/pubsub-dataflow
Try something like this:
from apache_beam.transforms import window  # needed for window.FixedWindows


class GroupWindowsIntoBatches(beam.PTransform):
    """A composite transform that groups Pub/Sub messages."""

    def __init__(self, window_size):
        # Convert minutes into seconds.
        self.window_size = int(window_size * 60)

    def expand(self, pcoll):
        return (
            pcoll
            # Assigns window info to each Pub/Sub message based on its
            # publish timestamp.
            | "Window into Fixed Intervals"
            >> beam.WindowInto(window.FixedWindows(self.window_size))
        )


def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(StandardOptions).streaming = True
    window_size = 1.0
    with beam.Pipeline(options=options) as p:
        scores = (p
                  | beam.io.ReadFromPubSub(topic=args.topic)
                  | "WindowInto" >> GroupWindowsIntoBatches(window_size)
                  | beam.ParDo(TestDoFn())
                  )

I had the same error. Removing the streaming option solved the problem for me.
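For reference, here is a minimal batch-mode sketch of the same pipeline (an assumption rather than the asker's exact fix): the streaming flag is dropped and the bounded in-memory source that is commented out in the question is used instead of Pub/Sub, with TestDoFn reused from the code above.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def run_batch():
    options = PipelineOptions()  # note: no streaming=True
    with beam.Pipeline(options=options) as p:
        data = ['abc', 'defghijklmno', 'pqrstuv', 'wxyz']
        (p
         | beam.Create(data)
         | beam.ParDo(TestDoFn())  # the splittable DoFn from the question
         | beam.Map(print))


if __name__ == '__main__':
    run_batch()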

Related

locust 0.9 to 1.3 Exception: No tasks defined. use the #task decorator or set the tasks property of the User

I have the following code, which ran fine in Locust 0.9. Now with 1.3 it throws the exception mentioned in the title. Can anyone see what's wrong?
import time
import random
import datetime
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import logging
import json
import os
from random import randint, choice
from locust import HttpUser, TaskSet, task
from pyquery import PyQuery
requests.packages.urllib3.disable_warnings()
class FrontPage(TaskSet):
    def on_start(self):
        self.client.verify = False

    @task(20)
    def index(self):
        self.client.get("/")


class DestinationPagesFixed(TaskSet):
    de_paths = ["/belgien", "daenemark", "deutschland", "frankreich",
                "griechenland", "italien", "luxemburg"]

    def on_start(self):
        self.client.verify = False

    @task
    def test_1(self):
        paths = self.de_paths
        path = choice(paths)
        self.client.get(path, name="Static page")


class UserBehavior(TaskSet):
    tasks = {FrontPage: 15, DestinationPagesFixed: 19}


class WebsiteUser(HttpUser):
    task_set = UserBehavior
    min_wait = 400
    max_wait = 10000
Change
task_set = UserBehavior
to
tasks = [UserBehavior]
Or (skipping your UserBehavior class entirely)
tasks = {FrontPage: 15, DestinationPagesFixed: 19}
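For context, a minimal sketch of what the 1.x-style user class could look like after that change (only FrontPage is shown; the wait_time = between(...) form is an assumption about the 1.x replacement for min_wait/max_wait, and the weights come from the question):
from locust import HttpUser, TaskSet, task, between


class FrontPage(TaskSet):
    @task(20)
    def index(self):
        self.client.get("/")


class WebsiteUser(HttpUser):
    # In 1.x the user schedules work from `tasks`; a dict maps TaskSets/tasks to weights.
    tasks = {FrontPage: 15}
    # min_wait/max_wait (milliseconds) are superseded by wait_time (seconds).
    wait_time = between(0.4, 10)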

Can not receive message from queues when the Exchange object's type is "direct"

When I change the "direct" type to "fanout" or "topic", it is work.But the "direct" type is not work.
I wrote according to the example of the official website.
And I tried deleting the code => "from future import absolute_import, unicode_literals".But it still not work.
queues.py
it is work if I change type to 'fanout' or 'topic'.
from __future__ import absolute_import, unicode_literals
from kombu import Queue, Exchange

exchange = Exchange('demo_exchange', type='topic')
demo_queues = [
    Queue('one', exchange, routing_key='o'),
    Queue('two', exchange, routing_key='t'),
    Queue('three', exchange, routing_key='th'),
]
worker.py
from __future__ import absolute_import, unicode_literals
from kombu.mixins import ConsumerMixin
from kombu import Connection
from queues import demo_queues


class Worker(ConsumerMixin):
    def __init__(self, connection):
        self.connection = connection

    def get_consumers(self, Consumer, channel):
        return [Consumer(queues=demo_queues, accept=['pickle', 'json'], callbacks=[self.on_task])]

    def on_task(self, body, message):
        args = body['args']
        func = body['func']
        print(args, func)
        func()
        message.ack()


if __name__ == '__main__':
    with Connection('redis://localhost:6379/0') as conn:
        try:
            worker = Worker(conn)
            worker.run()
        except KeyboardInterrupt:
            print('bye')
client.py
from __future__ import absolute_import, unicode_literals
from kombu.pools import producers
from queues import exchange

priority_to_routing_key = {
    'high': 'o',
    'mid': 't',
    'low': 'th',
}


def send_tasks(conn, func, args, priority='high'):
    data = {'func': func, 'args': args}
    with producers[conn].acquire(block=True) as producer:
        routing_key = priority_to_routing_key[priority]
        producer.publish(data,
                         serializer='pickle',
                         exchange=exchange,
                         declare=[exchange],
                         routing_key=routing_key)


if __name__ == '__main__':
    from kombu import Connection
    from tasks import func_task

    connection = Connection('redis://localhost:6379/0')
    send_tasks(connection, func=func_task, args=('test hello'), priority='mid')
tasks.py
def func_task(n='hello'):
    print(n, '---====')
I hope to work this out.
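Not a full answer, but a minimal self-contained sketch for isolating the routing problem, assuming the same Redis broker as above (the exchange and queue names here are hypothetical): it declares a 'direct' exchange, binds a queue with routing_key='o', publishes with exactly that routing key, and drains one message.
from kombu import Connection, Exchange, Queue

direct_exchange = Exchange('demo_direct', type='direct')
one = Queue('one_direct', direct_exchange, routing_key='o')


def on_message(body, message):
    print('received:', body)
    message.ack()


with Connection('redis://localhost:6379/0') as conn:
    producer = conn.Producer(serializer='json')
    # For a 'direct' exchange the publish routing key must equal the queue's
    # binding key; declare=[one] declares and binds the queue before publishing.
    producer.publish({'msg': 'hello'},
                     exchange=direct_exchange,
                     routing_key='o',
                     declare=[one])

    # Consume a single message synchronously to check the binding works.
    with conn.Consumer(queues=[one], callbacks=[on_message], accept=['json']):
        conn.drain_events(timeout=5)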

Why "missing parameter type error" when i run scala REPL in Flink with Java?

When I run the Flink Scala REPL script from Java, it does not compile.
I tried this Java code to run the Flink Scala REPL for a test, but it always throws an exception.
Settings settings = new Settings();
((MutableSettings.BooleanSetting) settings.usejavacp()).value_$eq(true);
IMain main = new IMain(settings, new PrintWriter(System.out));
// Thread.currentThread().setContextClassLoader(main.classLoader());
for (String imp : imports) {
    main.interpret(MessageFormat.format("import {0}", imp));
}
ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
String script = FileUtils.readFileToString(new File("/opt/project/security-detection/sappo/src/sappo-interpreter/src/test/resources/demo.txt"), StandardCharsets.UTF_8);
main.bind(new NamedParamClass("env", ExecutionEnvironment.class.getName(), env));
main.interpret(script);
Scala script (demo.txt):
val text = env.fromElements("Who's there?", "I think I hear them. Stand, ho! Who's there?")
// result 1
val counts = text.flatMap { _.toLowerCase.split("\\W+") filter { _.nonEmpty } } map { (_, 1) } groupBy(0) sum(1)
counts.print()
// result 2
val counts = text.map((x:String) => 1)
counts.print()
// result 3
text.print()
result 1
import org.apache.flink.core.fs._
import org.apache.flink.core.fs.local._
import org.apache.flink.api.common.io._
import org.apache.flink.api.common.aggregators._
import org.apache.flink.api.common.accumulators._
import org.apache.flink.api.common.distributions._
import org.apache.flink.api.common.operators._
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint
import org.apache.flink.api.common.functions._
import org.apache.flink.api.java.io._
import org.apache.flink.api.java.aggregation._
import org.apache.flink.api.java.functions._
import org.apache.flink.api.java.operators._
import org.apache.flink.api.java.sampling._
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.utils._
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time._
env: org.apache.flink.api.java.ExecutionEnvironment = Local Environment (parallelism = 8) : ee335d29eefca69ee5fe7279414fc534
console:67: error: missing parameter type for expanded function ((x$1) => x$1.toLowerCase.split("\\W+").filter(((x$2) => x$2.nonEmpty)))
val counts = text.flatMap { _.toLowerCase.split("\\W+") filter { _.nonEmpty } } map { (_, 1) } groupBy(0) sum(1)
result 2
(same Flink imports as in result 1)
env: org.apache.flink.api.java.ExecutionEnvironment = Local Environment (parallelism = 8) : 5cbf8e476ebf32fd8fdf91766bd40af0
console:71: error: type mismatch;
found : String => Int
required: org.apache.flink.api.common.functions.MapFunction[String,?]
val counts = text.map((x:String) => 1)
result 3
(same Flink imports as in result 1)
env: org.apache.flink.api.java.ExecutionEnvironment = Local Environment (parallelism = 8) : ee335d29eefca69ee5fe7279414fc534
Who's there?
I think I hear them. Stand, ho! Who's there?
text: org.apache.flink.api.java.operators.DataSource[String] = org.apache.flink.api.java.operators.DataSource#53e28097
PASSED: testIMain
PASSED: testIMainScript
Try using the Scala REPL that comes with Flink:
$ bin/start-scala-shell.sh local
I tried the three examples you shared (with Flink 1.7.0), and they all worked just fine.

pytest with classes python

I wrote the following code:
publisher.py:
import six
from google.api_core.exceptions import AlreadyExists
from google.cloud.pubsub import types


class publisher(object):
    """Publisher object which has the following attributes

    Attributes:
        pubsub: publisher client
        project_name: name of the project
        topic_name: name of the topic
    """

    def __init__(self, pubsub, project_name, topic_name, batch_settings=(), *args, **kwargs):
        self.pubsub = pubsub
        self.project_name = project_name
        self.topic_name = topic_name
        # Batch settings: Pub/Sub accepts a maximum of 1,000 messages in a
        # batch, and the size of a batch cannot exceed 10 megabytes.
        self.batch_settings = types.BatchSettings(*batch_settings)

    def _full_project_name(self):
        """Returns the fully qualified name of the project."""
        return self.pubsub.project_path(self.project_name)
I wrote 3 tests; unfortunately, the third one has been failing.
Below is the code I wrote for the tests:
test_publisher.py:
from google.cloud import pubsub
import pytest

from publisher import publisher

PROJECT = 'ProjectTest'
TOPIC_NAME = 'TopicTest'


@pytest.fixture
def pubsub():
    yield pubsub.PublisherClient()


def test_init_value():
    sample_publisher = publisher(pubsub, PROJECT, TOPIC_NAME, ())
    assert sample_publisher.project_name == 'ProjectTest'
    assert sample_publisher.topic_name == 'TopicTest'
    assert sample_publisher.pubsub == pubsub
    assert sample_publisher.batch_settings.max_messages == 1000
    assert sample_publisher.batch_settings.max_bytes == 10 * (2 ** 20)
    assert sample_publisher.batch_settings.max_latency == 0.05


def test_init_with_no_values():
    with pytest.raises(Exception) as e_info:
        sample_bad_init = publisher()


def test_full_project_name():
    sample_publisher = publisher(pubsub, PROJECT, TOPIC_NAME, ())
    assert sample_publisher._full_project_name() == 'projects/ProjectTest'
I am currently getting the following error, which I can't understand, unfortunately:
line 26, in _full_project_name
return self.pubsub.project_path(self.project_name)
AttributeError: 'function' object has no attribute 'project_path'
Any help with this, please.
Thanks a lot
The name of the fixture should be changed:
@pytest.fixture
def google_pubsub():
    yield pubsub.PublisherClient()
You should add google_pubsub as an argument to the tests: test_full_project_name(google_pubsub) and test_init_value(google_pubsub).
In test_init_value you use the module pubsub imported via from google.cloud import pubsub, which is wrong.
test_init_value only passes because you are comparing the module (pubsub) with itself; with the fixture, the assertion becomes
assert sample_publisher.pubsub == google_pubsub
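Putting that together, a minimal sketch of the adjusted test module (the fixture rename and the injected google_pubsub argument come from the answer above; the rest mirrors the question's tests):
from google.cloud import pubsub
import pytest

from publisher import publisher

PROJECT = 'ProjectTest'
TOPIC_NAME = 'TopicTest'


@pytest.fixture
def google_pubsub():
    # Renamed so the fixture no longer shadows the imported pubsub module.
    yield pubsub.PublisherClient()


def test_init_value(google_pubsub):
    sample_publisher = publisher(google_pubsub, PROJECT, TOPIC_NAME, ())
    assert sample_publisher.pubsub == google_pubsub


def test_full_project_name(google_pubsub):
    sample_publisher = publisher(google_pubsub, PROJECT, TOPIC_NAME, ())
    # PublisherClient.project_path builds 'projects/<project_name>'.
    assert sample_publisher._full_project_name() == 'projects/ProjectTest'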

Scala Futures not executing when sending to Kinesis (Amazon AWS)

I am attempting to asynchronously write messages to Amazon Kinesis using Scala Futures so I can load test an application.
This code works, and I can see data moving down my pipeline as well as the output printing to the console.
import com.amazonaws.services.kinesis.AmazonKinesisClient
import java.nio.CharBuffer
import java.nio.charset.Charset
import java.text.SimpleDateFormat
import java.util.{Date, TimeZone}
object KinesisDummyDataProducer extends App {
  val kinesis = new AmazonKinesisClient(PipelineConfig.awsCredentials)
  println("Connected")

  lazy val encoder = Charset.forName("UTF-8").newEncoder()
  lazy val tz = TimeZone.getTimeZone("UTC")
  lazy val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'Z")
  df.setTimeZone(tz)

  (1 to args(0).toInt).map(int => send(int)).map(msg => println(msg))

  private def send(int: Int) = {
    val msg = "{\"event_name\":\"test\",\"timestamp\":\"%s\",\"int\":%s}".format(df.format(new Date()), int.toString)
    val bytes = encoder.encode(CharBuffer.wrap(msg))
    encoder.flush(bytes)
    kinesis.putRecord("PrimaryEventStream", bytes, "123")
    msg
  }
}
This code works with Scala Futures.
import scala.concurrent.future
import scala.concurrent.ExecutionContext.Implicits.global
def doIt(x: Int) = {Thread.sleep(1000); x + 1}
(1 to 10).map(x => future{doIt(x)}).map(y => y.onSuccess({case x => println(x)}))
You'll note that the syntax is nearly identical in the mapping of the sequences. However, the following does not work (i.e., it neither prints to the console nor sends data down my pipeline).
import com.amazonaws.services.kinesis.AmazonKinesisClient
import java.nio.CharBuffer
import java.nio.charset.Charset
import java.text.SimpleDateFormat
import java.util.{Date, TimeZone}
import scala.concurrent.future
import scala.concurrent.ExecutionContext.Implicits.global
object KinesisDummyDataProducer extends App {
  val kinesis = new AmazonKinesisClient(PipelineConfig.awsCredentials)
  println("Connected")

  lazy val encoder = Charset.forName("UTF-8").newEncoder()
  lazy val tz = TimeZone.getTimeZone("UTC")
  lazy val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'Z")
  df.setTimeZone(tz)

  (1 to args(0).toInt).map(int => future { send(int) }).map(f => f.onSuccess({ case msg => println(msg) }))

  private def send(int: Int) = {
    val msg = "{\"event_name\":\"test\",\"timestamp\":\"%s\",\"int\":%s}".format(df.format(new Date()), int.toString)
    val bytes = encoder.encode(CharBuffer.wrap(msg))
    encoder.flush(bytes)
    kinesis.putRecord("PrimaryEventStream", bytes, "123")
    msg
  }
}
Some more notes about this project. I am using Maven to do the build (from the command line), and running all of the above code (also from the command line) works just dandy.
My question is: why, when using the same syntax, does my function 'send' appear not to execute?