Apache Beam join using the Java extensions library (beam-sdks-java-extensions-join-library) - apache-beam

I am using the code below to join data in Apache Beam in Java (without CoGroupByKey); the Beam version is 2.19. But I am getting a NullPointerException.
Any idea what I am doing wrong here? Below is the code.
public class InnerJoinExample {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create();

        // left input: schema with fields a1, a2 and a single row
        Schema left = Schema.builder().addStringField("a1")
                .addStringField("a2")
                .build();
        Row row1 = Row.withSchema(left)
                .addValues("key1", "5")
                .build();
        PCollection<Row> leftPCollection = p.apply(Create.of(row1)).setRowSchema(left);

        // right input: schema with fields b1, b2 and a single row
        Schema right = Schema.builder().addStringField("b1")
                .addStringField("b2")
                .build();
        Row row2 = Row.withSchema(right)
                .addValues("key1", "15")
                .build();
        PCollection<Row> rightCollection = p.apply(Create.of(row2)).setRowSchema(right);

        // schema-based inner join without CoGroupByKey
        PCollection<Row> joined = leftPCollection.apply(Join.innerJoin(rightCollection));

        p.run();
    }
}
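Not from the original post, but one hedged observation: with the schema-aware Join transform, innerJoin(right) without an explicit condition joins on identically named fields, and the two schemas above (a1/a2 versus b1/b2) share none, so the join condition usually has to be spelled out. A minimal sketch, assuming the fluent .on(FieldsEqual...) form of org.apache.beam.sdk.schemas.transforms.Join available in recent 2.x releases:
// hedged sketch: explicitly join the left field "a1" to the right field "b1"
PCollection<Row> joined =
    leftPCollection.apply(
        Join.<Row, Row>innerJoin(rightCollection)
            .on(Join.FieldsEqual.left("a1").right("b1")));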

Related

How to merge two streams and perform stateful operations on merged stream using Apache Beam

I have 2 Kafka streams that I want to merge by some key, and on top of the merged stream I want to perform a stateful operation so that I can sum up the counts from both streams.
This is what I tried, but it didn't work:
PCollection<String> stream1 = .. read from kafka
PCollection<String> stream2 = .. read from kafka
PCollection<KV<String,Long>> wordCount1 = stream1.apply(...)
PCollection<KV<String,Long>> wordCount2 = stream2.apply(...)
PCollection<KV<String,Long>> merged = merge wordCount1 and wordCount2 using CoGroupByKey
PCollection<KV<String,Long>> finalStream = merged.apply(...)
apply state to finalStream
public class KafkaWordCount implements Serializable {
private String kafkaBrokers =null;
private String topic =null;
public KafkaWordCount(String brokers, String topic){
this.kafkaBrokers =brokers;
this.topic =topic;
}
public PCollection<KV<String,Long>> build(Pipeline p){
final String myState="HELLO";
PCollection<KV<String,Long>> res =
p.apply(KafkaIO.<Long, String>read()
.withBootstrapServers(this.kafkaBrokers )
.withTopic(this.topic)
.withKeyDeserializer(LongDeserializer.class)
.withValueDeserializer(StringDeserializer.class))
.apply(ParDo.of(new DoFn<KafkaRecord<Long, String>, String>() {
@ProcessElement
public void processElement(ProcessContext processContext) {
KafkaRecord<Long, String> record = processContext.element();
processContext.output(record.getKV().getValue());
}
}))
.apply("ExtractWords",
ParDo.of(new DoFn<String, KV<String, Long>>() {
@ProcessElement
public void processElement(ProcessContext c) {
for (String word : c.element().split("[^\\p{L}]+")) {
if (!word.isEmpty()) {
c.output(KV.of(word,1L));
}
}
}
}));
return res;
}
}
public class DataPipe {
public static void main(String[] args) {
final String stateId = "myMapState";
final String myState = "myState";
PipelineOptions options = PipelineOptionsFactory.create();
options.as(FlinkPipelineOptions.class).setRunner(FlinkRunner.class);
Pipeline p = Pipeline.create(options);
PCollection<KV<String,Long>> stream1 =
new KafkaWordCount("localhost:9092","idm")
.build(p)
.apply(
Window
.<KV<String,Long>>into(
FixedWindows.of(Duration.millis(3600000)))
.triggering(
Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
.withAllowedLateness(Duration.ZERO)
.discardingFiredPanes());
PCollection<KV<String,Long>> stream2 =
new KafkaWordCount("localhost:9092","assist")
.build(p)
.apply(
Window
.<KV<String,Long>>into(
FixedWindows.of(Duration.millis(3600000)))
.triggering(
Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
.withAllowedLateness(Duration.ZERO)
.discardingFiredPanes());
final TupleTag<Long> web = new TupleTag<Long>();
final TupleTag<Long> assist = new TupleTag<Long>();
PCollection<KV<String, CoGbkResult>> joinedStream =
KeyedPCollectionTuple.of(web, stream1)
.and(assist, stream2)
.apply(CoGroupByKey.<String>create());
PCollection<KV<String,Long>> finalCountStream =
joinedStream
.apply(ParDo.of(
new DoFn<KV<String, CoGbkResult>, KV<String,Long>>() {
@StateId(stateId)
private final StateSpec<MapState<String, Long>> mapState =
StateSpecs.map();
@ProcessElement
public void processElement(
ProcessContext processContext,
@StateId(stateId) MapState<String, Long> state) {
KV<String,CoGbkResult> element = processContext.element();
Iterable<Long> count1 = element.getValue().getAll(web);
Iterable<Long> count2 = element.getValue().getAll(assist);
Long sumAmount =
StreamSupport
.stream(
Iterables
.concat(count1, count2)
.spliterator(),
false)
.collect(Collectors.summingLong(n -> n));
System.out.println(element.getKey()+"::"+sumAmount);
// processContext.output(element.getKey()+"::"+sumAmount);
Long currCount = state.get(element.getKey()).read() == null ? 0L : state.get(element.getKey()).read();
// note: newCount is computed but never used here; the running total is
// accumulated in the "finalState" step further down
Long newCount = currCount + sumAmount;
state.put(element.getKey(), sumAmount);
processContext.output(KV.of(element.getKey(), sumAmount));
}
}));
finalCountStream
.apply(ParDo.of(new DoFn<KV<String,Long>, KV<String,Long>>() {
@ProcessElement
public void processElement(ProcessContext processContext) {
processContext.output(processContext.element());
}
}))
.apply("finalState", ParDo.of(new DoFn<KV<String,Long>, String>() {
@StateId(myState)
private final StateSpec<MapState<String, Long>> mapState =
StateSpecs.map();
@ProcessElement
public void processElement(
ProcessContext c,
@StateId(myState) MapState<String, Long> state) {
KV<String,Long> e = c.element();
System.out.println("Thread ID :"
+ Thread.currentThread().getId());
Long currCount =
state.get(e.getKey()).read()==null
? 0L
: state.get(e.getKey()).read();
Long newCount = currCount+e.getValue();
state.put(e.getKey(),newCount);
c.output(e.getKey()+":"+newCount);
}
}))
.apply(KafkaIO.<Void, String>write()
.withBootstrapServers("localhost:9092")
.withTopic("test")
.withValueSerializer(StringSerializer.class)
.values());
/* finalCountStream.apply(KafkaIO.<Void, String>write()
.withBootstrapServers("localhost:9092")
.withTopic("test")
.withValueSerializer(StringSerializer.class)
.values()
);*/
//finalCountStream.apply(TextIO.write().to("wordcounts"));
p.run().waitUntilFinish();
}
}
This Beam pipeline reads text from two Kafka streams, splits it into words, merges both streams by word, and finally emits the word counts from both streams to another Kafka topic.
For reference: if I understand the problem correctly, you can simplify the first part of your pipeline by using KafkaIO.withTopics(List<String>) to read from two (or more) topics in one step, so there is no need to join the data from the different topics afterwards.
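A minimal sketch of that suggestion (broker address and topic names taken from the question; the rest of the word-count pipeline stays as it is):
// hedged sketch: one KafkaIO source reading both topics
PCollection<String> lines =
    p.apply(KafkaIO.<Long, String>read()
            .withBootstrapServers("localhost:9092")
            .withTopics(Arrays.asList("idm", "assist"))    // both topics in one source
            .withKeyDeserializer(LongDeserializer.class)
            .withValueDeserializer(StringDeserializer.class)
            .withoutMetadata())                            // KV<Long, String> instead of KafkaRecord
        .apply(Values.create());                           // keep only the message payload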

How to add spaces before delimiter for each column using spring batch?

I need to add spaces before the delimiter for each column, based on conditions, when writing from a database to a CSV file using Spring Batch.
For example, I need the spacing in the CSV file to look like this:
Column1;Column2;Column3;Column4
AI22;FIENC234DK;EDEJJEDK;JEND4
OR2 ;JFJRN3D ;DEDERF3E;FEF
EK3R;DJE3DJJEJE;JDJENEJ ;3NEN3
Here is the block of code I am currently running:
@Bean
ItemReader<CdVehicle> databaseToCsvItemReader() {
JdbcCursorItemReader<CdVehicle> databaseReader = new JdbcCursorItemReader<>();
databaseReader.setDataSource(dataSource);
databaseReader.setSql(QUERY_FIND_FRIENDS);
databaseReader.setRowMapper(new BeanPropertyRowMapper<>(CdVehicle.class));
return databaseReader;
}
@Bean
ItemWriter<CdVehicle> databaseToCsvItemWriter() {
FlatFileItemWriter<CdVehicle> csvFileWriter = new FlatFileItemWriter<>();
String exportFileHeader = "CV20VEHID;CV20RFDPR;CV20USAGE;CV20CRVEH;CV20PERID;CV20SITE;CV27PROJ;CV27TYCAI;CV27PHASE;CV27MILLE;CV27CRV;O;CV20DTECL;CV20TEINT;CV20PRINCIPALE;CV20SECONDAIRE;CV20CRBIS;CV36TYPRX";
StringHeaderWriter headerWriter = new StringHeaderWriter(exportFileHeader);
csvFileWriter.setHeaderCallback(headerWriter);
String userhome = System.getProperty("user.home");
String exportFilePath = userhome + "/tmp/cdv.txt";
csvFileWriter.setResource(new FileSystemResource(exportFilePath));
LineAggregator<CdVehicle> lineAggregator = newPersonLineAggregator();
csvFileWriter.setLineAggregator(lineAggregator);
return csvFileWriter;
}
@Bean
public Step databaseToCsvStep() {
return stepBuilderFactory.get("databaseToCsvStep")
.<CdVehicle, CdVehicle>chunk(100)
.reader(databaseToCsvItemReader())
.writer(databaseToCsvItemWriter())
.build();
}
@Bean
Job databaseToCsvJob() {
return jobBuilderFactory.get("databaseToCsvJob")
.incrementer(new RunIdIncrementer())
.flow(databaseToCsvStep())
.end()
.build();
}
private LineAggregator<CdVehicle> newPersonLineAggregator() {
DelimitedLineAggregator<CdVehicle> lineAggregator = new DelimitedLineAggregator<>();
lineAggregator.setDelimiter(";");
FieldExtractor<CdVehicle> fieldExtractor = newPersonFieldExtractor();
lineAggregator.setFieldExtractor(fieldExtractor);
return lineAggregator;
}
private FieldExtractor<CdVehicle> newPersonFieldExtractor() {
BeanWrapperFieldExtractor<CdVehicle> extractor = new BeanWrapperFieldExtractor<>();
extractor.setNames(new String[] {"CV20VEHID","CV20RFDPR","CV20USAGE","CV20CRVEH","CV20DTAPP","CV20PERID","CV20SITE","CV27PROJ","CV27TYCAI","CV27PHASE","CV27MILLE","CV27CRV","CV27TYMOT","O","CV20DTECL","CV20TEINT","CV20PRINCIPALE", "CV20SECONDAIRE","CV20CRBIS","CV36TYPRX"});
return extractor;
}
After running the Spring Batch job I am getting, for example, this result:
Column1;Column2;Column3;Column4
AI22;FIENC234DK;EDEJJEDK;JEND4
OR2;JFJRN3D;DEDERF3E;FEF
EK3R;DJE3DJJEJE;JDJENEJ;3NEN3
In order to achieve the spacing, how can I check column by column with conditions? Does anyone have an idea how to check a condition by column header name and add the space before the delimiter during iteration?
You can take advantage of String Formatter syntax by using the Spring Batch FormatterLineAggregator. Just replace your existing LineAggregator implementation with the following:
private LineAggregator<CdVehicle> newPersonLineAggregator() {
    FormatterLineAggregator<CdVehicle> lineAggregator = new FormatterLineAggregator<>();
    lineAggregator.setFieldExtractor(newPersonFieldExtractor());
    // left-justify each field to a fixed width so the delimiter lines up
    // (widths 4, 10, 8 and 5 match the sample output above)
    lineAggregator.setFormat("%-4s;%-10s;%-8s;%-5s");
    return lineAggregator;
}
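If the padding really has to be decided per column (the "check by column header name" part of the question), a custom LineAggregator is another option. A minimal sketch, not from the original answer; the widths array is illustrative and needs one entry per extracted field:
private LineAggregator<CdVehicle> paddedLineAggregator() {
    final int[] widths = {4, 10, 8, 5};                   // illustrative per-column widths
    final FieldExtractor<CdVehicle> extractor = newPersonFieldExtractor();
    return new LineAggregator<CdVehicle>() {
        @Override
        public String aggregate(CdVehicle item) {
            Object[] fields = extractor.extract(item);
            StringBuilder line = new StringBuilder();
            for (int i = 0; i < fields.length; i++) {
                if (i > 0) {
                    line.append(';');
                }
                // left-justify and pad with spaces up to the column width
                line.append(String.format("%-" + widths[i] + "s", fields[i]));
            }
            return line.toString();
        }
    };
}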

How to get table metadata from camel-sql component

I'm looking for a way to get all the column metadata for a given table name using the camel-sql component.
Though it uses spring-jdbc behind the scenes, I do not see a way to get the ResultSetMetaData.
I couldn't find a direct way to get the column details from the camel-sql component. For now I managed to get the information using a Spring JdbcTemplate and the DataSource:
public List<String> getColumnNamesFromTable(final TableData tableData) throws MetaDataAccessException {
final List<String> columnNames = new ArrayList<String>();
JdbcTemplate jdbcTemplate = new JdbcTemplate(dataSource);
StringBuilder query = new StringBuilder();
query.append("SELECT * FROM ").append(SINGLE_BLANK_SPACE);
query.append(tableData.getSchemaName());
query.append(tableData.getTableName()).append(SINGLE_BLANK_SPACE);
query.append("WHERE rownum < 0");
jdbcTemplate.query(query.toString(), new ResultSetExtractor<Integer>() {
#Override
public Integer extractData(ResultSet rs) throws SQLException, DataAccessException {
ResultSetMetaData rsmd = rs.getMetaData();
int columnCount = rsmd.getColumnCount();
for (int i = 1; i <= columnCount; i++) {
columnNames.add(rsmd.getColumnName(i).toUpperCase());
}
return columnCount;
}
});
return columnNames;
}
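As a side note (not part of the original answer), the same column names can usually be read from plain JDBC DatabaseMetaData without issuing a dummy query. A minimal sketch, assuming direct access to the same dataSource and that the schema and table names match the database's case conventions:
public List<String> getColumnNamesViaMetaData(final TableData tableData) throws SQLException {
    final List<String> columnNames = new ArrayList<String>();
    try (Connection connection = dataSource.getConnection();
         ResultSet columns = connection.getMetaData().getColumns(
                 null, tableData.getSchemaName(), tableData.getTableName(), null)) {
        while (columns.next()) {
            columnNames.add(columns.getString("COLUMN_NAME").toUpperCase());
        }
    }
    return columnNames;
}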

Too many connections in MongoDB and Spark

My Spark Streaming application stores the data in MongoDB.
Unfortunately, each Spark worker opens too many connections while storing the data in MongoDB.
Following is my Spark - MongoDB code:
public static void main(String[] args) {
int numThreads = Integer.parseInt(args[3]);
String mongodbOutputURL = args[4];
String masterURL = args[5];
Logger.getLogger("org").setLevel(Level.OFF);
Logger.getLogger("akka").setLevel(Level.OFF);
// Create a Spark configuration object to establish connection between the application and spark cluster
SparkConf sparkConf = new SparkConf().setAppName("AppName").setMaster(masterURL);
// Configure the Spark microbatch with interval time
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(60*1000));
Configuration config = new Configuration();
config.set("mongo.output.uri", "mongodb://host:port/database.collection");
// Set the topics that should be consumed from Kafka cluster
Map<String, Integer> topicMap = new HashMap<String, Integer>();
String[] topics = args[2].split(",");
for (String topic: topics) {
topicMap.put(topic, numThreads);
}
// Establish the connection between kafka and Spark
JavaPairReceiverInputDStream<String, String> messages =
KafkaUtils.createStream(jssc, args[0], args[1], topicMap);
JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
JavaPairDStream<Object, BSONObject> save = lines.mapToPair(new PairFunction<String, Object, BSONObject>() {
@Override
public Tuple2<Object, BSONObject> call(String input) {
BSONObject bson = new BasicBSONObject();
bson.put("field1", input.split(",")[0]);
bson.put("field2", input.split(",")[1]);
return new Tuple2<>(null, bson);
}
});
// Store the records in database
save.saveAsNewAPIHadoopFiles("prefix","suffix" ,Object.class, Object.class, MongoOutputFormat.class, config);
jssc.start();
jssc.awaitTermination();
}
How can I control the number of connections at each worker?
Am I missing any configuration parameters?
Update 1:
I am using Spark 1.3 with Java API.
I was not able to use coalesce(), but I was able to apply a repartition(2) operation.
Now the number of connections is under control, but I think the connections are not being closed, or not reused, at the workers.
Please see the screenshot below:
Screenshot: streaming interval of 1 minute and 2 partitions
You can try mapPartitions, which works at the partition level instead of the record level, i.e. a task executing on one node will share one database connection instead of opening one for every record.
Also, I guess you can pre-partition the data (not the streaming RDD); Spark is smart enough to use this to reduce shuffles.
I was able to solve the issue by using foreachRDD.
I establish the connection and close it within each partition of every RDD in the DStream:
myRDD.foreachRDD(new Function<JavaRDD<String>, Void>() {
@Override
public Void call(JavaRDD<String> rdd) throws Exception {
rdd.foreachPartition(new VoidFunction<Iterator<String>>() {
@Override
public void call(Iterator<String> record) throws Exception {
MongoClient mongo = new MongoClient(server, port); // server and port defined elsewhere
DB db = mongo.getDB(database);
DBCollection targetTable = db.getCollection(collection);
BasicDBObject doc = new BasicDBObject();
while (record.hasNext()) {
String currentRecord = record.next();
String[] delim_records = currentRecord.split(",");
doc.append("column1", insert_time);
doc.append("column2", delim_records[1]);
doc.append("column3",delim_records[0]);
targetTable.insert(doc);
doc.clear();
}
mongo.close();
}
});
return null;
}
});
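If the remaining concern is that a connection is opened and closed for every partition (see Update 1 above), one common pattern, sketched here as an assumption rather than something from the original answers, is to cache a single MongoClient per executor JVM in a lazily initialized holder and reuse it from foreachPartition instead of calling mongo.close() each time:
public class MongoClientHolder {
    // one client per executor JVM; MongoClient keeps its own internal connection pool
    private static MongoClient client;
    public static synchronized MongoClient get(String host, int port) {
        if (client == null) {
            client = new MongoClient(host, port);
            // close the shared client only when the executor JVM shuts down
            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
                public void run() {
                    client.close();
                }
            }));
        }
        return client;
    }
}
Inside foreachPartition you would then call MongoClientHolder.get(...) and drop the per-partition mongo.close() call.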

The column name ... was not found in this ResultSet

We are using Java JDK 1.7.0_45 and the PostgreSQL JDBC driver postgresql-9.3-1100.jdbc41.jar.
Here is a synopsis of our problem, with as much of the code as possible pasted below.
This code:
ResultSet rs = DbConn.getInstance().doQuery("Select d.deptId from Depts d");
while (rs.next()){
System.out.println(rs.getInt("d.deptId"));
Produces the error:
org.postgresql.util.PSQLException: The column name d.deptId was not found in this ResultSet.
This code:
ResultSet rs = DbConn.getInstance().doQuery("Select d.deptId from Depts d");
while (rs.next()){
System.out.println(rs.getInt("deptId"));
Produces no error.
Is there a way, besides removing the "d." from the first query, to make the first code snippet not throw the error message?
Here is the source code:
public class JoinTest {
@Test
public void test(){
boolean pass = false;
try {
ResultSet rs = DbConn.getInstance().doQuery("Select d.deptId from Depts d");
String label = rs.getMetaData().getColumnLabel(1); // What do you get?
System.out.println("label = " + label);
while (rs.next()){
System.out.println(rs.getInt("d.deptId"));
pass = true;
}
} catch (SQLException e) {
e.printStackTrace();
pass=false;
}
assertTrue(pass);
}
@Test
public void test2(){
boolean pass = false;
try {
ResultSet rs = DbConn.getInstance().doQuery("Select d.deptId from Depts d");
while (rs.next()){
System.out.println(rs.getInt("deptId"));
pass = true;
}
} catch (SQLException e) {
e.printStackTrace();
pass=false;
}
assertTrue(pass);
}
}
public class DbConn {
private static String url = "jdbc:postgresql://server:port/schema";
private static Properties props = new Properties(); {
props.setProperty("user","userid");
props.setProperty("password","passwprd");
}
private Connection conn;
private DbConn(){}
private static DbConn instance;
public static DbConn getInstance() throws SQLException{
if (instance == null){
instance = new DbConn();
instance.conn = DriverManager.getConnection(url, props);
}
return instance;
}
public ResultSet doQuery(String query) throws SQLException{
Logger.log("DbConn.doQuery: " + query);
Statement st = conn.createStatement();
ResultSet rs = st.executeQuery(query);
return rs;
}
}
The query:
select d.deptId from Depts d
produces a single-column resultset with the result-alias "deptId". There is no "d.deptId" column. If you want one, you can request that as the column alias instead:
select d.deptId AS "d.deptId" from Depts d
PgJDBC can't do anything about this because it has no idea that the resultset column "deptId" is related to the "d.deptId" in the select-list. Teaching it about that would force it to understand way more about the SQL it processes than would be desirable, and lead to maintenance and performance challenges.
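For completeness, a minimal sketch of the alias approach with the label lookup left unchanged (same DbConn helper as above):
ResultSet rs = DbConn.getInstance()
        .doQuery("Select d.deptId AS \"d.deptId\" from Depts d");
while (rs.next()) {
    System.out.println(rs.getInt("d.deptId")); // the label is now literally "d.deptId"
}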
The second one works - why isn't that acceptable?
You can also do this:
System.out.println(rs.getInt(1));
If you change the query you have to change the code, too.