// I am getting an exception while running a Hadoop jar that converts PDF to text and feeds it to the mapper
java.lang.Exception: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:489)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:549)
Caused by: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:125)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:270)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
17/06/08 19:12:10 INFO mapreduce.Job: Job job_local815278758_0001 running in uber mode : false
17/06/08 19:12:10 INFO mapreduce.Job: map 0% reduce 0%
17/06/08 19:12:10 INFO mapreduce.Job: Job job_local815278758_0001 failed with state FAILED due to: NA
17/06/08 19:12:10 INFO mapreduce.Job: Counters: 0
//Mapper class
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends
Mapper<Object, Object, Object, Object> {
private Text word = new Text();
private final static LongWritable one = new LongWritable(1);
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.progress();
context.write(word, one);
}
}
}
//Reducer class
package com.amal.pdf;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends
Reducer<Object, Object, Object, Object> {
protected void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (LongWritable value : values) {
sum += value.get();
}
context.write(key, new LongWritable(sum));
}
}
//PDF record Reader class
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class PdfRecordReader extends RecordReader<Object, Object> {
private String[] lines = null;
private LongWritable key = null;
private Text value = null;
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
final Path file = split.getPath();
/*
* The below code contains the logic for opening the file and seek to
* the start of the split. Here we are applying the Pdf Parsing logic
*/
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
PDDocument pdf = null;
String parsedText = null;
PDFTextStripper stripper;
pdf = PDDocument.load(fileIn);
stripper = new PDFTextStripper();
//getting exception because of this line
parsedText = stripper.getText(pdf);
this.lines = parsedText.split("\n");
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (key == null) {
key = new LongWritable();
key.set(1);
value = new Text();
value.set(lines[0]);
} else {
int temp = (int) key.get();
if (temp < (lines.length - 1)) {
int count = (int) key.get();
value = new Text();
value.set(lines[count]);
count = count + 1;
key = new LongWritable(count);
} else {
return false;
}
}
if (key == null || value == null) {
return false;
} else {
return true;
}
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
@Override
public void close() throws IOException {
}
}
// One more thing: can anyone help me create a runnable jar? The run configuration is not showing inside Eclipse because the main class is written for the Hadoop environment.
The error on your console says:
Caused by: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
That means the key-value types your mapper emits don't match the types it declares (and the types the job expects).
Your mapper class should look something like this:
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private Text word = new Text();
private final static IntWritable one = new IntWritable(1);
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.progress();
context.write(word, one);
}
}
}
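The reducer and the driver have to agree with these types as well: the WordCountReducer you posted sums LongWritable values, so either keep LongWritable throughout or change the reducer generics to Reducer<Text, IntWritable, Text, IntWritable>. The driver should also declare the map output classes explicitly. Below is only a minimal driver sketch, not your actual code: PdfInputFormat is assumed to be the input format wrapping your PdfRecordReader, and WordCountDriver is a placeholder name.
// Hypothetical driver; adjust class names and paths to your project.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "pdf word count");
        job.setJarByClass(WordCountDriver.class);
        job.setInputFormatClass(PdfInputFormat.class); // assumed wrapper around PdfRecordReader
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Map output types must match what the mapper actually writes.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types written by the reducer.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
For the runnable-jar question: once a class like this holds the main method, you can select it as the main class when exporting the jar from Eclipse and then run it with hadoop jar.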
Related
When I launch a simple Storm script in IntelliJ IDEA or Eclipse, I encounter this problem: "Unable to canonicalize address localhost/:2000 because it's not resolvable".
import org.apache.storm.*;
import org.apache.storm.generated.*;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.*;
import org.apache.storm.utils.Utils;
import java.util.Map;
import java.util.Random;
public class ExclamationExample {
public static class TestWordSpout extends BaseRichSpout {
private SpoutOutputCollector _collector;
@Override
public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
_collector = spoutOutputCollector;
}
@Override
public void nextTuple() {
Utils.sleep(100);
final String[] words = new String[]{"nathan", "mike", "jackson", "golda", "bertels"};
final Random rand = new Random();
final String word = words[rand.nextInt(words.length)];
_collector.emit(new Values(word));
System.out.printf("Word spout: %s\n", word);
}
@Override
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("word"));
}
}
public static class ExclamationBolt extends BaseRichBolt {
private OutputCollector collector;
@Override
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
this.collector = collector;
}
@Override
public void execute(Tuple tuple) {
String val = tuple.getString(0) + "!!!";
collector.emit(tuple, new Values(val));
System.out.printf("Exclamation bolt: %s\n", val);
collector.ack(tuple);
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("word"));
}
}
//TOPOLOGY
public static void main(String[] args) throws Exception {
Config conf = new Config();
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("word", new TestWordSpout(), 2);
builder.setBolt("exclaim1", new ExclamationBolt(), 2).shuffleGrouping("word");
builder.setBolt("exclaim2", new ExclamationBolt(), 2).shuffleGrouping("exclaim1");
conf.setDebug(false);
String topologyName = "test";
conf.setNumWorkers(2);
//If there are arguments, we are running on a cluster
if (args != null && args.length > 0) {
conf.setNumWorkers(3);
topologyName = args[0];
StormSubmitter.submitTopology(topologyName, conf, builder.createTopology());
}
try (LocalCluster cluster = new LocalCluster()) {
cluster.submitTopology(topologyName, conf, builder.createTopology());
Utils.sleep(600000); // wait [param] ms
cluster.killTopology(topologyName);
cluster.shutdown();
} catch (Exception e) {
e.printStackTrace();
}
}
}
What is a possible solution?
I resolved my problem. In case it helps someone, here is the solution: I was running the Java application with JDK 17, which caused the problem. I created a new project with JDK 1.8 and everything works!
I'm making a Flink CEP application that reads data from Kafka. When I try to catch patterns, the sink operation does not fire if no data arrives after the match. For example, I expect A -> B -> C as a pattern, and A, B, C arrive from Kafka. However, for the sink I added in the pattern process function to fire, the data coming from Kafka has to look like A, B, C, X. How do I fix this problem? Please help.
READ KAFKA
DataStream<String> dataStream = env.addSource(KAFKA).assignTimestampsAndWatermarks(WatermarkStrategy
.forBoundedOutOfOrderness(Duration.ofSeconds(0)));
dataStream.print("DS:"); //to see every incoming data
PATTERN
Pattern<Event, ?> pattern = Pattern.<Event>begin("start").where(
new SimpleCondition<Event>() {
@Override
public boolean filter(Event event) {
return event.actionId.equals("2.24");
}
}
).next("middle").where(
new SimpleCondition<Event>() {
@Override
public boolean filter(Event event) {
return event.actionId.equals("2.24");
}
}
).within(Time.seconds(5));
CEP And Sink
PatternStream<Event> patternStream = CEP.pattern(eventStringKeyedStream, pattern);
patternStream.process(new PatternProcessFunction<Event, Event>() {
@Override
public void processMatch(Map<String, List<Event>> map, Context context, Collector<Event> collector) throws Exception {
collector.collect(map.get("start").get(0));
}
}).print();//or sink function
My Program RESULT
DS::2> {"ActionID":"2.24"}
DS::2> {"ActionID":"2.24"}
DS::2> {"ActionID":"2.25"}
4> {ActionID='2.24'}
I was expecting
DS::2> {"ActionID":"2.24"}
DS::2> {"ActionID":"2.24"}
4> {ActionID='2.24'}
So why does it only produce a result when one more record arrives after the pattern conditions are met, rather than as soon as they are met?
Please help me.
EDIT
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.functions.PatternProcessFunction;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.List;
import java.util.Map;
public class EventTimePattern {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<String> input = env.socketTextStream("localhost",9999)
.map(new MapFunction<String, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map (String value) throws Exception {
String[] fields = value.split(",");
if (fields.length == 2) {
return new Tuple2<String, Long>(
fields[0] ,
Long.parseLong(fields[1]));
}
return null;
}
})
/* env.fromElements(
Tuple2.of("A", 5L),
Tuple2.of("A", 10L)
)*/
.assignTimestampsAndWatermarks(
WatermarkStrategy
.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofMillis(0))
.withTimestampAssigner((event, timestamp) -> event.f1))
.map(event -> event.f0);
Pattern<String, ?> pattern =
Pattern.<String>begin("start")
.where(
new SimpleCondition<String>() {
@Override
public boolean filter(String value) throws Exception {
return value.equals("A");
}
})
.next("end")
.where(
new SimpleCondition<String>() {
@Override
public boolean filter(String value) throws Exception {
return value.equals("A");
}
})
.within(Time.seconds(5));
input.print("I");
DataStream<String> result =
CEP.pattern(input, pattern)
.process(new PatternProcessFunction<String, String>() {
@Override
public void processMatch(
Map<String, List<String>> map,
Context context,
Collector<String> out) throws Exception {
StringBuilder builder = new StringBuilder();
builder.append(map.get("start").get(0))
.append(",")
.append(map.get("end").get(0));
out.collect(builder.toString());
}
});
result.print();
env.execute();
}
}
I failed to reproduce your problem. Here's a similar example that works fine (I used Flink 1.12.2):
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.functions.PatternProcessFunction;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.List;
import java.util.Map;
public class EventTimePattern {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<String> input =
env.fromElements(
Tuple2.of("A", 5L),
Tuple2.of("A", 10L)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy
.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofMillis(0))
.withTimestampAssigner((event, timestamp) -> event.f1))
.map(event -> event.f0);
Pattern<String, ?> pattern =
Pattern.<String>begin("start")
.where(
new SimpleCondition<String>() {
@Override
public boolean filter(String value) throws Exception {
return value.equals("A");
}
})
.next("end")
.where(
new SimpleCondition<String>() {
@Override
public boolean filter(String value) throws Exception {
return value.equals("A");
}
})
.within(Time.seconds(5));
DataStream<String> result =
CEP.pattern(input, pattern)
.process(new PatternProcessFunction<String, String>() {
@Override
public void processMatch(
Map<String, List<String>> map,
Context context,
Collector<String> out) throws Exception {
StringBuilder builder = new StringBuilder();
builder.append(map.get("start").get(0))
.append(",")
.append(map.get("end").get(0));
out.collect(builder.toString());
}
});
result.print();
env.execute();
}
}
Please share a simple, complete, reproducible example that illustrates the problem you're having.
I have installed Hadoop 2.6 on CentOS 7 and it's running fine. But when I run a jar exported from Eclipse, it gives the following error:
[root@myspark ~]# hadoop jar fengcount.jar intput output1
17/05/26 21:24:51 INFO client.RMProxy: Connecting to ResourceManager at myspark/192.168.44.100:8032
17/05/26 21:24:53 INFO mapreduce.JobSubmitter: Cleaning up the staging area /tmp/hadoop-yarn/staging/root/.staging/job_1495765615548_0004
Exception in thread "main" org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: hdfs://myspark:54310/user/root/intput
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:321)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:264)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:385)
at org.apache.hadoop.mapreduce.JobSubmitter.writeNewSplits(JobSubmitter.java:302)
at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:319)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:197)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1297)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1294)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1692)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1294)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1315)
at hdfs.hadoop_hdfs.fengcount.main(fengcount.java:39)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
The file input/test1.txt actually exists:
[root@myspark ~]# hdfs dfs -ls -R
drwxr-xr-x - root supergroup 0 2017-05-26 21:02 input
-rw-r--r-- 1 root supergroup 16 2017-05-24 01:57 input/test1.txt
My code:
package hdfs.hadoop_hdfs;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class fengcount {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// TODO Auto-generated method stub
Configuration conf=new Configuration();
String[] otherargs=new GenericOptionsParser(conf,args).getRemainingArgs();
if (otherargs.length!=2) {
System.err.println("Usage:fengcount<int><out>");
System.exit(2);
}
@SuppressWarnings("deprecation")
Job job=new Job(conf, "fengcount");
job.setJarByClass(fengcount.class);
job.setMapperClass(TokerizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherargs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));
System.exit(job.waitForCompletion(true)?0:1);
}
// mapper class
public static class TokerizerMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
// TODO Auto-generated method stub
System.out.println("key=" + key.toString());
System.out.println("value=" + value.toString());
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
//reduce process
public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
@Override
public void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
//mapreduce process
}
From the error log I can see the input path hdfs://myspark:54310/user/root/intput, which I suspect is incorrect: the command passed intput, but the directory that exists in HDFS is input. Re-run it as hadoop jar fengcount.jar input output1 and the input path should resolve.
Good luck!
Can anyone please tell me how I can get the current attempt count in Quartz?
Example: if the Quartz scheduler is started with a repeat count of 5, I want to get the current repeat count.
Here is the Example I am trying with
public class SimpleTriggerExample implements Job
{
int count = 0;
JobDetail job = null;
JobDataMap data = null;
public static void main( String[] args ) throws Exception
{
new SimpleTriggerExample().schedule();
}
public void schedule() throws ParseException, SchedulerException{
job = JobBuilder.newJob(SimpleTriggerExample.class)
.withIdentity("dummyJobName", "group1").build();
Trigger trigger = TriggerBuilder
.newTrigger()
.withIdentity("dummyTriggerName", "group1")
.withSchedule(SimpleScheduleBuilder.simpleSchedule()
.withIntervalInSeconds(10).withRepeatCount(3))
.build();
System.out.println("before in main jobdatamap");
Scheduler scheduler = new StdSchedulerFactory().getScheduler();
scheduler.start();
scheduler.scheduleJob(job, trigger);
}
public void execute(JobExecutionContext context)
throws JobExecutionException {
//count
data = context.getJobDetail().getJobDataMap();
System.out.println("after jobdatamap");
int count1 = data.getInt("EXECUTION_COUNT");
System.out.println("count1-->before"+count1);
count1++;
System.out.println("count1-->after"+count1);
job.getJobDataMap().put("EXECUTION_COUNT", count1);
count = count1;
System.out.println("count"+count);
}
}
Use a JobDataMap along with the @PersistJobDataAfterExecution annotation.
Make sure that when you modify data in the JobDataMap you write it back under the same key.
If you do this, you can persist your attempt count across executions as required.
Example Code Snippet:
package com.mss.quartz.demo;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.quartz.InterruptableJob;
import org.quartz.Job;
import org.quartz.JobBuilder;
import org.quartz.JobDataMap;
import org.quartz.JobDetail;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.JobKey;
import org.quartz.PersistJobDataAfterExecution;
import org.quartz.Scheduler;
import org.quartz.SchedulerContext;
import org.quartz.SchedulerException;
import org.quartz.SimpleScheduleBuilder;
import org.quartz.TriggerBuilder;
import org.quartz.UnableToInterruptJobException;
import org.quartz.impl.StdSchedulerFactory;
@PersistJobDataAfterExecution
public class HelloJob implements InterruptableJob
{
SchedulerContext schedulerContext = null;
testQuartz test = new testQuartz();
boolean result;
private boolean _interrupted = false;
private JobKey _jobKey = null;
Thread t = null;
//public static int count = 0;
public void interrupt() throws UnableToInterruptJobException {
System.out.println("---" + this._jobKey + " -- INTERRUPTING --");
this._interrupted = true;
}
public void execute(JobExecutionContext context)
throws JobExecutionException {
Scheduler scd = context.getScheduler();
JobDataMap dataMap = context.getJobDetail().getJobDataMap();
String jobSays = dataMap.getString("test1");
int myFloatValue = dataMap.getIntValue("id");
System.out.println("In Job Class"+jobSays+ " "+myFloatValue+" Current time in
Job class "+new Date().toString());
JobKey jobKey = context.getJobDetail().getKey();
int attemps = dataMap.getInt("attempts");
attemps++;
dataMap.put("attempts", attemps);
System.out.println("After putting count in job data map:"+dataMap.get("attempts"));
}
}
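Note that the snippet above assumes the "attempts" key is already present in the JobDataMap on the first execution. Below is only a minimal sketch of seeding it when the job is scheduled; the job/trigger identities and initial values are placeholders, not part of the original code.
// Hypothetical scheduling code that seeds the JobDataMap read by HelloJob.
JobDetail job = JobBuilder.newJob(HelloJob.class)
        .withIdentity("helloJob", "group1")
        .usingJobData("attempts", 0)       // seed the counter so dataMap.getInt("attempts") works on the first run
        .usingJobData("test1", "hello")    // values read in execute()
        .usingJobData("id", 1)
        .build();
Trigger trigger = TriggerBuilder.newTrigger()
        .withIdentity("helloTrigger", "group1")
        .withSchedule(SimpleScheduleBuilder.simpleSchedule()
                .withIntervalInSeconds(10)
                .withRepeatCount(5))
        .build();
Scheduler scheduler = new StdSchedulerFactory().getScheduler();
scheduler.start();
scheduler.scheduleJob(job, trigger);
Because the job class carries @PersistJobDataAfterExecution, the incremented "attempts" value written back into the JobDetail's JobDataMap is what the next execution sees.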
Try adding the @PersistJobDataAfterExecution annotation to the SimpleTriggerExample class:
@PersistJobDataAfterExecution
public class SimpleTriggerExample implements Job
{ ...}
I have each record spread across multiple lines in the input file (a very large file).
Ex:
Id: 2
ASIN: 0738700123
title: Test tile for this product
group: Book
salesrank: 168501
similar: 5 0738700811 1567184912 1567182813 0738700514 0738700915
categories: 2
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Wicca[12484]
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Witchcraft[12486]
reviews: total: 12 downloaded: 12 avg rating: 4.5
2001-12-16 cutomer: A11NCO6YTE4BTJ rating: 5 votes: 5 helpful: 4
2002-1-7 cutomer: A9CQ3PLRNIR83 rating: 4 votes: 5 helpful: 5
How do I identify and process each multi-line record in Spark?
If the multi-line data has a defined record separator, you could use the Hadoop support for multi-line records, providing the separator through a Hadoop Configuration object.
Something like this should do:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
val conf = new Configuration
conf.set("textinputformat.record.delimiter", "id:")
val dataset = sc.newAPIHadoopFile("/path/to/data", classOf[TextInputFormat], classOf[LongWritable], classOf[Text], conf)
val data = dataset.map(x=>x._2.toString)
This will provide you with an RDD[String] where each element corresponds to a record. Afterwards you need to parse each record following your application requirements.
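As a rough illustration of that parsing step (a sketch only, written in plain Java; the field layout follows the sample record above, and the assumption that field lines look like "key: value" is mine), each record string could be broken into fields like this:
// Hypothetical per-record parser; assumes lines of the form "key: value".
import java.util.HashMap;
import java.util.Map;
public class RecordParser {
    public static Map<String, String> parse(String record) {
        Map<String, String> fields = new HashMap<>();
        for (String line : record.split("\n")) {
            int idx = line.indexOf(':');
            if (idx > 0) {
                fields.put(line.substring(0, idx).trim(), line.substring(idx + 1).trim());
            }
        }
        return fields;
    }
}
You would call something like RecordParser.parse(...) inside the map over the RDD and then pick out the fields your application needs (Id, ASIN, title, and so on).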
I have done this by implementing a custom input format and record reader.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class ParagraphInputFormat extends TextInputFormat {
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) {
return new ParagraphRecordReader();
}
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class ParagraphRecordReader extends RecordReader<LongWritable, Text> {
private long end;
private boolean stillInChunk = true;
private LongWritable key = new LongWritable();
private Text value = new Text();
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private byte[] endTag = "\n\r\n".getBytes();
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
FileSplit split = (FileSplit) inputSplit;
Configuration conf = taskAttemptContext.getConfiguration();
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
fsin = fs.open(path);
long start = split.getStart();
end = split.getStart() + split.getLength();
fsin.seek(start);
if (start != 0) {
readUntilMatch(endTag, false);
}
}
public boolean nextKeyValue() throws IOException {
if (!stillInChunk) return false;
boolean status = readUntilMatch(endTag, true);
value = new Text();
value.set(buffer.getData(), 0, buffer.getLength());
key = new LongWritable(fsin.getPos());
buffer.reset();
if (!status) {
stillInChunk = false;
}
return true;
}
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return key;
}
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
public float getProgress() throws IOException, InterruptedException {
return 0;
}
public void close() throws IOException {
fsin.close();
}
private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
if (b == -1) return false;
if (withinBlock) buffer.write(b);
if (b == match[i]) {
i++;
if (i >= match.length) {
return fsin.getPos() < end;
}
} else i = 0;
}
}
}
endTag identifies the end of each record.
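To actually consume these records you still need to plug ParagraphInputFormat into the job. Below is only a sketch of doing that from Spark's Java API; only ParagraphInputFormat comes from the code above, while the application name and input path are placeholders.
// Hypothetical Spark driver using the custom input format; each RDD element is one whole record.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
public class ParagraphReadExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("multi-line-records"));
        Configuration conf = new Configuration();
        JavaPairRDD<LongWritable, Text> raw = sc.newAPIHadoopFile(
                "/path/to/data",            // placeholder input path
                ParagraphInputFormat.class,
                LongWritable.class,
                Text.class,
                conf);
        // Convert to String immediately, since Hadoop may reuse the Text objects.
        JavaRDD<String> records = raw.map(pair -> pair._2().toString());
        records.take(5).forEach(System.out::println);
        sc.stop();
    }
}
In plain MapReduce, the equivalent wiring would be job.setInputFormatClass(ParagraphInputFormat.class) in the driver.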