I am using Spring Batch Partitioning to merge data from group of related flat files to a single file. The batch is failing with below two issues:
First slave step thread is failing as the data to file writer is written before it is opened. The value for variable inputFileNames (step context data provided by partitioner) for this thread is[20002", 20003]
Second slave step thread is failing as the partitioning data is missing from the step context. The value for variable inputFileNames for this thread is null
Please let me know if I am missing some thing in the configuration.
// log with Error info
2015-12-26 17:59:14,165 DEBUG [SimpleAsyncTaskExecutor-1] c.d.d.b.r.ReaderConfiguration [ReaderBatchConfiguration.java:473] inputFileNames ----[20002", 20003]
2015-12-26 17:59:14,165 DEBUG [SimpleAsyncTaskExecutor-1] c.d.d.b.r.BatchConfiguration [BatchConfiguration.java:389] consumer ----p2
2015-12-26 17:59:14,275 ERROR [SimpleAsyncTaskExecutor-1] o.s.b.c.s.AbstractStep [AbstractStep.java:225] Encountered an error executing step testConsumersInputFileMergeStep in job testFileForInputJob
org.springframework.batch.item.WriterNotOpenException: Writer must be open before it can be written to
at org.springframework.batch.item.file.FlatFileItemWriter.write(FlatFileItemWriter.java:255) ~[spring-batch-infrastructure-3.0.3.RELEASE.jar:3.0.3.RELEASE]
2015-12-26 18:00:14,421 DEBUG [SimpleAsyncTaskExecutor-2] c.d.d.b.r.ReaderBatchConfiguration [ReaderConfiguration.java:474] inputFileNames ----null
// Partitioner
public class ProvisioningInputFilePartitioner implements Partitioner {
#Override
public Map<String, ExecutionContext> partition(int gridSize) {
Map<String, ExecutionContext> filesToProcess = getFilesToProcess(outboundSourceFolder);
Map<String, ExecutionContext> execCtxs = new HashMap<>();
for(Entry<String, ExecutionContext> entry : filesToProcess.entrySet()) {
execCtxs.put(entry.getKey(), entry.getValue());
}
return execCtxs;
}
private Map<String, ExecutionContext> getFilesToProcess(String outboundSourceFolder2) {
Map<String, ExecutionContext> contexts = new HashMap<>();
ExecutionContext execCtx1 = new ExecutionContext();
List<String> inputFileNames1 = Arrays.asList("20001", "22222");
execCtx1.put("consumer", "p1");
execCtx1.put("inputFileNames", inputFileNames1);
contexts.put("p1", execCtx1);
ExecutionContext execCtx2 = new ExecutionContext();
List<String> inputFileNames2 = Arrays.asList("20002", "20003");
execCtx1.put("consumer", "p2");
execCtx1.put("inputFileNames", inputFileNames2);
contexts.put("p2", execCtx2);
return contexts;
}
}
// Writer
#Bean
#StepScope
public ItemWriter<String> testConsumerFileItemWriter (#Value("#{stepExecutionContext[consumer]}") String consumer){
logger.debug("consumer ----"+ consumer);
FileSystemResource fileSystemResource = new FileSystemResource(new File(outboundSourceFolder, consumer + ".txt"));
FlatFileItemWriter<String> fileItemWriter = new FlatFileItemWriter<>();
fileItemWriter.setResource(fileSystemResource);
fileItemWriter.setLineAggregator(new PassThroughLineAggregator<String>());
return fileItemWriter;
}
#Bean
public Partitioner provisioningInputFilePartitioner() {
return new ProvisioningInputFilePartitioner();
}
#Bean
public TaskExecutor taskExecutor() {
return new SimpleAsyncTaskExecutor();
}
// Reader
#Bean
#StepScope
public ItemReader<String> testInputFilesReader (#Value("#{stepExecutionContext[inputFileNames]}") List<String> inputFileNames) {
logger.debug("inputFileNames ----" + inputFileNames);
MultiResourceItemReader<String> multiResourceItemReader = new MultiResourceItemReader<String>();
...
return multiResourceItemReader;
}
// slave step
#Bean
public Step testConsumersInputFileMergeStep(StepBuilderFactory stepBuilder, ItemReader<String> testInputFilesReader,
ItemWriter<String> testConsumerFileItemWriter){
return stepBuilder.get("testConsumersInputFileMergeStep").<String, String>chunk(1).reader(testInputFilesReader)
.writer(testConsumerFileItemWriter).build();
}
// master step
#Bean
public Step testConsumersFilePartitionerStep(StepBuilderFactory stepBuilder, Step testConsumersInputFileMergeStep, Partitioner provisioningInputFilePartitioner,
TaskExecutor taskExecutor ){
return stepBuilder.get("testConsumersFilePartitionerStep").partitioner(testConsumersInputFileMergeStep)
.partitioner("testConsumersInputFileMergeStep", provisioningInputFilePartitioner)
.taskExecutor(taskExecutor)
.build();
}
//Job
#Bean
public Job testFileForInputJob(JobBuilderFactory factory, Step testFileForInputStep, Step testConsumersFilePartitionerStep) {
return factory.get("testFileForInputJob").incrementer(new RunIdIncrementer()).start(testConsumersFilePartitionerStep).build();
}
Related
I'm using Spring Batch with Spring Cloud Task for remote partitioning. But for each new Job execution it is created with the same task execution id. Is there any way to create a new task execution id for new job execution?
In the following Task Execution table, each job is running with same parent execution id.
For each new job execution it is starting within the same task execution. The code for Batch configuration is as follows:
#Bean
public PartitionHandler partitionHandler(TaskLauncher taskLauncher, JobExplorer jobExplorer, Environment environment, DelegatingResourceLoader delegatingResourceLoader, TaskRepository taskRepository) {
Resource resource = delegatingResourceLoader.getResource(jarLocation);
DeployerPartitionHandler partitionHandler = new DeployerPartitionHandler(taskLauncher, jobExplorer, resource, "workerStep", taskRepository);
List<String> commandLineArguments = new ArrayList<>(5);
commandLineArguments.add("--spring.profiles.active=worker");
commandLineArguments.add("--spring.cloud.task.initialize.enable=false");
commandLineArguments.add("--spring.batch.initializer.enabled=false");
commandLineArguments.add("--spring.cloud.task.closecontext_enabled=true");
commandLineArguments.add("--logging.level=DEBUG");
partitionHandler.setCommandLineArgsProvider(new PassThroughCommandLineArgsProvider(commandLineArguments));
partitionHandler.setEnvironmentVariablesProvider(new SimpleEnvironmentVariablesProvider(environment));
partitionHandler.setMaxWorkers(2);
partitionHandler.setApplicationName("BatchApplicationWorker");
return partitionHandler;
}
#Bean
#StepScope
public Partitioner partitioner(#Value("#{jobParameters['inputFiles']}") String file, #Value("#{jobParameters['partitionSize']}") String partitionSize1){
int partitionSize = Integer.parseInt(partitionSize1);
return new Partitioner() {
public Map<String, ExecutionContext> partition(int gridSize) {
Map<String, ExecutionContext> partitions = new HashMap<>();
String[] ids = fetchAllPrimaryKeys(file);
List<List<String>> partitionPayloads = splitPayLoad(ids, partitionSize);
int size = partitionPayloads.size();
for(int i = 0 ; i < size ; i++) {
ExecutionContext executionContext = new ExecutionContext();
executionContext.put("partitionNumber", i);
executionContext.put("partitionPayLoad", new ArrayList<>(partitionPayloads.get(i)));
partitions.put("partition" + i, executionContext);
}
return partitions;
}
};
}
#Bean
public Step masterStep(Step workerStep, PartitionHandler partitionHandler) {
return this.stepBuilderFactory.get("masterStep")
.partitioner(workerStep.getName(), partitioner(null, null))
.step(workerStep)
.partitionHandler(partitionHandler)
.build();
}
#Bean
public Step workerStep(CustomWriter customWriter, CustomProcessor customProcessor) {
return this.stepBuilderFactory.get("workerStep")
.<User,User>chunk(10000)
.reader(reader(null))
.processor(customProcessor)
.writer(customWriter)
.build();
}
#Bean
public Job batchJob(Step masterStep, JobExecutionListnerClass jobExecutionListnerClass, JobBuilderFactory jobBuilderFactory) {
return jobBuilderFactory.get("batchJob")
.incrementer(new RunIdIncrementer())
.start(masterStep)
.listener(jobExecutionListnerClass)
.build();
public Long jobRunner(JobParams jobParams) throws BatchException {
Map<String, JobParameter> maps = new HashMap<>();
maps.put(Constants.TIME, new JobParameter(System.currentTimeMillis()));
maps.put(Constants.INPUT_FILES, new JobParameter(jobParams.getInputSource()));
maps.put(Constants.PARTITION_SIZE, new JobParameter(Integer.toString(jobParams.getPartitionSize())));
maps.put(Constants.MAIL_RECIPIENTS, new JobParameter(jobParams.getMailRecipients()));
maps.put(Constants.JOB_NAME, new JobParameter(jobParams.getJobName()));
maps.put(Constants.JOB_DESCRIPTION, new JobParameter(jobParams.getJobDescription()));
maps.put(Constants.JOB_RESTART, new JobParameter(Boolean.toString(jobParams.getRestart())));
JobParameters jobParameters = new JobParameters(maps);
JobExecution jobExecution;
try {
jobExecution = jobLauncher.run(job, jobParameters);
} catch (JobExecutionAlreadyRunningException | JobRestartException | JobInstanceAlreadyCompleteException
| JobParametersInvalidException e) {
throw new BatchException(e.getMessage());
}
return jobExecution.getId();
}
I have developed a Spring Batch Job which read from Kafka topic using KafkaItemReader class. I want to commit the offset only when the messages read in defined chunk are Processed and written successfully to an Output .dat file.
#Bean
public Job kafkaEventReformatjob(
#Qualifier("MaintStep") Step MainStep,
#Qualifier("moveFileToFolder") Step moveFileToFolder,
#Qualifier("compressFile") Step compressFile,
JobExecutionListener listener)
{
return jobBuilderFactory.get("kafkaEventReformatJob")
.listener(listener)
.incrementer(new RunIdIncrementer())
.flow(MainStep)
.next(moveFileToFolder)
.next(compressFile)
.end()
.build();
}
#Bean
Step MainStep(
ItemProcessor<IncomingRecord, List<Record>> flatFileItemProcessor,
ItemWriter<List<Record>> flatFileWriter)
{
return stepBuilderFactory.get("mainStep")
.<InputRecord, List<Record>> chunk(5000)
.reader(kafkaItemReader())
.processor(flatFileItemProcessor)
.writer(writer())
.listener(basicStepListener)
.build();
}
//Reader reads all the messages from akfka topic and sending back in form of IncomingRecord.
#Bean
KafkaItemReader<String, IncomingRecord> kafkaItemReader() {
Properties props = new Properties();
props.putAll(this.properties.buildConsumerProperties());
List<Integer> partitions = new ArrayList<>();
partitions.add(0);
partitions.add(1);
return new KafkaItemReaderBuilder<String, IncomingRecord>()
.partitions(partitions)
.consumerProperties(props)
.name("records")
.saveState(true)
.topic(topic)
.pollTimeout(Duration.ofSeconds(40L))
.build();
}
#Bean
public ItemWriter<List<Record>> writer() {
ListUnpackingItemWriter<Record> listUnpackingItemWriter = new ListUnpackingItemWriter<>();
listUnpackingItemWriter.setDelegate(flatWriter());
return listUnpackingItemWriter;
}
public ItemWriter<Record> flatWriter() {
FlatFileItemWriter<Record> fileWriter = new FlatFileItemWriter<>();
String tempFileName = "abc";
LOGGER.info("Output File name " + tempFileName + " is in working directory ");
String workingDir = service.getWorkingDir().toAbsolutePath().toString();
Path outputFile = Paths.get(workingDir, tempFileName);
fileWriter.setName("fileWriter");
fileWriter.setResource(new FileSystemResource(outputFile.toString()));
fileWriter.setLineAggregator(lineAggregator());
fileWriter.setForceSync(true);
fileWriter.setFooterCallback(customFooterCallback());
fileWriter.close();
LOGGER.info("Successfully created the file writer");
return fileWriter;
}
#StepScope
#Bean
public TransformProcessor processor() {
return new TransformProcessor();
}
==============================================================================
Writer Class
#BeforeStep
public void beforeStep(StepExecution stepExecution) {
this.stepExecution = stepExecution;
}
#AfterStep
public void afterStep(StepExecution stepExecution) {
this.stepExecution.setWriteCount(count);
}
#Override
public void write(final List<? extends List<Record>> lists) throws Exception {
List<Record> consolidatedList = new ArrayList<>();
for (List<Record> list : lists) {
if (!list.isEmpty() && null != list)
consolidatedList.addAll(list);
}
delegate.write(consolidatedList);
count += consolidatedList.size(); // to count Trailer record count
}
===============================================================
Item Processor
#Override
public List process(IncomingRecord record) {
List<Record> recordList = new ArrayList<>();
if (null != record.getEventName() and a few other conditions inside this section) {
// setting values of Record Class by extracting from the IncomingRecord.
recordList.add(the valid records which matching the condition);
}else{
return null;
}
Synchronizing a read operation and a write operation between two transactional resources (a queue and a database for instance)
is possible by using a JTA transaction manager that coordinates both transaction managers (2PC protocol).
However, this approach is not possible if one of the resources is not transactional (like the majority of file systems). So unless you use
a transactional file system and a JTA transaction manager that coordinates a kafka transaction manager and a file system transaction manager..
you need another approach, like the Compensating Transaction pattern. In your case, the "undo" operation (compensating action) would be rewinding the offset where it was before the failed chunk.
Recently started working with Spring Batch chunk based processing. I need to create batch for creating random 3 million strings, 1 million each of different type and count. Like a million strings starting with A, next 1.5 million ending with GH. Cannot do it with for loop as it will block a thread. I have to write them in db also. No idea how to make my ItemReader read each iteration.
I have understood the custom ItemReader through this
but not getting what should be the "item" here.
If i create a chunk of 1000, then how the counters will be handled, counting the chunk entry and the string generated counter.
The ItemReader interface is quite similar to an Iterator and like that has to maintain the iteration state internally. The Spring Batch framework even provides an IteratorItemReader.
Since Java 8 there are Streams, which I find quite versatile also for generating data and which can be converted into an Iterator.
Here is a possible solution along the lines of what you described:
#RunWith(SpringRunner.class)
#SpringBootTest(classes = Java8StreamReaderTest.TestConfig.class, properties = {"spring.batch.job.enabled=false"})
#EnableBatchProcessing
public class Java8StreamReaderTest {
#Configuration
static class TestConfig {
#Bean
JobBuilderFactory jobBuilderFactory(final JobRepository jobRepository) {
return new JobBuilderFactory(jobRepository);
}
#Bean
StepBuilderFactory stepBuilderFactory(final JobRepository jobRepository, final PlatformTransactionManager transactionManager) {
return new StepBuilderFactory(jobRepository, transactionManager);
}
#Bean
Job streamJob() {
return jobBuilderFactory(null).get("streamJob")
.start(stepBuilderFactory(null, null).get("streamStep")
.<String, String>chunk(1000)
.reader(streamReader())
.writer(listWriter())
.build()
)
.build();
}
#Bean
ListItemWriter<String> listWriter() {
return new ListItemWriter<>();
}
#Bean
ItemReader<String> streamReader() {
return new IteratorItemReader<String>(stream().iterator());
}
#Bean
Stream<String> stream() {
return Stream.of(
IntStream.range(0, 100000).boxed().map(i -> {
return new String("A"+ RandomStringUtils.random(10, true, false));
}),
IntStream.range(0, 100000).boxed().map(i -> {
return new String("B"+ RandomStringUtils.random(10, true, false));
}),
IntStream.range(0, 100000).boxed().map(i -> {
return new String(RandomStringUtils.random(10, true, false) + "GH");
})
)
.flatMap(s -> s);
}
}
#Autowired
Job streamJob;
#Autowired
JobLauncher jobLauncher;
#Autowired
ListItemWriter<String> listWriter;
#Test
public void shouldExecuteTestJob() throws Exception {
JobExecution execution = jobLauncher.run(streamJob, new JobParametersBuilder().toJobParameters());
assertEquals(ExitStatus.COMPLETED, execution.getExitStatus());
assertThat(listWriter.getWrittenItems(), hasSize(300000));
}
}
How can i stop spring kafka do not retry not readed messages from topic. For example is i kill application and then restart it my consumer is starting consuming not consumed messages. How can i prevent it?
#Bean
public ConsumerFactory<String, String> manualConsumerFactory() {
Map<String, Object> configs = consumerConfigs();
configs.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
configs.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
return new DefaultKafkaConsumerFactory<>(configs);
}
/**
* Kafka manual ack listener container factory kafka listener container factory.
*
* #return the kafka listener container factory
*/
#Bean
public KafkaListenerContainerFactory<ConcurrentMessageListenerContainer<String, String>> kafkaManualAckListenerContainerFactory() {
ConcurrentKafkaListenerContainerFactory<String, String> factory = new ConcurrentKafkaListenerContainerFactory<>();
factory.setConsumerFactory(manualConsumerFactory());
ContainerProperties props = factory.getContainerProperties();
props.setAckMode(ContainerProperties.AckMode.MANUAL_IMMEDIATE);
return factory;
}
#Override
#EventListener
public void processSettlementFile(final Notification notification) {
LOG.info("Handling message [{}]", notification);
try {
final Map<String, JobParameter> parameters = new HashMap<>();
parameters.put("fileName", new JobParameter("1-101-D-2017-212-volume-per-transaction.csv"));
parameters.put("bucket", new JobParameter("bucket-name-can-be-passed-also-from-kafka-todo"));
final JobParameters jobParameters = new JobParameters(parameters);
final JobExecution execution = jobLauncher.run(succeededTransactionCsvFileToDatabaseJob, jobParameters);
LOG.info("Job Execution Status: " + execution.getStatus());
} catch (JobExecutionAlreadyRunningException | JobRestartException | JobInstanceAlreadyCompleteException | JobParametersInvalidException e) {
LOG.error("Failed to process job..", e);
}
}
#KafkaListener(topics = "topic", groupId = "processor-service", clientIdPrefix = "string", containerFactory = "kafkaManualAckListenerContainerFactory")
public void listenAsString(#Payload final String payload, Acknowledgment acknowledgment, final ConsumerRecord<String, String> consumerRecord) throws TopicEventException {
applicationEventPublisher.publishEvent(object);
acknowledgment.acknowledge();
}
You can add a ConsumerAwareRebalanceListener to the container configuration and call consumer.seekToEnd(partitions) in onPartitionsAssigned().
In the spring batch project, I used JdbcCursorItemReader to read data to process them in parallel. I can run the batch locally without any problem.
I also heard that JdbcPagingItemReader is recommended for parallel processing against JdbcCursorItemReader, as cursor reader will hold the connection too long while paging reader can release connection once the page size is reached.
I then switched to JdbcPagingItemReader in step2, but out of surprise, I got the exception below when running locally.
Caused by: java.sql.SQLTransientConnectionException: HikariPool-1 -
Connection is not available, request timed out after 300001ms.
However, it seems the above exception occurs in step1 before the paging reader in step2 is executed, and that is the only change made. Please shed some light on why the exception is thrown and if it is good practice to use paging reader instead of cursor in parallel processing. Much appreciated your help!
The code snippet is pasted below:
#Bean
#StepScope
public Flow createParallelSubFlow() {
List<Flow> subFlowList = new ArrayList<>();
List<Stream> streamList;
try {
streamList = dataSourceConfig.streamMapper().
getStreamListByStatus(Constants.PENDING_STATUS_CD);
} catch (Exception e) {
}
streamList.forEach(stream -> {
long id = stream.getStreamId();
String flowName = "stream" + id + "_flow";
Flow subFlow = new FlowBuilder<Flow>(flowName)
.start(step1(id))
.next(step2(id))
.end();
subFlowList.add(subFlow);
});
return new FlowBuilder<Flow>("splitFlow").split(new SimpleAsyncTaskExecutor())
.add(subFlowList.toArray(new Flow[0])).build();
}
public Step step1(long id) {
return stepBuilderFactory.get("step1")
.<Domain, Domain>chunk(100)
.reader(reader1(id))
.writer(writer1())
.build();
}
//#StepScope
//#Bean
public Step step2(long id) {
return stepBuilderFactory.get("step2")
.<Domain, Domain>chunk(100)
.reader(cursorReader2(id))
.processor(processor2)
.writer(writer2())
.build();
}
public JdbcCursorItemReader<Domain> cursorReader2(Long id) {
return new JdbcCursorItemReaderBuilder<Domain>()
.dataSource(dataSourceConfig.dataSource())
.name("cursorReader")
.sql(Constants.QUERY_SQL)
.preparedStatementSetter(new PreparedStatementSetter() {
#Override
public void setValues(PreparedStatement ps) throws SQLException {
ps.setLong(1, id);
}})
.rowMapper(new RowMapper())
.build();
}
//Switch from cursorReader2 to pagingReader2 in step2
public JdbcPagingItemReader<Domain> pagingReader2(Long id) {
return new JdbcPagingItemReaderBuilder<Domain>()
.dataSource(dataSourceConfig.dataSource())
.name("pagingReader")
.queryProvider(queryProvider())
.parameterValues(parameterValues(id))
.rowMapper(new RowMapper())
.pageSize(100)
.build();
}
#Bean
public PagingQueryProvider queryProvider() {
SqlPagingQueryProviderFactoryBean providerFactory = new SqlPagingQueryProviderFactoryBean();
Map<String, Order> sortKeys = new HashMap<>(2);
sortKeys.put("ID", Order.ASCENDING);
providerFactory.setDataSource(dataSourceConfig.dataSource());
providerFactory.setSelectClause("SELECT Clause");
providerFactory.setFromClause("FROM Clause");
providerFactory.setWhereClause("WHERE Clause");
providerFactory.setSortKeys(sortKeys);
PagingQueryProvider pagingQueryProvider = null;
try {
pagingQueryProvider = providerFactory.getObject();
} catch (Exception e) {
logger.error("Failed to get PagingQueryProvider", e);
throw new RuntimeException("Failed to get PagingQueryProvider", e);
}
return pagingQueryProvider;
}
private Map<String, Object> parameterValues(Long id) {
Map<String, Object> parameterValues = new HashMap<>();
parameterValues.put("1", id);
return parameterValues;
}