I get an exception when trying to clean up a section of a (text-searchable) PDF. It only happens on some PDFs; the difference I noticed is that the text in those files is slightly skewed.
I create some annotations on the PDF, save it to a file, and then run the clean-up procedure. It works like a charm on most PDFs.
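The clean-up step itself follows the stock iTextSharp 5 clean-up procedure, roughly like the sketch below (a minimal sketch only; the file paths, page number, and rectangle are placeholders, not my real values):

using System.Collections.Generic;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup;

// Sketch: open the annotated PDF, wipe one rectangular region on page 1,
// and write the result to a new file.
PdfReader reader = new PdfReader("annotated.pdf");
using (FileStream output = new FileStream("cleaned.pdf", FileMode.Create))
{
    PdfStamper stamper = new PdfStamper(reader, output);
    IList<PdfCleanUpLocation> locations = new List<PdfCleanUpLocation>
    {
        // page number, region to clean, fill color for the cleaned area
        new PdfCleanUpLocation(1, new Rectangle(97, 405, 480, 445), BaseColor.BLACK)
    };
    new PdfCleanUpProcessor(locations, stamper).CleanUp();
    stamper.Close();
}
reader.Close();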
(itextsharp: 5.5.13.0)
Thanks in advance.
Here is the stack trace:
at System.Drawing.Image.FromStream(Stream stream, Boolean useEmbeddedColorManagement, Boolean validateImageData)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpRenderListener.ProcessImage(Byte[] imageBytes, IList`1 areasToBeCleaned)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpRenderListener.RenderImage(ImageRenderInfo renderInfo)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.ImageXObjectDoHandler.HandleXObject(PdfContentStreamProcessor processor, PdfStream xobjectStream, PdfIndirectReference refi, ICollection markedContentInfoStack)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.DisplayXObject(PdfName xobjectName)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.Do.Invoke(PdfContentStreamProcessor processor, PdfLiteral oper, List`1 operands)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpContentOperator.Invoke(PdfContentStreamProcessor pdfContentStreamProcessor, PdfLiteral oper, List`1 operands)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.InvokeOperator(PdfLiteral oper, List`1 operands)
at iTextSharp.text.pdf.parser.PdfContentStreamProcessor.ProcessContent(Byte[] contentBytes, PdfDictionary resources)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpProcessor.CleanUpPage(Int32 pageNum, IList`1 cleanUpLocations)
at iTextSharp.xtra.iTextSharp.text.pdf.pdfcleanup.PdfCleanUpProcessor.CleanUp()
at AU.PDF.Redaction.PdfProcessor.Redact(String inputFilePath, String outputFilePath, IEnumerable`1 options) in C:\Users\Depen585\Documents\POC\Redaction\AU.PDF.Redaction\PdfProcessor.cs:line 123
at Poc.Program.Main(String[] args) in C:\Users\Depen585\Documents\POC\Redaction\Poc\Program.cs:line 91
UPDATE
Bruno, using your method I get this error now (works with some, fails with others):
at iText.PdfCleanup.PdfCleanUpFilter.FilterImage(ImageRenderInfo image, IList`1 imageAreasToBeCleaned)
at iText.PdfCleanup.PdfCleanUpFilter.FilterImage(FilteredImageKey imageKey)
at iText.PdfCleanup.PdfCleanUpProcessor.CheckIfImageAndClean(IList`1 operands)
at iText.PdfCleanup.PdfCleanUpProcessor.FilterContent(String operator, IList`1 operands)
at iText.PdfCleanup.PdfCleanUpProcessor.InvokeOperator(PdfLiteral operator, IList`1 operands)
at iText.Kernel.Pdf.Canvas.Parser.PdfCanvasProcessor.ProcessContent(Byte[] contentBytes, PdfResources resources)
at iText.PdfCleanup.PdfCleanUpProcessor.ProcessContent(Byte[] contentBytes, PdfResources resources)
at iText.Kernel.Pdf.Canvas.Parser.PdfCanvasProcessor.ProcessPageContent(PdfPage page)
at iText.PdfCleanup.PdfCleanUpProcessor.ProcessPageContent(PdfPage page)
at iText.PdfCleanup.PdfCleanUpTool.CleanUpPage(Int32 pageNumber, IList`1 cleanUpLocations)
at iText.PdfCleanup.PdfCleanUpTool.CleanUp()
at AU.PDF.PdfProcessor.Redact(String inputFilePath, String outputFilePath, String textToRedact) in C:\Users\Depen585\Documents\POC\Redaction\AU.PDF\PdfProcessor.cs:line 38
at Poc.Program.Main(String[] args) in C:\Users\Depen585\Documents\POC\Redaction\Poc\Program.cs:line 91
Here is the code:
public class PdfProcessor
{
public void Initialize(string licenseKeyFilePath)
{
if (string.IsNullOrEmpty(licenseKeyFilePath)) throw new ArgumentNullException(nameof(licenseKeyFilePath));
LicenseKey.LoadLicenseFile(licenseKeyFilePath);
}
public void Redact(string inputFilePath, string outputFilePath, string textToRedact)
{
if (string.IsNullOrEmpty(inputFilePath)) throw new ArgumentNullException(nameof(inputFilePath));
if (string.IsNullOrEmpty(outputFilePath)) throw new ArgumentNullException(nameof(outputFilePath));
if (string.IsNullOrEmpty(textToRedact)) throw new ArgumentNullException(nameof(textToRedact));
//CompositeLocationExtractionStrategy strategy = new CompositeLocationExtractionStrategy();
//strategy.add(new PatternLocationExtractionStrategy(textToRedact).setRedactionColor(Color.PINK));
using (PdfReader reader = new PdfReader(inputFilePath))
using (PdfWriter writer = new PdfWriter(outputFilePath))
using (PdfDocument pdf = new PdfDocument(reader, writer))
{
List<PdfCleanUpLocation> cleanUpLocations = new List<PdfCleanUpLocation>
{
new PdfCleanUpLocation(1, new Rectangle(97, 405, 383, 40), iText.Kernel.Colors.ColorConstants.BLACK)
};
new PdfCleanUpTool(pdf, cleanUpLocations).CleanUp();
//PdfAutoSweep autoSweep = new PdfAutoSweep(composite);
//autoSweep.CleanUp(pdf);
}
}
}
UPDATE 2:
Even with a custom strategy (RegexBasedLocationExtractionStrategy) it fails on the same PDFs. See below...
public class RegexLocationExtractionStrategy : RegexBasedLocationExtractionStrategy, ICleanupStrategy
{
private readonly string Regex;
public RegexLocationExtractionStrategy(string regex) : base(regex)
{
Regex = regex ?? throw new ArgumentNullException(nameof(regex));
}
public Color GetRedactionColor(IPdfTextLocation location)
{
return ColorConstants.BLACK;
}
public ICleanupStrategy Reset()
{
return new RegexLocationExtractionStrategy(Regex);
}
}
public class PdfProcessor
{
public void Initialize(string licenseKeyFilePath)
{
if (string.IsNullOrEmpty(licenseKeyFilePath)) throw new ArgumentNullException(nameof(licenseKeyFilePath));
LicenseKey.LoadLicenseFile(licenseKeyFilePath);
}
public void Redact(string inputFilePath, string outputFilePath, string regex)
{
if (string.IsNullOrEmpty(inputFilePath)) throw new ArgumentNullException(nameof(inputFilePath));
if (string.IsNullOrEmpty(outputFilePath)) throw new ArgumentNullException(nameof(outputFilePath));
if (string.IsNullOrEmpty(regex)) throw new ArgumentNullException(nameof(regex));
using (PdfReader reader = new PdfReader(inputFilePath))
using (PdfWriter writer = new PdfWriter(outputFilePath))
using (PdfDocument pdf = new PdfDocument(reader, writer))
{
var strategy = new RegexLocationExtractionStrategy(regex);
PdfAutoSweep autoSweep = new PdfAutoSweep(strategy);
autoSweep.CleanUp(pdf);
}
}
}
UPDATE 3:
I can see the redaction annotations when I use the tentative clean-up. I then pass the clean-up locations to the redaction routine, and it fails all the same. See below.
public IEnumerable<PdfCleanUpLocation> Annotate(string inputFilePath, string outputFilePath, string regex)
{
if (string.IsNullOrEmpty(inputFilePath)) throw new ArgumentNullException(nameof(inputFilePath));
if (string.IsNullOrEmpty(outputFilePath)) throw new ArgumentNullException(nameof(outputFilePath));
if (string.IsNullOrEmpty(regex)) throw new ArgumentNullException(nameof(regex));
using (PdfReader reader = new PdfReader(inputFilePath))
using (PdfWriter writer = new PdfWriter(outputFilePath))
using (PdfDocument pdf = new PdfDocument(reader, writer))
{
var strategy = new RegexLocationExtractionStrategy(regex);
var autoSweep = new PdfAutoSweep(strategy);
autoSweep.TentativeCleanUp(pdf);
return autoSweep.GetPdfCleanUpLocations(pdf);
}
}
public void Redact(string inputFilePath, string outputFilePath, IEnumerable<PdfCleanUpLocation> locations)
{
if (string.IsNullOrEmpty(inputFilePath)) throw new ArgumentNullException(nameof(inputFilePath));
if (string.IsNullOrEmpty(outputFilePath)) throw new ArgumentNullException(nameof(outputFilePath));
if (locations == null) throw new ArgumentNullException(nameof(locations));
using (PdfReader reader = new PdfReader(inputFilePath))
using (PdfWriter writer = new PdfWriter(outputFilePath))
using (PdfDocument pdf = new PdfDocument(reader, writer))
{
var cleanUpTool = new PdfCleanUpTool(pdf);
locations
.ToList()
.ForEach(location => cleanUpTool.AddCleanupLocation(location));
cleanUpTool.CleanUp();
}
}
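For completeness, the two methods are called roughly like this (a hypothetical driver, assuming Annotate and Redact sit on the same PdfProcessor class as before; paths and the regex pattern are placeholders):

// Hypothetical usage: run the tentative clean-up to collect the locations,
// then feed those locations into the real redaction pass.
var processor = new PdfProcessor();
processor.Initialize(@"C:\licenses\itextkey.xml");  // placeholder license path
IEnumerable<PdfCleanUpLocation> locations =
    processor.Annotate(@"C:\temp\input.pdf", @"C:\temp\annotated.pdf", @"\d{3}-\d{2}-\d{4}");
processor.Redact(@"C:\temp\input.pdf", @"C:\temp\redacted.pdf", locations);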
Related
My parameter (workingDirectory) is always null, and I do not understand why.
I am using the job launcher to kick off the job.
I see the job parameter in the database but it is not getting injected into my reader.
This is my reader.
@Component("customerReader")
@StepScope
public class CustomReader implements ItemReader<Customer> {
@Value("#{jobParameters['workingDirectory']}")
protected String workingDirectory;
private BufferedReader reader;
protected ObjectMapper objectMapper = new ObjectMapper();
private int fileIdx = 1;
@Override
public Customer read() throws IOException {
if(reader == null) {
reader = new BufferedReader(new FileReader(new File(workingDirectory + "output1.json")));
}
String line = reader.readLine();
if (line == null) {
try {
fileIdx++;
reader = new BufferedReader(
new FileReader(new File(workingDirectory + "output" + fileIdx + ".json")));
}
catch (FileNotFoundException ex) {
return null;
}
line = reader.readLine();
}
return objectMapper.readValue(line, Customer.class);
}
}
I am launching by calling:
JobParameters p = new JobParametersBuilder().addString("workingDirectory", workingDirectory).toJobParameters();
JobExecution e = jobLauncher.run(
recordGenerator.getJob(file, "/Users/abc/data/"), p);
I need to analyze the path data of PDF files and manipulate content with iText 7. Manipulations include deletion/replacement and coloring.
I can analyze the graphics alright with something like the following code:
public class ContentParsing {
public static void main(String[] args) throws IOException {
new ContentParsing().inspectPdf("testdata/test.pdf");
}
public void inspectPdf(String path) throws IOException {
File file = new File(path);
PdfDocument pdf = new PdfDocument(new PdfReader(file.getAbsolutePath()));
PdfDocumentContentParser parser = new PdfDocumentContentParser(pdf);
for (int i=1; i<=pdf.getNumberOfPages(); i++) {
parser.processContent(i, new PathEventListener());
}
pdf.close();
}
}
public class PathEventListener implements IEventListener {
public void eventOccurred(IEventData eventData, EventType eventType) {
PathRenderInfo pathRenderInfo = (PathRenderInfo) eventData;
for ( Subpath subpath : pathRenderInfo.getPath().getSubpaths() ) {
for ( IShape segment : subpath.getSegments() ) {
// Here goes some path analysis code
System.out.println(segment.getBasePoints());
}
}
}
public Set<EventType> getSupportedEvents() {
Set<EventType> supportedEvents = new HashSet<EventType>();
supportedEvents.add(EventType.RENDER_PATH);
return supportedEvents;
}
}
Now, what's the way to go with manipulating things and writing them back to the PDF? Do I have to construct an entirely new PDF document and copy everything over (in manipulated form), or can I somehow manipulate the read PDF data directly?
In essence you are looking for a class that does not merely parse a PDF content stream and signal the instructions in it, like the PdfCanvasProcessor does (the PdfDocumentContentParser you use is merely a very thin wrapper around PdfCanvasProcessor), but that also creates the content stream anew from the instructions you forward back to it.
A generic content stream editor class
For iText 5.5.x a proof-of-concept for such a content stream editor class can be found in this answer (the Java version is a bit further down in the answer text).
This is a port of that proof-of-concept to iText 7:
public class PdfCanvasEditor extends PdfCanvasProcessor
{
/**
* This method edits the immediate contents of a page, i.e. its content stream.
* It explicitly does not descend into form xobjects, patterns, or annotations.
*/
public void editPage(PdfDocument pdfDocument, int pageNumber) throws IOException
{
if ((pdfDocument.getReader() == null) || (pdfDocument.getWriter() == null))
{
throw new PdfException("PdfDocument must be opened in stamping mode.");
}
PdfPage page = pdfDocument.getPage(pageNumber);
PdfResources pdfResources = page.getResources();
PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);
editContent(page.getContentBytes(), pdfResources, pdfCanvas);
page.put(PdfName.Contents, pdfCanvas.getContentStream());
}
/**
* This method processes the content bytes and outputs to the given canvas.
* It explicitly does not descend into form xobjects, patterns, or annotations.
*/
public void editContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
{
this.canvas = canvas;
processContent(contentBytes, resources);
this.canvas = null;
}
/**
* <p>
* This method writes content stream operations to the target canvas. The default
* implementation writes them as they come, so it essentially generates identical
* copies of the original instructions the {@link ContentOperatorWrapper} instances
* forward to it.
* </p>
* <p>
* Override this method to achieve some fancy editing effect.
* </p>
*/
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
PdfOutputStream pdfOutputStream = canvas.getContentStream().getOutputStream();
int index = 0;
for (PdfObject object : operands)
{
pdfOutputStream.write(object);
if (operands.size() > ++index)
pdfOutputStream.writeSpace();
else
pdfOutputStream.writeNewLine();
}
}
//
// constructor giving the parent a dummy listener to talk to
//
public PdfCanvasEditor()
{
super(new DummyEventListener());
}
//
// Overrides of PdfContentStreamProcessor methods
//
@Override
public IContentOperator registerContentOperator(String operatorString, IContentOperator operator)
{
ContentOperatorWrapper wrapper = new ContentOperatorWrapper();
wrapper.setOriginalOperator(operator);
IContentOperator formerOperator = super.registerContentOperator(operatorString, wrapper);
return formerOperator instanceof ContentOperatorWrapper ? ((ContentOperatorWrapper)formerOperator).getOriginalOperator() : formerOperator;
}
//
// members holding the output canvas and the resources
//
protected PdfCanvas canvas = null;
//
// A content operator class to wrap all content operators to forward the invocation to the editor
//
class ContentOperatorWrapper implements IContentOperator
{
public IContentOperator getOriginalOperator()
{
return originalOperator;
}
public void setOriginalOperator(IContentOperator originalOperator)
{
this.originalOperator = originalOperator;
}
@Override
public void invoke(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
if (originalOperator != null && !"Do".equals(operator.toString()))
{
originalOperator.invoke(processor, operator, operands);
}
write(processor, operator, operands);
}
private IContentOperator originalOperator = null;
}
//
// A dummy event listener to give to the underlying canvas processor to feed events to
//
static class DummyEventListener implements IEventListener
{
@Override
public void eventOccurred(IEventData data, EventType type)
{ }
@Override
public Set<EventType> getSupportedEvents()
{
return null;
}
}
}
(PdfCanvasEditor.java)
The explanations from the iText 5 answer still apply; the parsing framework has not changed much from iText 5.5.x to iText 7.0.x.
Usage examples
Unfortunately you wrote in very vague terms about how exactly you want to change the contents. Thus I simply ported some iText 5 samples which made use of the original iText 5 content stream editor class:
Watermark removal
These are ports of the use cases in this answer.
testRemoveBoldMTTextDocument
This example drops all text written in a font whose name ends with "BoldMT":
try ( InputStream resource = getClass().getResourceAsStream("document.pdf");
PdfReader pdfReader = new PdfReader(resource);
OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-noBoldMTText.pdf"));
PdfWriter pdfWriter = new PdfWriter(result);
PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
{
PdfCanvasEditor editor = new PdfCanvasEditor()
{
@Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
if (getGraphicsState().getFont().getFontProgram().getFontNames().getFontName().endsWith("BoldMT"))
return;
}
super.write(processor, operator, operands);
}
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
editor.editPage(pdfDocument, i);
}
}
(EditPageContent.java test method testRemoveBoldMTTextDocument)
testRemoveBigTextDocument
This example drops all text written with a large font size:
try ( InputStream resource = getClass().getResourceAsStream("document.pdf");
PdfReader pdfReader = new PdfReader(resource);
OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-noBigText.pdf"));
PdfWriter pdfWriter = new PdfWriter(result);
PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
{
PdfCanvasEditor editor = new PdfCanvasEditor()
{
@Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
if (getGraphicsState().getFontSize() > 100)
return;
}
super.write(processor, operator, operands);
}
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
editor.editPage(pdfDocument, i);
}
}
(EditPageContent.java test method testRemoveBigTextDocument)
Text color change
This is a port of the use case in this answer.
testChangeBlackTextToGreenDocument
This example changes the color of black text to green:
try ( InputStream resource = getClass().getResourceAsStream("document.pdf");
PdfReader pdfReader = new PdfReader(resource);
OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-blackTextToGreen.pdf"));
PdfWriter pdfWriter = new PdfWriter(result);
PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
{
PdfCanvasEditor editor = new PdfCanvasEditor()
{
@Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
if (currentlyReplacedBlack == null)
{
Color currentFillColor = getGraphicsState().getFillColor();
if (Color.BLACK.equals(currentFillColor))
{
currentlyReplacedBlack = currentFillColor;
super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(1), new PdfNumber(0), new PdfLiteral("rg")));
}
}
}
else if (currentlyReplacedBlack != null)
{
if (currentlyReplacedBlack instanceof DeviceCmyk)
{
super.write(processor, new PdfLiteral("k"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfNumber(1), new PdfLiteral("k")));
}
else if (currentlyReplacedBlack instanceof DeviceGray)
{
super.write(processor, new PdfLiteral("g"), Arrays.asList(new PdfNumber(0), new PdfLiteral("g")));
}
else
{
super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfLiteral("rg")));
}
currentlyReplacedBlack = null;
}
super.write(processor, operator, operands);
}
Color currentlyReplacedBlack = null;
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
editor.editPage(pdfDocument, i);
}
}
(EditPageContent.java test method testChangeBlackTextToGreenDocument)
I have each record spread across multiple lines in the input file (a very large file).
Ex:
Id: 2
ASIN: 0738700123
title: Test tile for this product
group: Book
salesrank: 168501
similar: 5 0738700811 1567184912 1567182813 0738700514 0738700915
categories: 2
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Wicca[12484]
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Witchcraft[12486]
reviews: total: 12 downloaded: 12 avg rating: 4.5
2001-12-16 cutomer: A11NCO6YTE4BTJ rating: 5 votes: 5 helpful: 4
2002-1-7 cutomer: A9CQ3PLRNIR83 rating: 4 votes: 5 helpful: 5
How can I identify and process each multi-line record in Spark?
If the multi-line data has a defined record separator, you could use the Hadoop support for multi-line records, providing the separator through a Hadoop Configuration object:
Something like this should do:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
val conf = new Configuration
// the delimiter must match the record prefix exactly (it is case-sensitive) -- "Id:" in the sample above
conf.set("textinputformat.record.delimiter", "Id:")
val dataset = sc.newAPIHadoopFile("/path/to/data", classOf[TextInputFormat], classOf[LongWritable], classOf[Text], conf)
val data = dataset.map(x=>x._2.toString)
This will provide you with an RDD[String] where each element corresponds to a record. Afterwards you need to parse each record following your application requirements.
I have done this by implementing a custom input format and record reader.
public class ParagraphInputFormat extends TextInputFormat {
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) {
return new ParagraphRecordReader();
}
}
public class ParagraphRecordReader extends RecordReader<LongWritable, Text> {
private long end;
private boolean stillInChunk = true;
private LongWritable key = new LongWritable();
private Text value = new Text();
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private byte[] endTag = "\n\r\n".getBytes();
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
FileSplit split = (FileSplit) inputSplit;
Configuration conf = taskAttemptContext.getConfiguration();
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
fsin = fs.open(path);
long start = split.getStart();
end = split.getStart() + split.getLength();
fsin.seek(start);
if (start != 0) {
readUntilMatch(endTag, false);
}
}
public boolean nextKeyValue() throws IOException {
if (!stillInChunk) return false;
boolean status = readUntilMatch(endTag, true);
value = new Text();
value.set(buffer.getData(), 0, buffer.getLength());
key = new LongWritable(fsin.getPos());
buffer.reset();
if (!status) {
stillInChunk = false;
}
return true;
}
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return key;
}
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
public float getProgress() throws IOException, InterruptedException {
return 0;
}
public void close() throws IOException {
fsin.close();
}
private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
if (b == -1) return false;
if (withinBlock) buffer.write(b);
if (b == match[i]) {
i++;
if (i >= match.length) {
return fsin.getPos() < end;
}
} else i = 0;
}
}
}
endTag identifies the end of each record.
I have a CAS consumer AE which is expected to iterate over the CAS objects in a pipeline, serialize them, and add the serialized CASes to an XML file.
public class DataWriter extends JCasConsumer_ImplBase {
private File outputDirectory;
public static final String PARAM_OUTPUT_DIRECTORY = "outputDir";
@ConfigurationParameter(name=PARAM_OUTPUT_DIRECTORY, defaultValue=".")
private String outputDir;
CasToInlineXml cas2xml;
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
ConfigurationParameterInitializer.initialize(this, context);
outputDirectory = new File(outputDir);
if (!outputDirectory.exists()) {
outputDirectory.mkdirs();
}
}
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
String file = fileCollectionReader.fileName;
File outFile = new File(outputDirectory, file + ".xmi");
FileOutputStream out = null;
try {
out = new FileOutputStream(outFile);
String xmlAnnotations = cas2xml.generateXML(jCas.getCas());
out.write(xmlAnnotations.getBytes("UTF-8"));
/* XmiCasSerializer ser = new XmiCasSerializer(jCas.getCas().getTypeSystem());
XMLSerializer xmlSer = new XMLSerializer(out, false);
ser.serialize(jCas.getCas(), xmlSer.getContentHandler());*/
if (out != null) {
out.close();
}
}
catch (IOException e) {
throw new AnalysisEngineProcessException(e);
}
catch (CASException e) {
throw new AnalysisEngineProcessException(e);
}
}
}
I am using it in a pipeline after all my annotators, but it cannot read the CAS objects (I am getting a NullPointerException at jCas.getCas()). It looks like I don't understand the proper usage of a CAS consumer. I'd appreciate any suggestions.
The following extension works with EF6 alpha2 but stopped working with alpha3, throwing a NullReferenceException. The failing statement is EdmxWriter.WriteEdmx(..).
The view pre-generation is performed on a code-first context.
How can I pre-generate views using EF6 alpha3?
public static PreGeneratedViews PreGenerateViews<T>(this T dbContext) where T : DbContext
{
Trace.TraceInformation("PreGenerating views");
//define ef collections
EdmItemCollection edmItemCollection = null;
StoreItemCollection storeItemCollection = null;
StorageMappingItemCollection mappingItemCollection = null;
//get ef collections
GetItemCollections(
GetEdmx(dbContext),
out edmItemCollection,
out storeItemCollection,
out mappingItemCollection);
IList<EdmSchemaError> errors = null;
//get the generated views
Dictionary<string, string> extentViews = GetExtentViews(mappingItemCollection, out errors);
//return the pregenerated views as string (xml document)
return new PreGeneratedViews
{
EdmEntityContainerName = edmItemCollection.GetItems<EntityContainer>().Single().Name,
StoreEntityContainerName = storeItemCollection.GetItems<EntityContainer>().Single().Name,
HashOverMappingClosure =
ReflectionHelper.GetMappingClosureHash(edmItemCollection.EdmVersion,
mappingItemCollection),
HashOverAllExtentViews =
ReflectionHelper.GenerateHashForAllExtentViewsContent(edmItemCollection.EdmVersion,
extentViews),
ViewCount = extentViews.Count,
Views = CreateViews(extentViews),
ViewsEmbeddedResourceName =
string.Format("DbContextViews{0}.xml", Guid.NewGuid().ToString("N")),
};
}
private static XDocument GetEdmx(DbContext dbContext)
{
var ms = new MemoryStream();
using (XmlWriter writer = XmlWriter.Create(ms))
{
EdmxWriter.WriteEdmx(dbContext, writer);
}
ms.Position = 0;
return XDocument.Load(ms);
}
private static void SplitEdmx(XDocument edmx, out XmlReader csdlReader, out XmlReader ssdlReader,
out XmlReader mslReader)
{
// xml namespace agnostic to make it work with any version of Entity Framework
XNamespace edmxNs = edmx.Root.Name.Namespace;
XElement storageModels = edmx.Descendants(edmxNs + "StorageModels").Single();
XElement conceptualModels = edmx.Descendants(edmxNs + "ConceptualModels").Single();
XElement mappings = edmx.Descendants(edmxNs + "Mappings").Single();
ssdlReader = storageModels.Elements().Single(e => e.Name.LocalName == "Schema").CreateReader();
csdlReader = conceptualModels.Elements().Single(e => e.Name.LocalName == "Schema").CreateReader();
mslReader = mappings.Elements().Single(e => e.Name.LocalName == "Mapping").CreateReader();
}
private static void GetItemCollections(XDocument edmx, out EdmItemCollection edmItemCollection,
out StoreItemCollection storeItemCollection,
out StorageMappingItemCollection mappingItemCollection)
{
// extract csdl, ssdl and msl artifacts from the Edmx
XmlReader csdlReader, ssdlReader, mslReader;
SplitEdmx(edmx, out csdlReader, out ssdlReader, out mslReader);
// Initialize item collections
edmItemCollection = new EdmItemCollection(new[] {csdlReader});
storeItemCollection = new StoreItemCollection(new[] {ssdlReader});
mappingItemCollection = new StorageMappingItemCollection(edmItemCollection, storeItemCollection,
new[] {mslReader});
}
private static Dictionary<string, string> GetExtentViews(StorageMappingItemCollection mappingItemCollection,
out IList<EdmSchemaError> errors)
{
Dictionary<EntitySetBase, string> views = ReflectionHelper.GenerateViews(mappingItemCollection, out errors);
if (errors != null && errors.Any())
{
return null;
}
var extentViews = new Dictionary<string, string>(views.Count);
foreach (var kvp in views)
{
extentViews.Add(
GetExtentFullName(kvp.Key),
kvp.Value.Replace("\r\n", "\n")); // replace accounts for Xml new line normalization
}
return extentViews;
}
private static string GetExtentFullName(EntitySetBase entitySet)
{
return string.Format("{0}.{1}", entitySet.EntityContainer.Name, entitySet.Name);
}
private static string CreateViews(Dictionary<string, string> extentViews)
{
var sb = new StringBuilder();
//var embeddedViewsFileName = Path.ChangeExtension(Host.TemplateFile, "xml");
using (XmlWriter writer = XmlWriter.Create(sb, new XmlWriterSettings
{
Indent = true,
Encoding = Encoding.UTF8
}))
{
writer.WriteStartElement("views");
foreach (var kvp in extentViews)
{
writer.WriteStartElement("view");
writer.WriteAttributeString("extent", kvp.Key);
writer.WriteCData(kvp.Value);
writer.WriteEndElement();
}
writer.WriteEndElement();
}
return sb.ToString();
}
#region Nested type: ReflectionHelper
private static class ReflectionHelper
{
private static readonly Assembly efAssembly = typeof (StorageMappingItemCollection).Assembly;
private static readonly MethodInfo generateViewsMethodInfo =
typeof (StorageMappingItemCollection).GetMethod("GenerateEntitySetViews",
BindingFlags.NonPublic | BindingFlags.Instance);
private static readonly MethodInfo getMappingClosureHashMethodInfo =
efAssembly.GetType("System.Data.Entity.Core.Mapping.MetadataMappingHasherVisitor", true)
.GetMethod("GetMappingClosureHash", BindingFlags.Static | BindingFlags.NonPublic);
private static readonly MethodInfo generateHashForAllExtentViewsContentMethodInfo =
efAssembly.GetType("System.Data.Entity.Core.Common.Utils.MetadataHelper", true)
.GetMethod("GenerateHashForAllExtentViewsContent", BindingFlags.Static | BindingFlags.NonPublic);
public static Dictionary<EntitySetBase, string> GenerateViews(
StorageMappingItemCollection mappingItemCollection, out IList<EdmSchemaError> errors)
{
errors = null;
return
(Dictionary<EntitySetBase, string>)
generateViewsMethodInfo.Invoke(mappingItemCollection, new object[] {errors});
}
public static string GetMappingClosureHash(double schemaVersion,
StorageMappingItemCollection mappingItemCollection)
{
return (string) getMappingClosureHashMethodInfo.Invoke(
null,
new object[]
{
schemaVersion,
// CodeFirst currently creates always one entity container
mappingItemCollection.GetItems<GlobalItem>().Single(
i => i.GetType().Name == "StorageEntityContainerMapping")
});
}
public static string GenerateHashForAllExtentViewsContent(double schemaVersion,
Dictionary<string, string> extentViews)
{
return (string) generateHashForAllExtentViewsContentMethodInfo.Invoke(
null,
new object[] {schemaVersion, extentViews});
}
}
The stack trace is the following:
NullReferenceException
at: System.Data.Entity.Edm.Serialization.EdmSerializationVisitor.VisitEdmAssociationSet(AssociationSet item)
at: System.Data.Entity.Edm.EdmModelVisitor.VisitCollection[T](IEnumerable`1 collection, Action`1 visitMethod)
at: System.Data.Entity.Edm.EdmModelVisitor.VisitAssociationSets(EntityContainer container, IEnumerable`1 associationSets)
at: System.Data.Entity.Edm.EdmModelVisitor.VisitEdmEntityContainer(EntityContainer item)
at: System.Data.Entity.Edm.Serialization.EdmSerializationVisitor.VisitEdmEntityContainer(EntityContainer item)
at: System.Data.Entity.Edm.EdmModelVisitor.VisitCollection[T](IEnumerable`1 collection, Action`1 visitMethod)
at: System.Data.Entity.Edm.EdmModelVisitor.VisitEntityContainers(IEnumerable`1 entityContainers)
at: System.Data.Entity.Edm.EdmModelVisitor.VisitEdmModel(EdmModel item)
at: System.Data.Entity.Edm.Serialization.EdmSerializationVisitor.Visit(EdmModel edmModel, String namespaceName, String provider, String providerManifestToken)
at: System.Data.Entity.Edm.Serialization.EdmSerializationVisitor.Visit(EdmModel edmModel, String provider, String providerManifestToken)
at: System.Data.Entity.Edm.Serialization.SsdlSerializer.Serialize(EdmModel dbDatabase, String provider, String providerManifestToken, XmlWriter xmlWriter, Boolean serializeDefaultNullability)
at: System.Data.Entity.ModelConfiguration.Edm.Serialization.EdmxSerializer.WriteEdmxRuntime()
at: System.Data.Entity.ModelConfiguration.Edm.Serialization.EdmxSerializer.Serialize(DbDatabaseMapping databaseMapping, DbProviderInfo providerInfo, XmlWriter xmlWriter)
at: System.Data.Entity.Infrastructure.EdmxWriter.WriteEdmx(DbModel model, XmlWriter writer)
at: System.Data.Entity.Infrastructure.EdmxWriter.WriteEdmx(DbContext context, XmlWriter writer)
at: **.DbContextPreGenerateViewExtension.GetEdmx(DbContext dbContext)
Thanks!
I just submitted a fix for work item 867 (35852e8392ad). It should fix the NRE. The fix should be included in today's nightly build. Can you give it a try and let me know whether it fixes the problem?
This is a known bug in Alpha 3.