Issue in converting HTML to PDF containing <pre> tag with Flying Saucer and ITEXT - itext

I am using the Flying Saucer library to convert HTML to PDF. It works fine for most HTML files.
But for some HTML files that contain other tags inside a <pre> tag, the generated PDF displays those tags as literal text.
If I remove the <pre> tags, the formatting of the data is lost.
My code is
// Parses an HTML file with jsoup, sanitizes it, runs it through JTidy and
// post-processes the resulting W3C DOM (drops <head>, rewrites "cid:" image sources).
// Result holder; remains null if the try block fails before the parseDOM assignment.
org.w3c.dom.Document document = null;
try {
// Parse the input file as UTF-8 with an empty base URI.
Document doc = Jsoup.parse(new File(htmlFile), "UTF-8", "");
// Sanitize the parsed document against the custom whitelist.
Whitelist wl = new RelaxedPlusDataBase64Images();
Cleaner cleaner = new Cleaner(wl);
doc = cleaner.clean(doc);
// Configure JTidy: UTF-8 in and out, XHTML output, warnings suppressed.
Tidy tidy = new Tidy();
tidy.setShowWarnings(false);
tidy.setXmlTags(false);
tidy.setInputEncoding("UTF-8");
tidy.setOutputEncoding("UTF-8");
tidy.setPrintBodyOnly(true);
tidy.setXHTML(true);
tidy.setMakeClean(true);
tidy.setAsciiChars(true);
// If the HTML inside the selected <pre> elements contains a closing tag ("</"),
// unwrap the <pre> selection so that only its children remain.
// This is the step that loses the preformatted layout described in the question.
if (doc.select("pre").html().contains("</")) {
doc.select("pre").unwrap();
}
// Re-serialize the jsoup document and let JTidy build the W3C DOM from it.
Reader reader = new StringReader(doc.html());
document = (tidy.parseDOM(reader, null));
// Remove the first <head> element from the DOM.
// NOTE(review): assumes a <head> element exists; item(0) would be null otherwise - confirm.
Element element = (Element) document.getElementsByTagName("head").item(0);
element.getParentNode().removeChild(element);
// Rewrite image sources of the form "cid:<id>#<suffix>" to just "<id>".
NodeList elements = document.getElementsByTagName("img");
for (int i = 0; i < elements.getLength(); i++) {
// NOTE(review): assumes every <img> carries a src attribute; getNamedItem would return null otherwise - confirm.
String value = elements.item(i).getAttributes().getNamedItem("src").getNodeValue();
if (value != null && value.startsWith("cid:") && value.contains("#")) {
// Keep only the text between the "cid:" prefix and the first '#'.
value = value.substring(value.indexOf("cid:") + 4, value.indexOf("#"));
elements.item(i).getAttributes().getNamedItem("src").setNodeValue(value);
System.out.println(value);
}
}
document.normalize();
// Debug output of the final DOM as indented XML.
System.out.println(getNiceLyFormattedXMLDocument(document));
} catch (Exception e) {
// Any failure is only printed; no exception propagates from this block.
System.out.println(e);
}
Method to create PDF is :
// Renders the cleaned HTML document to "test.pdf" with Flying Saucer's ITextRenderer.
try {
    org.w3c.dom.Document doc = CleanHtml.cleanNTidyHTML("b.html");
    ITextRenderer renderer = new ITextRenderer();
    renderer.setDocument(doc, null);
    // Character.valueOf avoids the deprecated Character constructor; the value is unchanged.
    renderer.setPDFVersion(Character.valueOf('7'));
    String outputFile = "test.pdf";
    OutputStream os = new FileOutputStream(outputFile);
    try {
        renderer.layout();
        renderer.createPDF(os);
        os.flush();
    } finally {
        // Close the file even when layout or PDF creation fails, so the handle is not leaked.
        os.close();
    }
} catch (Exception e) {
    e.printStackTrace();
}
By using itext XMLWorker :
// Renders the cleaned HTML document to "test.pdf" with iText's XMLWorker.
try {
    org.w3c.dom.Document doc = CleanHtml.cleanNTidyHTML("a.html");
    String k = CleanHtml.getNiceLyFormattedXMLDocument(doc);
    OutputStream file = new FileOutputStream(new File("test.pdf"));
    try {
        Document document = new Document();
        PdfWriter writer = PdfWriter.getInstance(document, file);
        document.open();
        // Hand the markup to the parser as characters. Going through k.getBytes()
        // would encode with the platform default charset and could corrupt
        // non-ASCII text on systems whose default is not UTF-8.
        XMLWorkerHelper.getInstance().parseXHtml(writer, document, new java.io.StringReader(k));
        document.close();
    } finally {
        // Release the file handle even when parsing fails.
        file.close();
    }
} catch (Exception e) {
    e.printStackTrace();
}
/**
 * Serializes a DOM document to an indented XML string.
 *
 * <p>The XML declaration is omitted, the output encoding property is set to
 * UTF-8 and an indent amount of 4 is requested from the transformer.
 *
 * @param doc the DOM document to serialize
 * @return the serialized markup
 * @throws IOException declared for callers; kept for interface compatibility
 * @throws TransformerException if the transformer cannot be created or the transformation fails
 */
public static String getNiceLyFormattedXMLDocument(org.w3c.dom.Document doc) throws IOException, TransformerException {
    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");

    // Collect the serialized markup in memory and hand it back as a string.
    StringWriter buffer = new StringWriter();
    serializer.transform(new DOMSource(doc), new StreamResult(buffer));
    return buffer.toString();
}

Related

iText 5 Add a text field under a paragraph

I'm using iText to create a PDF in my Java Spring MVC web project. I'm new to iText 5. Could anyone tell me how to add a text field under a paragraph? With my current code, the text field is not displayed under the paragraph. I may add more questions to the PDF, so I want each text field to be displayed under its paragraph.
// Builds a PDF in memory with a heading plus one paragraph and one text field
// per "Text Field" question, then streams it to the HTTP response as a download.
// Returns true on success and false if any exception occurs.
try {
List<FormQuestionBean> questions = formBuilderBean.getQuestions();
MyHeaderAndFooter event = new MyHeaderAndFooter();
Document pdfDocument = new Document(PageSize.A4);
pdfDocument.setMargins(20, 20, 20, 30);
// The PDF is written to an in-memory buffer so its size is known before sending.
ByteArrayOutputStream pdfBaos = new ByteArrayOutputStream();
PdfWriter writer = PdfWriter.getInstance(pdfDocument, pdfBaos);
writer.setPageEvent(event);
pdfDocument.open();
// Heading: form name label followed by the form's name.
Paragraph paragraph1 = new Paragraph(FORM_NAME + " : " + formBuilderBean.getName(), HEADING);
paragraph1.setSpacingAfter(20);
pdfDocument.add(paragraph1);
for (FormQuestionBean formQuestionBean: questions) {
if (formQuestionBean.getFieldImplementation().contentEquals("Text Field")) {
Paragraph p = new Paragraph(formQuestionBean.getName());
pdfDocument.add(p);
// The field rectangle is a fixed set of coordinates and the field name is the
// constant "text", so every iteration adds a field at the same position,
// regardless of where the paragraph above was laid out.
TextField text = new TextField(writer, new Rectangle(36, 788, 559, 806), "text" );
text.setBackgroundColor(BaseColor.WHITE);
PdfFormField field = text.getTextField();
// Added as an annotation through the writer rather than through the document flow.
writer.addAnnotation(field);
}
}
pdfDocument.close();
// Send the finished buffer as an attachment download.
response.reset();
response.setHeader("Content-Length", String.valueOf(pdfBaos.size()));
response.setContentType("application/pdf");
response.setContentLength(pdfBaos.size());
String downloadName = getFormFileName(".pdf");
response.setHeader("Content-Disposition", "attachment; filename=\"" + downloadName + "\"");
OutputStream outputStream = response.getOutputStream();
pdfBaos.writeTo(outputStream);
outputStream.flush();
outputStream.close();
return true;
} catch (Exception e) {
// Failures are logged and printed, then reported to the caller as false.
logger.error(e);
e.printStackTrace();
return false;
}
I figured it out using a table and cells. Below is the code:
// Builds the form PDF in memory and streams it to the HTTP response.
// Each "Text Field" question becomes a one-column table: a label cell followed
// by an empty cell whose cell event positions the text field inside it.
// Returns true on success and false if any exception occurs.
try {
    List<FormQuestionBean> questionList = formBuilderBean.getQuestions();
    MyHeaderAndFooter headerFooter = new MyHeaderAndFooter();
    Document pdf = new Document(PageSize.A4);
    pdf.setMargins(20, 20, 20, 30);
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    PdfWriter pdfWriter = PdfWriter.getInstance(pdf, buffer);
    pdfWriter.setPageEvent(headerFooter);
    pdf.open();

    // Heading with the form name.
    Paragraph heading = new Paragraph(FORM_NAME + " : " + formBuilderBean.getName(), HEADING);
    heading.setSpacingAfter(20);
    pdf.add(heading);

    for (FormQuestionBean question : questionList) {
        // Only text-field questions are rendered.
        if (!question.getFieldImplementation().contentEquals("Text Field")) {
            continue;
        }
        PdfPTable layout = new PdfPTable(1);
        layout.getDefaultCell().setBorder(Rectangle.NO_BORDER);

        // First row: the question text.
        PdfPCell labelCell = new PdfPCell(new Phrase(question.getName()));
        labelCell.setFixedHeight(30);
        layout.addCell(labelCell);

        // Second row: an empty cell that hosts the form field via a cell event.
        PdfPCell fieldCell = new PdfPCell();
        TextField input = new TextField(pdfWriter, fieldCell, "text" );
        input.setBackgroundColor(BaseColor.WHITE);
        FieldPositioningEvents positioner = new FieldPositioningEvents(pdfWriter, input.getTextField());
        fieldCell.setCellEvent(positioner);
        fieldCell.setFixedHeight(30);
        layout.addCell(fieldCell);

        pdf.add(layout);
    }
    pdf.close();

    // Send the buffered PDF as an attachment download.
    response.reset();
    response.setHeader("Content-Length", String.valueOf(buffer.size()));
    response.setContentType("application/pdf");
    response.setContentLength(buffer.size());
    response.setHeader("Content-Disposition", "attachment; filename=\"" + getFormFileName(".pdf") + "\"");
    OutputStream responseStream = response.getOutputStream();
    buffer.writeTo(responseStream);
    responseStream.flush();
    responseStream.close();
    return true;
} catch (Exception e) {
    // Failures are logged and printed, then reported to the caller as false.
    logger.error(e);
    e.printStackTrace();
    return false;
}

How do I merge two PDF files with attachments using itext?

I am trying to merge two PDF files (file1.pdf and file2.pdf) into a single file, file3.pdf. One of the source files, file2.pdf, has a few attachments.
Using the PdfCopyFields addDocument method does not carry the attachments of the source PDF files over to the destination PDF file. How do I achieve this?
Extracting the document-level attachments from the source files using PdfDictionary and adding them to the destination file using the PdfWriter addFileAttachment method works.
Can you please let me know whether there is a more efficient way to include the attachments from the source PDF files in the destination PDF file after merging?
This is the sample code that I am using to replicate the scenario.
/**
 * Sample that extracts the document-level attachments of a PDF to
 * {@code C:\temp} and then embeds those files into a stamped copy of a PDF.
 */
public class TestItext
{
    /**
     * Paths of the attachment files to embed. Replaced with an exactly sized
     * array by {@link #extractDocLevelAttachments(String)}; {@code null}
     * entries are ignored by {@link #addAttachments(String, String)}.
     */
    public String[] attachments;

    public TestItext()
    {
        attachments = new String[2];
    }

    public static void main(String[] args)
    {
        try
        {
            TestItext obj = new TestItext();
            obj.extractDocLevelAttachments("C:\\source.pdf");
            obj.addAttachments("C:\\source.pdf","C:\\temp\\dest.pdf");
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Writes every embedded file listed in the catalog's
     * /Names /EmbeddedFiles /Names array to {@code C:\temp} and records the
     * resulting paths in {@link #attachments}.
     *
     * <p>A document without embedded files yields an empty array. Only a flat
     * /Names array is read; NOTE(review): name trees split into /Kids are not
     * traversed here - confirm whether that case matters for your inputs.
     *
     * @param filename path of the PDF to read
     * @throws IOException if the PDF cannot be read or a file cannot be written
     */
    public void extractDocLevelAttachments(String filename) throws IOException
    {
        // Collected in a list so any number of attachments fits.
        java.util.List<String> extracted = new java.util.ArrayList<String>();
        PdfReader reader = new PdfReader(filename);
        try {
            PdfDictionary root = reader.getCatalog();
            PdfDictionary documentnames = root.getAsDict(PdfName.NAMES);
            PdfDictionary embeddedfiles =
                documentnames == null ? null : documentnames.getAsDict(PdfName.EMBEDDEDFILES);
            PdfArray filespecs =
                embeddedfiles == null ? null : embeddedfiles.getAsArray(PdfName.NAMES);
            if (filespecs != null) {
                // The array alternates [name, filespec, name, filespec, ...].
                for (int i = 0; i + 1 < filespecs.size(); i += 2) {
                    PdfDictionary filespec = filespecs.getAsDict(i + 1);
                    PdfDictionary refs = filespec == null ? null : filespec.getAsDict(PdfName.EF);
                    if (refs == null) {
                        continue;
                    }
                    for (Object key : refs.getKeys()) {
                        extractOne(filespec, refs, (PdfName) key, extracted);
                    }
                }
            }
        } finally {
            reader.close();
        }
        attachments = extracted.toArray(new String[extracted.size()]);
    }

    /** Writes one embedded stream to disk and records its path once. */
    private void extractOne(PdfDictionary filespec, PdfDictionary refs, PdfName key,
            java.util.List<String> extracted) throws IOException
    {
        PdfString name = filespec.getAsString(key);
        PRStream stream = (PRStream) PdfReader.getPdfObject(refs.getAsIndirectObject(key));
        if (name == null || stream == null) {
            return;
        }
        // The name comes from the PDF, so strip any directory part before using it as a path.
        // It is concatenated directly: passing it to String.format would treat '%' as a format specifier.
        String path = "C:\\temp\\" + new File(name.toString()).getName();
        if (extracted.contains(path)) {
            // Several /EF keys (for example F and UF) commonly name the same file; extract it once.
            return;
        }
        FileOutputStream fos = new FileOutputStream(path);
        try {
            fos.write(PdfReader.getStreamBytes(stream));
            fos.flush();
        } finally {
            fos.close();
        }
        extracted.add(path);
    }

    /**
     * Copies {@code src} to {@code dest} and embeds every file listed in
     * {@link #attachments} as a document-level attachment.
     *
     * @param src  path of the source PDF
     * @param dest path of the PDF to create
     * @throws IOException       if reading or writing fails
     * @throws DocumentException if the stamper cannot be created or closed
     */
    public void addAttachments(String src, String dest) throws IOException, DocumentException
    {
        PdfReader reader = new PdfReader(src);
        try {
            FileOutputStream out = new FileOutputStream(dest);
            try {
                PdfStamper stamper = new PdfStamper(reader, out);
                for (int i = 0; i < attachments.length; i++) {
                    if (attachments[i] != null) {
                        addAttachment(stamper.getWriter(), new File(attachments[i]));
                    }
                }
                stamper.close();
            } finally {
                // Releases the file handle when an attachment fails before stamper.close().
                out.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Embeds one file; the attachment description is the file name up to its
     * first dot, or the whole name when there is no usable dot.
     */
    protected void addAttachment(PdfWriter writer, File src) throws IOException {
        PdfFileSpecification fs =
            PdfFileSpecification.fileEmbedded(writer, src.getAbsolutePath(), src.getName(), null);
        String fileName = src.getName();
        int dot = fileName.indexOf('.');
        String description = dot > 0 ? fileName.substring(0, dot) : fileName;
        writer.addFileAttachment(description, fs);
    }
}

How use itext to change existing pdf pagesize?

I have a pdf, but beyond the current page, there is content that is not being displayed. I want to change the pagesize so that all of the content can be displayed. Is there a way to do this with itext?
/**
 * Enlarges the MediaBox of every page of a PDF and writes the result to a new file.
 *
 * <p>Each page's MediaBox is grown by {@code horizontal} on the left and right
 * and by {@code vertical} at the bottom and top. Pages without a four-element
 * MediaBox array are left unchanged.
 *
 * @param inpdf      path of the PDF to read
 * @param outpdf     path of the PDF to write
 * @param vertical   amount added below and above each page
 * @param horizontal amount added to the left and right of each page
 * @return a reader opened on {@code outpdf}, or {@code null} if an error occurred
 *         (the error is printed to standard error)
 */
public PdfReader changePDFPageSize(String inpdf,String outpdf,float vertical,float horizontal)
{
    PdfReader reader = null;
    try
    {
        reader = new PdfReader(inpdf);
        PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(outpdf));
        for (int curPageNum = 1; curPageNum <= reader.getNumberOfPages(); ++curPageNum) {
            PdfDictionary pagedict = reader.getPageN(curPageNum);
            PdfArray mediabox = pagedict.getAsArray(PdfName.MEDIABOX);
            if (mediabox == null || mediabox.size() < 4) {
                continue;
            }
            // floatValue keeps fractional coordinates that intValue would truncate.
            float left = mediabox.getAsNumber(0).floatValue() - horizontal;
            float bottom = mediabox.getAsNumber(1).floatValue() - vertical;
            float right = mediabox.getAsNumber(2).floatValue() + horizontal;
            float top = mediabox.getAsNumber(3).floatValue() + vertical;
            // Store a fresh array per page instead of mutating the existing one:
            // the same array object may be referenced by several pages, in which
            // case in-place edits would be applied repeatedly.
            pagedict.put(PdfName.MEDIABOX, new PdfArray(new float[] {left, bottom, right, top}));
        }
        stamper.close();
        return new PdfReader(outpdf);
    } catch (FileNotFoundException e)
    {
        e.printStackTrace();
    } catch (DocumentException e)
    {
        e.printStackTrace();
    } catch (IOException e)
    {
        e.printStackTrace();
    } finally
    {
        // The source reader is no longer needed once the stamped copy is written.
        if (reader != null) {
            reader.close();
        }
    }
    return null;
}
itextsharp VB.NET
' Scales page 1 of an existing PDF to A4 with iTextSharp's PdfStamper.
' NOTE(review): the snippet ends before the stamper, stream and reader are closed - confirm the caller does that.
Dim objReader As PdfReader
Dim objStream As FileStream
Dim objStamper As PdfStamper
Dim objContent As PdfContentByte
Dim objImport As PdfImportedPage
Dim objMark As Image
' Open the source PDF and a stamper that writes to the temporary output path.
objReader = New PdfReader(strBookPath)
objStream = New FileStream(strTempPath, FileMode.Create)
objStamper = New PdfStamper(objReader, objStream)
' Content obtained via GetOverContent for page 1, and page 1 imported as a template.
objContent = objStamper.GetOverContent(1)
objImport = objStamper.GetImportedPage(objReader, 1)
' Draw the imported page with x/y scale factors that map its width and height onto A4.
objContent.AddTemplate(objImport, PageSize.A4.Width / objImport.Width, 0, 0, PageSize.A4.Height / objImport.Height, 0, 0)
' Set both the crop box and the media box of page 1 to A4 dimensions.
objReader.GetPageN(1).Put(PdfName.CROPBOX, New PdfRectangle(PageSize.A4.Width, PageSize.A4.Height))
objReader.GetPageN(1).Put(PdfName.MEDIABOX, New PdfRectangle(PageSize.A4.Width, PageSize.A4.Height))

"Content can not be added to a PdfImportedPage." error

I am trying to download and merge multiple PDF files using iTextSharp.
It used to work before, but now I get a "Content can not be added to a PdfImportedPage." error message on the line:
importedPage = writer.GetImportedPage(reader, currentPageIndex);
The full code is below, any help will be very appreciated.
/// <summary>
/// Downloads the PDFs at the given URLs, copies every page into one new
/// document held in memory, and writes the result to the report folder.
/// </summary>
/// <param name="fileUrls">URLs of the PDF files to merge, in output order.</param>
/// <param name="fileName">Name of the merged file, without the ".pdf" extension.</param>
/// <returns>The relative path "Reports/{fileName}.pdf".</returns>
private string MergeDocuments(IList<string> fileUrls, string fileName)
{
var reportFolder = this.ReportFolder + "\\";
using (MemoryStream output = new MemoryStream())
{
Document document = new Document();
try
{
// Initialize pdf writer
PdfWriter writer = PdfWriter.GetInstance(document, output);
// Open document to write
document.Open();
PdfContentByte content = writer.DirectContent;
PdfImportedPage importedPage;
// Iterate through all pdf documents
foreach (var url in fileUrls)
{
// Create pdf reader
using (PdfReader reader = new PdfReader(new Uri(url)))
{
int numberOfPages = reader.NumberOfPages;
// Iterate through all pages
for (int currentPageIndex = 1; currentPageIndex <= numberOfPages; currentPageIndex++)
{
// Determine page size for the current page
document.SetPageSize( reader.GetPageSizeWithRotation(currentPageIndex) );
// Create page
document.NewPage();
// Import the source page and draw it unscaled at the origin of the new page
importedPage = writer.GetImportedPage(reader, currentPageIndex);
content.AddTemplate(importedPage, 1f, 0, 0, 1f, 0, 0);
}
}
}
}
catch (Exception exception)
{
// Any failure is rethrown wrapped in a generic Exception that carries the original as inner exception
throw new Exception("Error occured", exception);
}
// NOTE(review): the document is never closed in this method before the buffer is written out.
// NOTE(review): GetBuffer() presumably returns the whole underlying buffer, not only the written bytes - confirm whether ToArray() is intended.
File.WriteAllBytes(reportFolder + fileName + ".pdf", output.GetBuffer());
}
return "Reports/" + fileName + ".pdf";
}
When I try the following code, I get a null pointer exception in the addDocument() method:
// Alternative merge attempt: downloads each PDF as a byte array and appends it with PdfSmartCopy.
using (MemoryStream output = new MemoryStream()) {
Document document = new Document();
// NOTE(review): Open() is called before the PdfSmartCopy is created on this document;
// presumably the copy writer must be attached first - confirm, as this ordering may explain the exception.
document.Open();
PdfCopy copy = new PdfSmartCopy(document, output);
foreach (var url in fileUrls) {
using (WebClient client = new WebClient()) {
// Download the whole PDF into memory, append all of its pages, then release the reader.
var byteArray = client.DownloadData(url);
PdfReader reader = new PdfReader(byteArray);
copy.AddDocument(reader);
reader.Close();
}
}
// NOTE(review): the document is not closed and the stream content is not used within this snippet.
}
I found the problem: the document object should be closed before writing the memory stream to the file.
I just added document.Close() as shown below.
// Close the document first, then write the memory stream's buffer to the file.
document.Close();
File.WriteAllBytes(reportFolder + fileName + ".pdf", output.GetBuffer());

iText rotation creates pdf which displays out of memory exception

The following code snippet creates a PDF file in which pages can be rotated. This works fine for most PDF files. But in one particular PDF file (version 1.6) the page is already rotated by 180 degrees; applying a further rotation to it, e.g. 90 degrees, and saving the file causes it to become corrupted. In fact, even if you don't rotate the file and simply write it out to another file using iText, the resulting PDF is corrupted and displays an out-of-memory exception when opened in Adobe Reader.
Why would that happen? Am I missing some sort of compression in the file?
/**
 * Combines the selected pages of several PDF files into one new PDF,
 * applying an additional rotation to the pages that request one.
 *
 * @param editStateData one entry per source file: file name, selected pages and
 *                      a map from source page number to the extra rotation to add
 * @param directory     directory in which the combined PDF is created
 * @return absolute path of the combined PDF
 * @throws EditingException if a source file cannot be read or the output cannot be written
 */
private String createPdfFileWithoutForms(final EditStateData[] editStateData, final String directory)
        throws EditingException {
    final long startTime = System.currentTimeMillis();
    final File pdfFileToReturn = new File(directory + File.separator + UidGenerator.generate() + ".pdf");
    com.lowagie.text.Document document = null;
    FileOutputStream outputStream = null;
    PdfCopy pdfCopy = null;
    try {
        document = new com.lowagie.text.Document();
        outputStream = new FileOutputStream(pdfFileToReturn);
        pdfCopy = new PdfCopy(document, outputStream);
        pdfCopy.setFullCompression();
        pdfCopy.setCompressionLevel(9);
        document.open();
        for (EditStateData state : editStateData) {
            // One reader per source file, so the finally block never closes a
            // stale reader left over from a previous iteration.
            PdfReader reader = null;
            try {
                reader = new PdfReader(state.getFileName());
                reader.selectPages(state.getPages());
                final Map<Integer, Integer> rotationQuadrants = state.getRotationQuadrants();
                for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                    // Rotation quadrant key is the source page number
                    final Object sourcePage = state.getPages().get(i - 1);
                    if (rotationQuadrants.containsKey(sourcePage)) {
                        final int rotationAngle = reader.getPageRotation(i);
                        final int combined = rotationAngle + rotationQuadrants.get(sourcePage);
                        // Normalize into [0, 360) so a negative extra rotation
                        // does not produce a negative /Rotate entry.
                        final PdfDictionary pageDict = reader.getPageN(i);
                        pageDict.put(PdfName.ROTATE, new PdfNumber(((combined % 360) + 360) % 360));
                    }
                    document.setPageSize(reader.getPageSizeWithRotation(i));
                    document.newPage();
                    // import the page from source pdf
                    final PdfImportedPage page = pdfCopy.getImportedPage(reader, i);
                    // add the page to the destination pdf
                    pdfCopy.addPage(page);
                }
            } catch (final IOException e) {
                LOGGER.error(e.getMessage(), e);
                throw new EditingException(e.getMessage(), e);
            } finally {
                if (reader != null) {
                    reader.close();
                }
            }
        }
    } catch (final EditingException e) {
        // Already logged and wrapped by the inner handler; do not log or wrap it again.
        throw e;
    } catch (final Exception e) {
        LOGGER.error(e.getMessage(), e);
        throw new EditingException(e.getMessage(), e);
    } finally {
        if (document != null) {
            document.close();
        }
        if (pdfCopy != null) {
            pdfCopy.close();
        }
        IoUtils.closeQuietly(outputStream);
    }
    // The elapsed time is reported in milliseconds, matching the "msecs" label.
    LOGGER.debug("Combining " + editStateData.length + " pdf files took "
            + (System.currentTimeMillis() - startTime) + " msecs");
    return pdfFileToReturn.getAbsolutePath();
}