Create PDF with iText on iSeries leads to error "The document has no pages." - itext

We use the iText library (version 5.5.10) in one of my customer's projects to generate a PDF from a string that contains an HTML page.
The following piece of code works well on the development environments and servers running on Windows, but it is not working on the customer's server running on iSeries.
public class GeneratePDFCmdImpl extends ControllerCommandImpl implements
GeneratePDFCmd {
private String charsetStr = null;
private Charset charset = null;
private BaseFont bf = null;
private String destFile = null;
private String destFilename = null;
private String srcContent = null;
private String docName = null;
public void setDocName(String docname) {
this.docName = docname;
}
public void setSrcContent(String srcContent) {
this.srcContent = srcContent;
}
private void prepareDefaultsAndSettings() {
/* srcContent may be more complex html but even this simple one is not working */
srcContent = "<html><head></head><body>This is just a test</body></html>";
docName = "mypdf";
charsetStr = "UTF-8";
destFilename = docName+".pdf";
Date timestamp = new Date();
/* destFile = "/" is just for the sample. In my real project, the value is a folder where my app has full rights
*/
destFile = "/" + destFilename;
charset = Charset.forName(charsetStr);
FontFactory.register("/fonts/arial.ttf","Arial");
bf = FontFactory.getFont("Arial").getBaseFont();
}
#Override
public void performExecute() throws ECException {
super.performExecute();
Document document = null;
OutputStream os = null;
prepareDefaultsAndSettings();
try {
InputStream srcInputStream;
srcInputStream = new ByteArrayInputStream(srcContent.getBytes(charset));
document = new Document(PageSize.A4, 20, 20, 75, 80);
FileOutputStream destOutput = new FileOutputStream(destFile);
PdfWriter writer = PdfWriter.getInstance(document,destOutput);
writer.setPageEvent( new HeaderFooterPageEvent(bf));
document.open();
XMLWorkerHelper.getInstance().parseXHtml(writer, document, srcInputStream, charset);
document.close();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (DocumentException e) {
e.printStackTrace();
} finally {
if(document != null) {
document.close();
}
document = null;
try {
if (os != null) {
os.close();
}
} catch(IOException e) {
e.printStackTrace();
}
os = null;
}
}
private class HeaderFooterPageEvent extends PdfPageEventHelper {
PdfContentByte cb;
PdfTemplate template;
BaseFont bf;
Font f;
float fs;
public HeaderFooterPageEvent(BaseFont _bf) {
super();
bf = _bf;
f = new Font(bf);
}
#Override
public void onOpenDocument(PdfWriter writer, Document document) {
cb = writer.getDirectContent();
template = cb.createTemplate(50, 50);
}
#Override
public void onEndPage(PdfWriter writer, Document document) {
Date dat = new Date();
ColumnText ct = new ColumnText(writer.getDirectContent());
SimpleDateFormat sdf = new SimpleDateFormat("dd-MM-yyyy HH:mm");
ct.showTextAligned(writer.getDirectContent(), Element.ALIGN_CENTER, new Phrase(sdf.format(dat) ), 100, 30, 0);
String text = "Page " +writer.getPageNumber() + " to ";
float len = bf.getWidthPoint(text, 12);
cb.beginText();
cb.setFontAndSize(bf, 12);
cb.setTextMatrix(450, 30);
cb.showText(text);
cb.endText();
cb.addTemplate(template, 450 + len, 30);
}
#Override
public void onCloseDocument(PdfWriter writer, Document document) {
template.beginText();
template.setFontAndSize(bf, 12);
template.showText(String.valueOf(writer.getPageNumber()));
template.endText();
}
}
}
When executed on the iSeries, we have the error message
com.ibm.commerce.command.ECCommandTarget executeCommand CMN0420E: The following command exception has occurred during processing: "ExceptionConverter: java.io.IOException: The document has no pages.". ExceptionConverter: java.io.IOException: The document has no pages.
at com.itextpdf.text.pdf.PdfPages.writePageTree(PdfPages.java:112)
at com.itextpdf.text.pdf.PdfWriter.close(PdfWriter.java:1256)
at com.itextpdf.text.pdf.PdfDocument.close(PdfDocument.java:900)
at com.itextpdf.text.Document.close(Document.java:415)
at be.ourcustomer.package.GeneratePDFCmdImpl.performExecute(GeneratePDFCmdImpl.java:107)
I have no idea what we are doing wrong. Any help would be greatly appreciated.

Related

ITEXTPDF 5 : Print svg with specifical fonts

I'm using iText 5.
I have an SVG file that uses a specific font (embedded in the SVG).
When I print my SVG (using Batik 1.8), the graphic is drawn on my document, but the text is not rendered as real text, so it cannot be selected.
My Java code is shown below:
public class ItextPdfSmallTests {
#Test
public void svgFontsTest() throws IOException, DocumentException, URISyntaxException {
String RESULT = "C:\\test\\svgFontsTest.pdf";
Document document = new Document(PageSize.A4, 36, 36, 54, 36);
PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(RESULT));
document.open();
document.add(new Paragraph("SVG Example"));
int width = 250;
int height = 250;
PdfContentByte cb = writer.getDirectContent();
PdfTemplate template = cb.createTemplate(width, height);
PdfPrinterGraphics2D g2 = new PdfPrinterGraphics2D(cb, width, height, new MyFontMapper(), PrinterJob.getPrinterJob());
PrintTranscoder prm = new PrintTranscoder();
URI svgFileURI = getClass().getResource("myfont.svg").toURI();
TranscoderInput ti = new TranscoderInput(svgFileURI.toString());
prm.transcode(ti, null);
PageFormat pg = new PageFormat();
Paper pp = new Paper();
pp.setSize(width, height);
pp.setImageableArea(0, 0, width, height);
pg.setPaper(pp);
prm.print(g2, pg, 0);
g2.dispose();
ImgTemplate img = new ImgTemplate(template);
document.add(img);
document.close();
}
class MyFontMapper extends DefaultFontMapper {
#Override
public BaseFont awtToPdf(java.awt.Font font) {
try {
return BaseFont.createFont("AmaticSC-Regular.ttf", BaseFont.WINANSI, false);
} catch (DocumentException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
}
}
Is it possible to make the text selectable and editable?
Thanks for your help.

iTextSharp 5 generated PDF-Document AdobeReader-compatibility [duplicate]

This question already has answers here:
iTextSharp-generated PDFs now cause Save dialog in Adobe Reader X
(2 answers)
Closed 5 years ago.
I am generating some reports with iTextSharp 5.
Opening the generated .PDF-file, everything looks and actually is fine with most PDF-readers.
However, when I open the PDF in Adobe Reader (DC), it asks me on close whether I want to save the changes, although I didn't change anything.
Hitting 'Cancel' makes the dialog go away, but hitting 'Save' actually causes the file to shrink in size.
Now what is happening there? And why? How can I disable it?
The users of the application are most likely gonna use AdobeReader as well.
I don't want them to see the save dialog anytime they open a report.
Here is my BaseReport class
public abstract class BaseReport : PdfPageEventHelper
{
// Shared state of the report: string helpers, iTextSharp handles, fonts,
// feature switches and page geometry.
protected const string SPACE = " ";
protected const string COLON = ":";
protected static string NEWLINE = Environment.NewLine;
protected Document document;
protected PdfTemplate footerTemplate;
// direct content of the writer, cached by OpenDocument()
protected PdfContentByte contentByte;
protected PdfWriter writer;
// placed on every page by OnEndPage and filled with the total page count in OnCloseDocument
private PdfTemplate totalPageNoTemplate;
// number of the page most recently finished (set in OnEndPage)
private int lastPageNumber;
// properties for header
// NOTE(review): `done` is not referenced in the visible part of this class
private bool done;
// header text drawn by OnStartPage on every page after the first;
// must be assigned by a subclass for the header to appear
protected string kundeHeader { get; set; }
// font definitions
protected BaseFont baseFont;
protected Font fontFooter;
protected Font fontGeneralText;
protected Font fontLabelText;
protected Font fontBoldLabelText;
protected Font fontBoldText;
protected Font fontSpace;
protected Font fontLargeBoldText;
protected int language;
// feature switches read by the page-event handlers; all off by default
protected bool useLogo = false;
protected bool usePageNumbers = false;
protected bool usePrintDate = false;
protected const string PRINT_FULLDATE_FORMAT = "dd.MM.yyyy HH:mm";
protected const string PRINT_DATE_ONLY_FORMAT = "dd.MM.yyyy";
// page size and margins handed to the Document constructor in GenerateReport()
protected Rectangle pagesize = PageSize.A4;
protected float marginLeft = 80;
protected float marginRight = 35;
protected float marginTop = 40;
protected float marginBottom = 40;
// in-memory stream the PdfWriter writes to
private MemoryStream PDFStream { get; set; } = new MemoryStream();
// timestamp captured in OnOpenDocument when usePrintDate is set
private DateTime printDate;
/// <summary>
/// Creates a report for the given language; a landscape report uses the rotated page size.
/// </summary>
public BaseReport(int language = Languages.DE, bool landscape = false)
{
    if (landscape)
    {
        pagesize = pagesize.Rotate();
    }
    this.language = language;
}
/// <summary>
/// Builds the report in memory and returns the bytes of the finished PDF.
/// The resource culture is switched to the report language while the report is
/// generated and restored afterwards, even when generation fails.
/// </summary>
/// <returns>Exactly the bytes that were written to the PDF stream.</returns>
public byte[] GenerateReport()
{
    CultureInfo cultureBefore = Resources.Culture;
    try
    {
        Resources.Culture = SelectCultureForLangauge();
        PrepareReport();
        document = new Document(pagesize, marginLeft, marginRight, marginTop, marginBottom);
        BuildFonts();
        OpenDocument();
        PrepareDocument();
        GenerateContent();
        document.Close();
        // ToArray() copies only the bytes actually written. GetBuffer() returns the
        // whole internal buffer including unused capacity, which left trailing bytes
        // after the end of the PDF and made Adobe Reader offer to "save changes".
        return PDFStream.ToArray();
    }
    finally
    {
        Resources.Culture = cultureBefore;
    }
}
/// <summary>
/// Generates the report and writes it to <paramref name="filename"/>,
/// creating the file or overwriting an existing one.
/// </summary>
public void GenerateReport(string filename)
{
    byte[] pdfBytes = GenerateReport();
    using (FileStream target = new FileStream(filename, FileMode.Create))
    {
        target.Write(pdfBytes, 0, pdfBytes.Length);
    }
}
/// <summary>Returns the culture that belongs to the report's language code.</summary>
protected CultureInfo SelectCultureForLangauge()
{
    return CultureInfo.GetCultureInfo(GetLanguageCode());
}
/// <summary>
/// Maps the numeric report language to its two-letter code; any value other than
/// FR, IT or EN yields "DE".
/// </summary>
protected string GetLanguageCode()
{
    switch (language)
    {
        case Languages.FR:
            return "FR";
        case Languages.IT:
            return "IT";
        case Languages.EN:
            return "EN";
        default:
            return "DE";
    }
}
// Hook called by GenerateReport() before the Document is created; does nothing by default.
protected virtual void PrepareReport() { }
// Hook called by GenerateReport() after OpenDocument() and before GenerateContent(); does nothing by default.
protected virtual void PrepareDocument() { }
// Implemented by subclasses to add the report body; called by GenerateReport() right before the document is closed.
protected abstract void GenerateContent();
/// <summary>
/// Sets up the Helvetica-based default fonts and then replaces them with the
/// custom font if it can be loaded.
/// </summary>
private void BuildFonts()
{
    string family = FontFactory.HELVETICA;
    BaseColor black = BaseColor.BLACK;

    baseFont = BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.WINANSI, BaseFont.NOT_EMBEDDED);
    fontFooter = FontFactory.GetFont(family, 11, Font.ITALIC, BaseColor.DARK_GRAY);
    fontGeneralText = FontFactory.GetFont(family, 11, Font.NORMAL, black);
    fontLabelText = FontFactory.GetFont(family, 8.5f, Font.NORMAL, black);
    fontBoldLabelText = FontFactory.GetFont(family, 8.5f, Font.BOLD, black);
    fontBoldText = FontFactory.GetFont(family, 11, Font.BOLD, black);
    fontSpace = FontFactory.GetFont(family, 3.5f, Font.NORMAL, black);
    fontLargeBoldText = FontFactory.GetFont(family, 17, Font.BOLD, black);

    GetFontIfAvailable();
}
/// <summary>
/// Tries to load the embedded "IF_Rg" font and, on success, rebuilds all report fonts
/// from it. If the font file is missing, a warning is logged and the fonts stay as they are.
/// </summary>
private void GetFontIfAvailable()
{
    BaseFont custom;
    try
    {
        custom = LoadFontFromFile("IF_Rg", true);
    }
    catch (FileNotFoundException)
    {
        LogWrapper.Warn("Font not found - using default.");
        return;
    }

    BaseColor black = BaseColor.BLACK;
    baseFont = custom;
    fontFooter = new Font(custom, 11, Font.ITALIC, BaseColor.DARK_GRAY);
    fontGeneralText = new Font(custom, 11, Font.NORMAL, black);
    fontLabelText = new Font(custom, 8.5f, Font.NORMAL, black);
    fontBoldLabelText = new Font(custom, 8.5f, Font.BOLD, black);
    fontBoldText = new Font(custom, 11, Font.BOLD, black);
    fontSpace = new Font(custom, 3.5f, Font.NORMAL, black);
    fontLargeBoldText = new Font(custom, 17, Font.BOLD, black);
}
/// <summary>
/// Loads "&lt;SystemRoot&gt;\fonts\&lt;fileName&gt;.ttf" as a WinAnsi base font.
/// </summary>
/// <param name="fileName">Font file name without directory and extension.</param>
/// <param name="embedded">Whether the font is embedded in the PDF.</param>
/// <exception cref="FileNotFoundException">The font file does not exist.</exception>
protected BaseFont LoadFontFromFile(string fileName, bool embedded)
{
    string fontPath = Environment.GetEnvironmentVariable("SystemRoot") + "\\fonts\\" + fileName + ".ttf";
    if (!File.Exists(fontPath))
    {
        throw new FileNotFoundException($"Fontfile {fileName} was not found!");
    }
    return BaseFont.CreateFont(fontPath, BaseFont.WINANSI, embedded);
}
/// <summary>Returns the logo from the resources, scaled to fit a 100 x 100 box.</summary>
protected Image HeaderLogo()
{
    // TODO msc pick logo from debitor
    Image scaledLogo = Image.GetInstance(Resources.logo, BaseColor.BLACK);
    scaledLogo.ScaleToFit(100f, 100f);
    return scaledLogo;
}
// Binds a PdfWriter to the in-memory PDFStream, registers this report as the
// writer's page-event handler, enables full compression, opens the document and
// caches the writer's direct content in contentByte.
protected void OpenDocument()
{
writer = PdfWriter.GetInstance(document, PDFStream);
// this class derives from PdfPageEventHelper, so it receives the On* callbacks itself
writer.PageEvent = this;
writer.SetFullCompression();
document.Open();
contentByte = writer.DirectContent;
}
/// <summary>
/// Writes <paramref name="label"/> in the label font into a 200-unit-wide column that
/// starts at <paramref name="posX"/> and spans vertically from 20 up to <paramref name="posY"/>.
/// </summary>
protected void AddLabelAt(string label, float posX, float posY)
{
    Paragraph labelText = new Paragraph(label + NEWLINE, fontLabelText);
    ColumnText labelColumn = new ColumnText(writer.DirectContent);
    labelColumn.SetText(labelText);
    labelColumn.SetSimpleColumn(posX, 20, posX + 200, posY);
    labelColumn.Go();
}
/// <summary>
/// Writes <paramref name="label"/> 40 units left of the left margin at the writer's
/// current vertical position.
/// </summary>
protected void AddLabelOnMargin(string label)
{
    float x = document.LeftMargin - 40;
    float y = writer.GetVerticalPosition(false);
    AddLabelAt(label, x, y);
}
// Wraps the given text in a Phrase that uses the given font.
protected Phrase ParaLine(string Text, Font textfont)
{
return new Phrase(Text, textfont);
}
// Page event raised when the document is opened: prepares what the footer needs.
// Note: the `writer` and `document` parameters hide the fields of the same name.
public override void OnOpenDocument(PdfWriter writer, Document document)
{
if (usePageNumbers)
{
// 50 x 50 placeholder that OnEndPage adds to each page and OnCloseDocument fills with the total page count
totalPageNoTemplate = writer.DirectContentUnder.CreateTemplate(50, 50);
}
if (usePrintDate)
{
// captured once so that every page shows the same print date
printDate = DateTime.Now;
}
}
/// <summary>
/// Page event raised at the start of each page. On pages after the first it draws
/// up to two lines of <c>kundeHeader</c> above the top margin; when <c>useLogo</c>
/// is set it also places the header logo.
/// </summary>
public override void OnStartPage(PdfWriter writer, Document document)
{
    bool drawHeader = document.PageNumber > 1 && !string.IsNullOrEmpty(kundeHeader);
    if (!useLogo && !drawHeader)
    {
        return;
    }

    PdfContentByte canvas = writer.DirectContentUnder;
    canvas.SaveState();
    if (drawHeader)
    {
        // ShowTextAligned only renders a single line, so the header is split
        // and its lines are added separately.
        string[] headerParts = kundeHeader.Split(new string[] { Environment.NewLine }, StringSplitOptions.None);
        // A one-line header has no second part; indexing headerParts[1]
        // unconditionally threw IndexOutOfRangeException in that case.
        int lineCount = Math.Min(headerParts.Length, 2);
        for (int i = 0; i < lineCount; i++)
        {
            // first line at Top + 30, second line at Top + 20
            ColumnText.ShowTextAligned(canvas, Element.ALIGN_LEFT,
                ParaLine(headerParts[i], fontLabelText),
                document.LeftMargin,
                document.Top + 30 - 10 * i, 0);
        }
    }
    if (useLogo)
    {
        Image logo = HeaderLogo();
        logo.SetAbsolutePosition(marginLeft - 17.5f, document.Top + document.TopMargin - 50);
        document.Add(logo);
    }
    canvas.RestoreState();
}
/// <summary>
/// Page event raised at the end of each page: writes the footer, i.e. the
/// "page n/total" marker on the left and/or the print date on the right.
/// </summary>
public override void OnEndPage(PdfWriter writer, Document document)
{
    if (!usePageNumbers && !usePrintDate)
    {
        return;
    }

    PdfContentByte footerCanvas = writer.DirectContentUnder;
    footerCanvas.SaveState();

    if (usePageNumbers)
    {
        // current page number followed by a slash
        int currentPage = writer.PageNumber;
        string pageLabel = Resources.LabelSeite + SPACE + currentPage + "/";
        float labelWidth = fontLabelText.BaseFont.GetWidthPoint(pageLabel, fontLabelText.Size);
        ColumnText.ShowTextAligned(footerCanvas, Element.ALIGN_LEFT,
            ParaLine(pageLabel, fontLabelText),
            document.LeftMargin,
            document.Bottom - 10, 0);
        // the total page count is filled into this template by OnCloseDocument
        footerCanvas.AddTemplate(totalPageNoTemplate, document.LeftMargin + labelWidth, document.Bottom - 10);
        lastPageNumber = currentPage;
    }

    if (usePrintDate)
    {
        // print date, right-aligned on the footer line
        string printedOn = printDate.ToString(PRINT_FULLDATE_FORMAT);
        ColumnText.ShowTextAligned(footerCanvas, Element.ALIGN_RIGHT,
            ParaLine(printedOn, fontLabelText),
            document.Right,
            document.Bottom - 10, 0);
    }

    footerCanvas.RestoreState();
}
/// <summary>
/// Page event raised when the document is closed: writes the total page count
/// into the template that OnEndPage placed in every footer.
/// </summary>
public override void OnCloseDocument(PdfWriter writer, Document document)
{
    if (!usePageNumbers)
    {
        return;
    }

    string total = lastPageNumber + "";
    // shrink the template to the width of the rendered number
    totalPageNoTemplate.Width = fontLabelText.BaseFont.GetWidthPoint(total, fontLabelText.Size);
    ColumnText.ShowTextAligned(totalPageNoTemplate, Element.ALIGN_LEFT, ParaLine(total, fontLabelText), 0, 0, 0);
}
iTextSharp-generated PDFs now cause Save dialog in Adobe Reader X
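I had to replace PDFStream.GetBuffer() with PDFStream.ToArray(); problem solved. GetBuffer() returns the stream's whole internal buffer, including the unused bytes after the written data, so the file ended with garbage that Adobe Reader repaired and then offered to save; ToArray() returns only the bytes that were actually written.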

How to generate signature using hash string using iTextSharp and c#

I am trying to pass a public key string to MakeSignature.SignExternalContainer, but I get an error:
string sing_test = "MIIFMTCCBBmgAwIBAgICMF8wDQYJKoZIhvcNAQELBQAwfDELMAkGA1UEBhMCSU4xKjAoBgNVBAoTIU5TREwgZS1Hb3YgSW5mcmFzdHJ1Y3R1cmUgTGltaXRlZDEdMBsGA1UECxMUQ2VydGlmeWluZyBBdXRob3JpdHkxIjAgBgNVBAMTGU5TRExlR292SXNzdWluZ0NBMjAxNlRlc3QwHhcNMTcwMTI4MDkwOTEzWhcNMTcwMTI4MDkzODEzWjCCARgxCzAJBgNVBAYTAklOMR0wGwYDVQQDExRNQU5JU0ggR0lSSVNIIEJFTkRSRTEpMCcGA1UEQRMgMjUxYjkyYjkxZjllNDBmOGE2YmVlZjZiMDgyMzYyZGYxNjA0BgNVBC0DLQBsbnlOOVVuNzE5S1NKNnpaQVNtUGZaUlJpYTVuTXBLSEV0cs3ZidAmwZQKbBmUKLoeD8nCDIUZZR9Am2pwSnar7WtB2MA74Sz4U2xzifNsh22YPd+ySbGqJtUd9xbhlQIDAQABo4IBHTCCARkwEQYDVR0OBAoECEwFNGCDXdKgMIG7BgNVHSAEgbMwgbAwga0GB2CCZGQCBAIwgaEwQQYIKwYBBQUHAgEWNWh0dHBzOi8vbnNkbC5lZ292LWNhLmNvLmluL3JlcG9zaXRvcnkvbnNkbGVnb3ZjcHMucGRmMFwGCCsGAQUFBwICMFAaTkFhZGhhYXIgZS1LWUMtQmlvbWV0cmljIENsYXNzIENlcnRpZmljYXRlIElzc3VlZCBieSBOU0RMIGUtR292IElzc3VpbmcgQ0EgVGVzdDAhBgNVHREEGjAYgRZtYW5pc2hiZW5kcmVAZ21haWwuY29tMBMGA1UdIwQMMAqACEBuFJI2LqIcMA4GA1UdDwEB/wQEAwIGwDANBgkqhkiG9w0BAQsFAAOCAQEADbuOkgWKquflIrqDsB93L5aa+VxjFHvB914UDIllO4MYTo/UVgDN2iANiJ2HOjFkY0VhdnuJKp0cjDSywP6mTXs0VUf70DEL5sZpjfnoJK++Eb6FlDHHMKflMkG/ja3b6FWK1W/1L0/yjYpjl4E2Uu5tq0T3k4ZOPd/LBD3OeudKZM1IPaT95Zd8JRqwz6LsyYx1SvXqLtrRUb8eIauvAJ92prvovxusvupzolB3AOCkwEr6jqGXOiwssEnqUCuUd3CVXWUxL5TzWW9oCPIDAKbUyyVWtntorVFfKmzvWDCV42jkHrf9J1snbr4DyjNhkOSQr6cDZfg0uK2gKWfBcA==";
// Decode the Base64 text held in sing_test into its raw bytes.
byte[] rawData = System.Convert.FromBase64String(sing_test );
/// <summary>
/// Writes a copy of <paramref name="unsignedPdf"/> with an empty (blank-container) signature
/// field to <paramref name="tempPdf"/> and returns the SHA-1 hash of the signed byte ranges.
/// </summary>
/// <param name="unsignedPdf">Path of the PDF to prepare.</param>
/// <param name="tempPdf">Path of the intermediate PDF; created or overwritten.</param>
/// <param name="signatureFieldName">Name of the signature field to create.</param>
public static byte[] GetBytesToSign(string unsignedPdf, string tempPdf, string signatureFieldName)
{
    using (PdfReader reader = new PdfReader(unsignedPdf))
    {
        // FileMode.Create truncates an existing file. File.OpenWrite does not, so
        // a longer file left over from a previous run kept its old tail and the
        // result was not a valid PDF.
        using (FileStream os = new FileStream(tempPdf, FileMode.Create))
        {
            PdfStamper stamper = PdfStamper.CreateSignature(reader, os, '\0');
            PdfSignatureAppearance appearance = stamper.SignatureAppearance;
            appearance.SetVisibleSignature(new iTextSharp.text.Rectangle(36, 748, 144, 780), 1, signatureFieldName);
            IExternalSignatureContainer external = new ExternalBlankSignatureContainer(PdfName.ADOBE_PPKMS, PdfName.ADBE_PKCS7_SHA1);
            MakeSignature.SignExternalContainer(appearance, external, 8192);
            stamper.Close();
            using (SHA1 sha1 = SHA1.Create())
            {
                return sha1.ComputeHash(appearance.GetRangeStream());
            }
        }
    }
}
/// <summary>
/// Reads <paramref name="tempPdf"/>, applies a signature whose container bytes are
/// <paramref name="signedBytes"/> and writes the result to <paramref name="signedPdf"/>.
/// </summary>
/// <param name="tempPdf">Path of the prepared PDF.</param>
/// <param name="signedPdf">Path of the output PDF; created or overwritten.</param>
/// <param name="signatureFieldName">Not used by this implementation (see note below).</param>
/// <param name="signedBytes">Signature container returned by the external signer.</param>
public static void EmbedSignature(string tempPdf, string signedPdf, string signatureFieldName, byte[] signedBytes)
{
    // NOTE(review): this creates a second, new signature in append mode rather than
    // filling the field prepared by GetBytesToSign, and signatureFieldName is unused.
    // Presumably the prepared field was meant to be filled (MakeSignature.SignDeferred
    // looks like the matching call) - confirm, since that would explain the
    // verification error.
    using (PdfReader reader = new PdfReader(tempPdf))
    {
        // FileMode.Create truncates an existing file; File.OpenWrite would leave
        // stale bytes behind the new content.
        using (FileStream os = new FileStream(signedPdf, FileMode.Create))
        {
            PdfStamper st = PdfStamper.CreateSignature(reader, os, '\0', null, true);
            PdfSignatureAppearance sap = st.SignatureAppearance;
            sap.Reason = "Testing";
            sap.Location = "Test";
            sap.SetVisibleSignature(new iTextSharp.text.Rectangle(250, 50, 50, 100), 1, null);
            IExternalSignatureContainer external = new MyExternalSignatureContainer(signedBytes);
            // Failures are no longer swallowed: an empty catch here silently
            // produced an unusable output file.
            MakeSignature.SignExternalContainer(sap, external, 8192);
            st.Close();
        }
    }
}
/// <summary>
/// Signature container that does no signing itself: it hands back the bytes it was
/// constructed with and leaves the signature dictionary untouched.
/// </summary>
private class MyExternalSignatureContainer : IExternalSignatureContainer
{
    private readonly byte[] precomputedSignature;

    public MyExternalSignatureContainer(byte[] signedBytes)
    {
        precomputedSignature = signedBytes;
    }

    /// <summary>Ignores <paramref name="data"/> and returns the stored bytes.</summary>
    public byte[] Sign(Stream data)
    {
        return precomputedSignature;
    }

    /// <summary>Intentionally empty: nothing is added to the dictionary.</summary>
    public void ModifySigningDictionary(PdfDictionary signDic)
    {
    }
}
getting error..
Error during signature verification.
Error encountered while BER decoding:
Thanks...

Creating custom plugin for chinese tokenization

I'm working towards properly integrating the stanford segmenter within SOLR for chinese tokenization.
This plugin involves loading other jar files and model files. I've got it working in a crude manner by hardcoding the complete path for the files.
I'm looking for methods to create the plugin where the paths need not be hardcoded and also to have the plugin in conformance with the SOLR plugin architecture. Please let me know if there are any recommended sites or tutorials for this.
I've added my code below :
public class ChineseTokenizerFactory extends TokenizerFactory {
/** Creates a new WhitespaceTokenizerFactory */
public ChineseTokenizerFactory(Map<String,String> args) {
super(args);
assureMatchVersion();
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
#Override
public ChineseTokenizer create(AttributeFactory factory, Reader input) {
Reader processedStringReader = new ProcessedStringReader(input);
return new ChineseTokenizer(luceneMatchVersion, factory, processedStringReader);
}
}
public class ProcessedStringReader extends java.io.Reader {
private static final int BUFFER_SIZE = 1024 * 8;
//private static TextProcess m_textProcess = null;
private static final String basedir = "/home/praveen/PDS_Meetup/solr-4.9.0/custom_plugins/";
static Properties props = null;
static CRFClassifier<CoreLabel> segmenter = null;
private char[] m_inputData = null;
private int m_offset = 0;
private int m_length = 0;
public ProcessedStringReader(Reader input){
char[] arr = new char[BUFFER_SIZE];
StringBuffer buf = new StringBuffer();
int numChars;
if(segmenter == null)
{
segmenter = new CRFClassifier<CoreLabel>(getProperties());
segmenter.loadClassifierNoExceptions(basedir + "ctb.gz", getProperties());
}
try {
while ((numChars = input.read(arr, 0, arr.length)) > 0) {
buf.append(arr, 0, numChars);
}
} catch (IOException e) {
e.printStackTrace();
}
m_inputData = processText(buf.toString()).toCharArray();
m_offset = 0;
m_length = m_inputData.length;
}
#Override
public int read(char[] cbuf, int off, int len) throws IOException {
int charNumber = 0;
for(int i = m_offset + off;i<m_length && charNumber< len; i++){
cbuf[charNumber] = m_inputData[i];
m_offset ++;
charNumber++;
}
if(charNumber == 0){
return -1;
}
return charNumber;
}
#Override
public void close() throws IOException {
m_inputData = null;
m_offset = 0;
m_length = 0;
}
public String processText(String inputText)
{
List<String> segmented = segmenter.segmentString(inputText);
String output = "";
if(segmented.size() > 0)
{
output = segmented.get(0);
for(int i=1;i<segmented.size();i++)
{
output = output + " " +segmented.get(i);
}
}
System.out.println(output);
return output;
}
static Properties getProperties()
{
if (props == null) {
props = new Properties();
props.setProperty("sighanCorporaDict", basedir);
// props.setProperty("NormalizationTable", "data/norm.simp.utf8");
// props.setProperty("normTableEncoding", "UTF-8");
// below is needed because CTBSegDocumentIteratorFactory accesses it
props.setProperty("serDictionary",basedir+"dict-chris6.ser.gz");
props.setProperty("inputEncoding", "UTF-8");
props.setProperty("sighanPostProcessing", "true");
}
return props;
}
}
public final class ChineseTokenizer extends CharTokenizer {
public ChineseTokenizer(Version matchVersion, Reader in) {
super(matchVersion, in);
}
public ChineseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
super(matchVersion, factory, in);
}
/** Collects only characters which do not satisfy
* {#link Character#isWhitespace(int)}.*/
#Override
protected boolean isTokenChar(int c) {
return !Character.isWhitespace(c);
}
}
You can pass the argument through the Factory's args parameter.

How do I convert text files to .arff format(weka)

Please advise me how to convert text files to the .arff format (Weka),
because I want to cluster about 1000 .txt files.
regards
There are some converters implemented in WEKA, just find the right format or make little changes to your data (using awk, sed...).
Here is the API pages related to this topic: http://weka.sourceforge.net/doc.stable/weka/core/converters/package-summary.html
For example, here is how to convert from CSV to ARFF:
java weka.core.converters.CSVLoader filename.csv > filename.arff
Here is the code you can use
package text.Classification;
import java.io.*;
import weka.core.*;
/**
 * Builds a Weka dataset with one instance per {@code .txt} file of a directory:
 * a string attribute "contents" holding the file text and a nominal attribute "class".
 */
public class TextDirectoryToArff {

    /**
     * Reads every {@code .txt} file directly inside {@code directoryPath}.
     *
     * @param directoryPath directory that contains the text files
     * @return the dataset, which is also printed (header only) to standard output
     * @throws IOException if the directory cannot be listed or a file cannot be read
     */
    public Instances createDataset(String directoryPath) throws Exception {
        FastVector atts = new FastVector();
        atts.addElement(new Attribute("contents", (FastVector) null));
        String[] s = { "class1", "class2", "class3" };
        FastVector attVals = new FastVector();
        for (String p : s) {
            attVals.addElement(p);
        }
        atts.addElement(new Attribute("class", attVals));
        Instances data = new Instances("MyRelation", atts, 0);
        System.out.println(data);

        File dir = new File(directoryPath);
        String[] files = dir.list();
        if (files == null) {
            // list() returns null for a missing or unreadable directory
            throw new IOException("Cannot list directory: " + directoryPath);
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].endsWith(".txt")) {
                double[] newInst = new double[2];
                File txt = new File(directoryPath + File.separator + files[i]);
                newInst[0] = data.attribute(0).addStringValue(readText(txt));
                // NOTE(review): the label is derived from the file index only, and
                // i % (s.length - 1) never selects the last label - confirm the
                // intended labelling.
                int j = i % (s.length - 1);
                newInst[1] = attVals.indexOf(s[j]);
                data.add(new Instance(1.0, newInst));
            }
        }
        return data;
    }

    /** Reads a whole file with the platform default charset; the reader is always closed. */
    private static String readText(File txt) throws IOException {
        Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
        try {
            StringBuilder text = new StringBuilder();
            int c;
            while ((c = in.read()) != -1) {
                text.append((char) c);
            }
            return text.toString();
        } finally {
            in.close();
        }
    }

    public static void main(String[] args) {
        TextDirectoryToArff tdta = new TextDirectoryToArff();
        try {
            Instances dataset = tdta.createDataset("/home/asadul/Desktop/Downloads/text_example/class5");
            PrintWriter fileWriter = new PrintWriter("/home/asadul/Desktop/Downloads/text_example/abc.arff", "UTF-8");
            try {
                fileWriter.println(dataset);
            } finally {
                fileWriter.close();
            }
        } catch (Exception e) {
            System.err.println(e.getMessage());
            e.printStackTrace();
        }
    }
}