Open OSM pbf results in Protobuf exception - openstreetmap

Using OSMSharp I am having trouble to open a stream for a file (which I can provide on demand)
The error occurs in PBFReader (line 104)
using (var tmp = new LimitedStream(_stream, length))
{
header = _runtimeTypeModel.Deserialize(tmp, null, _blockHeaderType) as BlobHeader;
}
and states: "ProtoBuf.ProtoException: 'Invalid field in source data: 0'" which might mean different things as I have read in this SO question.
The file opens and is visualized with QGis so is not corrupt in my opinion.
Can it be that the contracts do not match? Is OsmSharp/core updated to the latest .proto files for OSM from here (although not sure if this is the real original source for the definition files).
And what might make more sense, can it be that the file I attached is generated for v2 of OSM PBF specification?
In the code at the line of the exception I see the following comment which makes me wonder:
// TODO: remove some of the v1 specific code.
// TODO: this means also to use the built-in capped streams.
// code borrowed from: http://stackoverflow.com/questions/4663298/protobuf-net-deserialize-open-street-maps
// I'm just being lazy and re-using something "close enough" here
// note that v2 has a big-endian option, but Fixed32 assumes little-endian - we
// actually need the other way around (network byte order):
// length = IntLittleEndianToBigEndian((uint)length);
BlobHeader header;
// again, v2 has capped-streams built in, but I'm deliberately
// limiting myself to v1 features
So this makes me wonder if OSM Sharp is (still) up-to-date.
My sandbox code looks like this:
using OsmSharp.Streams;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using OsmSharp.Tags;
namespace OsmSharp
{
class Program
{
private const string Path = #"C:\Users\Bernoulli IT\Documents\Applications\Argaleo\Test\";
private const string FileNameAntarctica = "antarctica-latest.osm";
private const string FileNameOSPbf = "OSPbf";
private const Boolean useRegisterSource = false;
private static KeyValuePair<string, string> KeyValuePair = new KeyValuePair<string, string>("joep", "monita");
static void Main(string[] args)
{
Console.WriteLine("Hello World!");
//string fileName = $#"{Path}\{FileNameAntarctica}.pbf";
string fileName = $#"{Path}\{FileNameOSPbf}.pbf";
string newFileName = $"{fileName.Replace(".pbf", string.Empty)}-{Guid.NewGuid().ToString().Substring(0, 4)}.pbf";
Console.WriteLine("*** Complete");
string fileNameOutput = CompleteFlow(fileName, newFileName);
Console.WriteLine("");
Console.WriteLine("*** Display");
DisplayFlow(fileNameOutput);
Console.ReadLine();
}
private static string CompleteFlow(string fileName, string newFileName)
{
// 1. Open file and convert to bytes
byte[] fileBytes = FileToBytes(fileName);
// 2. Bytes to OSM stream source (pbf)
PBFOsmStreamSource osmStreamSource;
osmStreamSource = BytesToOsmStreamSource(fileBytes);
osmStreamSource.MoveNext();
if (osmStreamSource.Current() == null)
{
osmStreamSource = FileToOsmStreamSource(fileName);
osmStreamSource.MoveNext();
if (osmStreamSource.Current() == null)
{
throw new Exception("No current in stream.");
}
}
// 3. Add custom tag
AddTag(osmStreamSource);
// 4. OSM stream source to bytes
//byte[] osmStreamSourceBytes = OsmStreamSourceToBytes(osmStreamSource);
// 5. Bytes to file
//string fileNameOutput = BytesToFile(osmStreamSourceBytes, newFileName);
OsmStreamSourceToFile(osmStreamSource, newFileName);
Console.WriteLine(newFileName);
return newFileName;
}
private static void DisplayFlow(string fileName)
{
// 1. Open file and convert to bytes
byte[] fileBytes = FileToBytes(fileName);
// 2. Bytes to OSM stream source (pbf)
BytesToOsmStreamSource(fileBytes);
}
private static byte[] FileToBytes(string fileName)
{
Console.WriteLine(fileName);
return File.ReadAllBytes(fileName);
}
private static PBFOsmStreamSource BytesToOsmStreamSource(byte[] bytes)
{
MemoryStream memoryStream = new MemoryStream(bytes);
memoryStream.Position = 0;
PBFOsmStreamSource osmStreamSource = new PBFOsmStreamSource(memoryStream);
foreach (OsmGeo element in osmStreamSource.Where(osmGeo => osmGeo.Tags.Any(tag => tag.Key.StartsWith(KeyValuePair.Key))))
{
foreach (Tag elementTag in element.Tags.Where(tag => tag.Key.StartsWith(KeyValuePair.Key)))
{
Console.WriteLine("!!!!!!!!!!!!!! Tag found while reading !!!!!!!!!!!!!!!!!!".ToUpper());
}
}
return osmStreamSource;
}
private static PBFOsmStreamSource FileToOsmStreamSource(string fileName)
{
using (FileStream fileStream = new FileInfo(fileName).OpenRead())
{
PBFOsmStreamSource osmStreamSource = new PBFOsmStreamSource(fileStream);
return osmStreamSource;
}
}
private static void AddTag(PBFOsmStreamSource osmStreamSource)
{
osmStreamSource.Reset();
OsmGeo osmGeo = null;
while (osmGeo == null)
{
osmStreamSource.MoveNext();
osmGeo = osmStreamSource.Current();
if(osmGeo?.Tags == null)
{
osmGeo = null;
}
}
osmGeo.Tags.Add("joep", "monita");
Console.WriteLine($"{osmGeo.Tags.FirstOrDefault(tag => tag.Key.StartsWith(KeyValuePair.Key)).Key} - {osmGeo.Tags.FirstOrDefault(tag => tag.Key.StartsWith(KeyValuePair.Key)).Value}");
}
private static byte[] OsmStreamSourceToBytes(PBFOsmStreamSource osmStreamSource)
{
MemoryStream memoryStream = new MemoryStream();
PBFOsmStreamTarget target = new PBFOsmStreamTarget(memoryStream, true);
osmStreamSource.Reset();
target.Initialize();
UpdateTarget(osmStreamSource, target);
target.Flush();
target.Close();
return memoryStream.ToArray();
}
private static string BytesToFile(byte[] bytes, string fileName)
{
using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write))
{
fs.Write(bytes, 0, bytes.Length);
}
return fileName;
}
private static void OsmStreamSourceToFile(PBFOsmStreamSource osmStreamSource, string fileName)
{
using (FileStream fileStream = new FileInfo(fileName).OpenWrite())
{
PBFOsmStreamTarget target = new PBFOsmStreamTarget(fileStream, true);
osmStreamSource.Reset();
target.Initialize();
UpdateTarget(osmStreamSource, target);
target.Flush();
target.Close();
}
}
private static void UpdateTarget(OsmStreamSource osmStreamSource, OsmStreamTarget osmStreamTarget)
{
if (useRegisterSource)
{
osmStreamTarget.RegisterSource(osmStreamSource, osmGeo => true);
osmStreamTarget.Pull();
}
else
{
bool isFirst = true;
foreach (OsmGeo osmGeo in osmStreamSource)
{
Tag? tag = osmGeo.Tags?.FirstOrDefault(t => t.Key == KeyValuePair.Key);
switch (osmGeo.Type)
{
case OsmGeoType.Node:
if (isFirst)
{
for (int indexer = 0; indexer < 1; indexer++)
{
(osmGeo as Node).Tags.Add(new Tag(KeyValuePair.Key + Guid.NewGuid(), KeyValuePair.Value));
}
isFirst = false;
}
osmStreamTarget.AddNode(osmGeo as Node);
break;
case OsmGeoType.Way:
osmStreamTarget.AddWay(osmGeo as Way);
break;
case OsmGeoType.Relation:
osmStreamTarget.AddRelation(osmGeo as Relation);
break;
default:
throw new ArgumentOutOfRangeException();
}
}
}
}
}
}
Already I posted this question on the GITHube page of OSMSharp as is linked here. Any help would be very appreciated.

Related

Export Metadata from a Custom Tab from file properties for a list of files in sub folders

The Problem:
On my Windows file server I have around 1,000,000 SolidWorks files that have metadata in the file properties under a custom tab (see image below) that I would like to export to a single CSV file.
These files are located in sub-tree of folders, and mixed with other file types.
The Solution:
A script is needed to only target specific file types (extensions) in a sub-tree of folders, that exports the metadata from the custom tab to a CSV File, where i can then clean up and import the data into an SQL database.
I am not sure the best way to achieve this I was thinking along the lines of PowerShell, any help to get going would be greatly appreciated.
If you use a .NET language like C# or Visual Basic, you can use the SolidWorks API and the Document Manager libraries (you'll need to get a license, free with your subscription from the Customer Portal) to extract this information without opening the files. It's quite fast. As for only looking at certain files, that's straight forward with .NET IO.Path.GetExtension.
Below is a working example of what I think you're looking for.
You will need the Document Manager dll, which is on the SolidWorks API SDK, which is included in the installation media. Then you can referencene SolidWorks.Interop.swdocumentmgr.dll.
You will also need a Document Manager Serial Number which you can request through the SolidWorks Customer Portal free of charge with your SolidWorks Subscription. Once you have that number, replace the lic string value below with the entire serial number, in quotes.
To define which Custom Properties to read from the SolidWorks files, just change the list propertiesToRead to include any values you need to retrieve. These are NOT case sensitive.
You you run this, you will be prompted to enter a directory path. This is also where the Output.csv file will be created.
At the bottom is a screen-shot of sample results.
using SolidWorks.Interop.swdocumentmgr;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace WriteProperties
{
class Program
{
static ISwDMApplication4 docManager;
static List<string> propertiesToRead = new List<string>() { "Number", "Description", "Revision", "Material", "Finish", "Weight" };
const string lic = "YOU CAN GET THIS NUMBER FROM THE CUSTOMER PORTAL WITH YOUR SOLIDWORKS SUBSCRIPTION AT NO COST";
static char[] charactersToQuote = { ',', '"', '\n' };
const string QUOTE = "\"";
const string QUOTEFORMATTED = "\"\"";
static void Main(string[] args)
{
string directoryPath = GetDirectory(args);
if (string.IsNullOrEmpty(directoryPath)) return;
if (!LoadDocManager()) return;
string outputPath = Path.Combine(directoryPath, "Output.csv");
StringBuilder sb = new StringBuilder();
sb.AppendLine("File Name," + string.Join(",", propertiesToRead));
int counter = 0;
foreach (string filePath in Directory.EnumerateFiles(directoryPath, "*.sld*", SearchOption.AllDirectories))
{
SwDMDocument21 dmDocument = GetDocument(filePath);
if (dmDocument == null) continue;
WriteProperties(sb, dmDocument, filePath);
counter++;
}
File.WriteAllText(outputPath, sb.ToString());
Console.WriteLine("{0} files read and saved to {1}", counter, outputPath);
Console.ReadLine();
}
static string GetDirectory(string[] args)
{
if (args != null && args.Count() > 0 && Directory.Exists(args[0])) return args[0];
Console.WriteLine("Directory to read:");
string filePath = Console.ReadLine();
if (Directory.Exists(filePath)) return filePath;
Console.WriteLine("Directory does not exists: {0}", filePath);
return string.Empty;
}
static bool LoadDocManager()
{
if (docManager != null) return true;
try
{
SwDMClassFactory factory = new SwDMClassFactory();
if (factory == null) throw new NullReferenceException(nameof(SwDMClassFactory));
docManager = (SwDMApplication4)factory.GetApplication(lic);
if (docManager == null) throw new NullReferenceException(nameof(SwDMApplication4));
return true;
}
catch (Exception ex)
{
Console.WriteLine("Document Manager failed to load: {0}", ex.Message);
Console.ReadLine();
return false;
}
}
static SwDMDocument21 GetDocument(string filePath)
{
SwDmDocumentType documentType = GetDocType(filePath);
if (documentType == SwDmDocumentType.swDmDocumentUnknown) return null;
SwDmDocumentOpenError result = SwDmDocumentOpenError.swDmDocumentOpenErrorNone;
SwDMDocument21 dmDocument = (SwDMDocument21)docManager.GetDocument(filePath, documentType, true, out result);
if (result == SwDmDocumentOpenError.swDmDocumentOpenErrorNone || result == SwDmDocumentOpenError.swDmDocumentOpenErrorFileReadOnly) return dmDocument;
if (dmDocument != null) dmDocument.CloseDoc();
return null;
}
static SwDmDocumentType GetDocType(string filePath)
{
if (filePath.Contains("~$")) return SwDmDocumentType.swDmDocumentUnknown;
switch (Path.GetExtension(filePath).ToLower())
{
case ".sldprt": return SwDmDocumentType.swDmDocumentPart;
case ".sldasm": return SwDmDocumentType.swDmDocumentAssembly;
case ".slddrw": return SwDmDocumentType.swDmDocumentDrawing;
default: return SwDmDocumentType.swDmDocumentUnknown;
}
}
static void WriteProperties(StringBuilder sb, SwDMDocument21 dmDocument, string filePath)
{
Console.WriteLine("Reading {0}", filePath);
List<string> propertiesInFile = new List<string>();
if (dmDocument.GetCustomPropertyCount() > 0) propertiesInFile.AddRange(dmDocument.GetCustomPropertyNames());
string csvLine = filePath;
foreach (string property in propertiesToRead)
{
string propertyValue = "";
if (propertiesInFile.Any(s => string.Compare(s, property, true) == 0))
{
SwDmCustomInfoType propertyType = SwDmCustomInfoType.swDmCustomInfoText;
string resolvedValue;
propertyValue = dmDocument.GetCustomPropertyValues(property, out propertyType, out resolvedValue);
}
csvLine = csvLine + "," + FixChars(propertyValue);
}
sb.AppendLine(csvLine);
dmDocument.CloseDoc();
}
static string FixChars(string s)
{
if (s.Contains(QUOTE)) s = s.Replace(QUOTE, QUOTEFORMATTED);
if (s.IndexOfAny(charactersToQuote) > -1) s = QUOTE + s + QUOTE;
return s;
}
}
}
Here is a sample output

Manipulate paths, color etc. in iText

I need to analyze path data of PDF files and manipulate content with iText 7. Manipulations include deletion/replacemant and coloring.
I can analyze the graphics alright with something like the following code:
public class ContentParsing {
public static void main(String[] args) throws IOException {
new ContentParsing().inspectPdf("testdata/test.pdf");
}
public void inspectPdf(String path) throws IOException {
File file = new File(path);
PdfDocument pdf = new PdfDocument(new PdfReader(file.getAbsolutePath()));
PdfDocumentContentParser parser = new PdfDocumentContentParser(pdf);
for (int i=1; i<=pdf.getNumberOfPages(); i++) {
parser.processContent(i, new PathEventListener());
}
pdf.close();
}
}
public class PathEventListener implements IEventListener {
public void eventOccurred(IEventData eventData, EventType eventType) {
PathRenderInfo pathRenderInfo = (PathRenderInfo) eventData;
for ( Subpath subpath : pathRenderInfo.getPath().getSubpaths() ) {
for ( IShape segment : subpath.getSegments() ) {
// Here goes some path analysis code
System.out.println(segment.getBasePoints());
}
}
}
public Set<EventType> getSupportedEvents() {
Set<EventType> supportedEvents = new HashSet<EventType>();
supportedEvents.add(EventType.RENDER_PATH);
return supportedEvents;
}
}
Now, what's the way to go with manipulating things and writing them back to the PDF? Do I have to construct an entirely new PDF document and copy everything over (in manipulated form), or can I somehow manipulate the read PDF data directly?
Now, what's the way to go with manipulating things and writing them back to the PDF? Do I have to construct an entirely new PDF document and copy everything over (in manipulated form), or can I somehow manipulate the read PDF data directly?
In essence you are looking for a class which is not merely parsing a PDF content stream and signaling the instructions in it like the PdfCanvasProcessor (the PdfDocumentContentParser you use is merely a very thin wrapper for PdfCanvasProcessor) but which also creates the content stream anew with the instructions you forward back to it.
A generic content stream editor class
For iText 5.5.x a proof-of-concept for such a content stream editor class can be found in this answer (the Java version is a bit further down in the answer text).
This is a port of that proof-of-concept to iText 7:
public class PdfCanvasEditor extends PdfCanvasProcessor
{
/**
* This method edits the immediate contents of a page, i.e. its content stream.
* It explicitly does not descent into form xobjects, patterns, or annotations.
*/
public void editPage(PdfDocument pdfDocument, int pageNumber) throws IOException
{
if ((pdfDocument.getReader() == null) || (pdfDocument.getWriter() == null))
{
throw new PdfException("PdfDocument must be opened in stamping mode.");
}
PdfPage page = pdfDocument.getPage(pageNumber);
PdfResources pdfResources = page.getResources();
PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);
editContent(page.getContentBytes(), pdfResources, pdfCanvas);
page.put(PdfName.Contents, pdfCanvas.getContentStream());
}
/**
* This method processes the content bytes and outputs to the given canvas.
* It explicitly does not descent into form xobjects, patterns, or annotations.
*/
public void editContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
{
this.canvas = canvas;
processContent(contentBytes, resources);
this.canvas = null;
}
/**
* <p>
* This method writes content stream operations to the target canvas. The default
* implementation writes them as they come, so it essentially generates identical
* copies of the original instructions the {#link ContentOperatorWrapper} instances
* forward to it.
* </p>
* <p>
* Override this method to achieve some fancy editing effect.
* </p>
*/
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
PdfOutputStream pdfOutputStream = canvas.getContentStream().getOutputStream();
int index = 0;
for (PdfObject object : operands)
{
pdfOutputStream.write(object);
if (operands.size() > ++index)
pdfOutputStream.writeSpace();
else
pdfOutputStream.writeNewLine();
}
}
//
// constructor giving the parent a dummy listener to talk to
//
public PdfCanvasEditor()
{
super(new DummyEventListener());
}
//
// Overrides of PdfContentStreamProcessor methods
//
#Override
public IContentOperator registerContentOperator(String operatorString, IContentOperator operator)
{
ContentOperatorWrapper wrapper = new ContentOperatorWrapper();
wrapper.setOriginalOperator(operator);
IContentOperator formerOperator = super.registerContentOperator(operatorString, wrapper);
return formerOperator instanceof ContentOperatorWrapper ? ((ContentOperatorWrapper)formerOperator).getOriginalOperator() : formerOperator;
}
//
// members holding the output canvas and the resources
//
protected PdfCanvas canvas = null;
//
// A content operator class to wrap all content operators to forward the invocation to the editor
//
class ContentOperatorWrapper implements IContentOperator
{
public IContentOperator getOriginalOperator()
{
return originalOperator;
}
public void setOriginalOperator(IContentOperator originalOperator)
{
this.originalOperator = originalOperator;
}
#Override
public void invoke(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
if (originalOperator != null && !"Do".equals(operator.toString()))
{
originalOperator.invoke(processor, operator, operands);
}
write(processor, operator, operands);
}
private IContentOperator originalOperator = null;
}
//
// A dummy event listener to give to the underlying canvas processor to feed events to
//
static class DummyEventListener implements IEventListener
{
#Override
public void eventOccurred(IEventData data, EventType type)
{ }
#Override
public Set<EventType> getSupportedEvents()
{
return null;
}
}
}
(PdfCanvasEditor.java)
The explanations from the iText 5 answer still apply, the parsing framework has not changed much from iText 5.5.x to iText 7.0.x.
Usage examples
Unfortunately you wrote in very vague terms about how exactly you want to change the contents. Thus I simply ported some iText 5 samples which made use of the original iText 5 content stream editor class:
Watermark removal
These are ports of the use cases in this answer.
testRemoveBoldMTTextDocument
This example drops all text written in a font the name of which ends with "BoldMT":
try ( InputStream resource = getClass().getResourceAsStream("document.pdf");
PdfReader pdfReader = new PdfReader(resource);
OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-noBoldMTText.pdf"));
PdfWriter pdfWriter = new PdfWriter(result);
PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
{
PdfCanvasEditor editor = new PdfCanvasEditor()
{
#Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
if (getGraphicsState().getFont().getFontProgram().getFontNames().getFontName().endsWith("BoldMT"))
return;
}
super.write(processor, operator, operands);
}
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
editor.editPage(pdfDocument, i);
}
}
(EditPageContent.java test method testRemoveBoldMTTextDocument)
testRemoveBigTextDocument
This example drops all text written with a large font size:
try ( InputStream resource = getClass().getResourceAsStream("document.pdf");
PdfReader pdfReader = new PdfReader(resource);
OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-noBigText.pdf"));
PdfWriter pdfWriter = new PdfWriter(result);
PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
{
PdfCanvasEditor editor = new PdfCanvasEditor()
{
#Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
if (getGraphicsState().getFontSize() > 100)
return;
}
super.write(processor, operator, operands);
}
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
editor.editPage(pdfDocument, i);
}
}
(EditPageContent.java test method testRemoveBigTextDocument)
Text color change
This is a port of the use case in this answer.
testChangeBlackTextToGreenDocument
This example changes the color of black text to green.
try ( InputStream resource = getClass().getResourceAsStream("document.pdf");
PdfReader pdfReader = new PdfReader(resource);
OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-blackTextToGreen.pdf"));
PdfWriter pdfWriter = new PdfWriter(result);
PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
{
PdfCanvasEditor editor = new PdfCanvasEditor()
{
#Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
if (currentlyReplacedBlack == null)
{
Color currentFillColor = getGraphicsState().getFillColor();
if (Color.BLACK.equals(currentFillColor))
{
currentlyReplacedBlack = currentFillColor;
super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(1), new PdfNumber(0), new PdfLiteral("rg")));
}
}
}
else if (currentlyReplacedBlack != null)
{
if (currentlyReplacedBlack instanceof DeviceCmyk)
{
super.write(processor, new PdfLiteral("k"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfNumber(1), new PdfLiteral("k")));
}
else if (currentlyReplacedBlack instanceof DeviceGray)
{
super.write(processor, new PdfLiteral("g"), Arrays.asList(new PdfNumber(0), new PdfLiteral("g")));
}
else
{
super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfLiteral("rg")));
}
currentlyReplacedBlack = null;
}
super.write(processor, operator, operands);
}
Color currentlyReplacedBlack = null;
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
editor.editPage(pdfDocument, i);
}
}
(EditPageContent.java test method testChangeBlackTextToGreenDocument)

Creating custom plugin for chinese tokenization

I'm working towards properly integrating the stanford segmenter within SOLR for chinese tokenization.
This plugin involves loading other jar files and model files. I've got it working in a crude manner by hardcoding the complete path for the files.
I'm looking for methods to create the plugin where the paths need not be hardcoded and also to have the plugin in conformance with the SOLR plugin architecture. Please let me know if there are any recommended sites or tutorials for this.
I've added my code below :
public class ChineseTokenizerFactory extends TokenizerFactory {
/** Creates a new WhitespaceTokenizerFactory */
public ChineseTokenizerFactory(Map<String,String> args) {
super(args);
assureMatchVersion();
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
#Override
public ChineseTokenizer create(AttributeFactory factory, Reader input) {
Reader processedStringReader = new ProcessedStringReader(input);
return new ChineseTokenizer(luceneMatchVersion, factory, processedStringReader);
}
}
public class ProcessedStringReader extends java.io.Reader {
private static final int BUFFER_SIZE = 1024 * 8;
//private static TextProcess m_textProcess = null;
private static final String basedir = "/home/praveen/PDS_Meetup/solr-4.9.0/custom_plugins/";
static Properties props = null;
static CRFClassifier<CoreLabel> segmenter = null;
private char[] m_inputData = null;
private int m_offset = 0;
private int m_length = 0;
public ProcessedStringReader(Reader input){
char[] arr = new char[BUFFER_SIZE];
StringBuffer buf = new StringBuffer();
int numChars;
if(segmenter == null)
{
segmenter = new CRFClassifier<CoreLabel>(getProperties());
segmenter.loadClassifierNoExceptions(basedir + "ctb.gz", getProperties());
}
try {
while ((numChars = input.read(arr, 0, arr.length)) > 0) {
buf.append(arr, 0, numChars);
}
} catch (IOException e) {
e.printStackTrace();
}
m_inputData = processText(buf.toString()).toCharArray();
m_offset = 0;
m_length = m_inputData.length;
}
#Override
public int read(char[] cbuf, int off, int len) throws IOException {
int charNumber = 0;
for(int i = m_offset + off;i<m_length && charNumber< len; i++){
cbuf[charNumber] = m_inputData[i];
m_offset ++;
charNumber++;
}
if(charNumber == 0){
return -1;
}
return charNumber;
}
#Override
public void close() throws IOException {
m_inputData = null;
m_offset = 0;
m_length = 0;
}
public String processText(String inputText)
{
List<String> segmented = segmenter.segmentString(inputText);
String output = "";
if(segmented.size() > 0)
{
output = segmented.get(0);
for(int i=1;i<segmented.size();i++)
{
output = output + " " +segmented.get(i);
}
}
System.out.println(output);
return output;
}
static Properties getProperties()
{
if (props == null) {
props = new Properties();
props.setProperty("sighanCorporaDict", basedir);
// props.setProperty("NormalizationTable", "data/norm.simp.utf8");
// props.setProperty("normTableEncoding", "UTF-8");
// below is needed because CTBSegDocumentIteratorFactory accesses it
props.setProperty("serDictionary",basedir+"dict-chris6.ser.gz");
props.setProperty("inputEncoding", "UTF-8");
props.setProperty("sighanPostProcessing", "true");
}
return props;
}
}
public final class ChineseTokenizer extends CharTokenizer {
public ChineseTokenizer(Version matchVersion, Reader in) {
super(matchVersion, in);
}
public ChineseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
super(matchVersion, factory, in);
}
/** Collects only characters which do not satisfy
* {#link Character#isWhitespace(int)}.*/
#Override
protected boolean isTokenChar(int c) {
return !Character.isWhitespace(c);
}
}
You can pass the argument through the Factory's args parameter.

How do I convert text files to .arff format(weka)

Please advise me How do I convert text files to .arff format(weka)
because i wan to do data clustering for 1000 txt file.
regards
There are some converters implemented in WEKA, just find the right format or make little changes to your data (using awk, sed...).
Here is the API pages related to this topic: http://weka.sourceforge.net/doc.stable/weka/core/converters/package-summary.html
For exapmle here is how to convert from CSV to ARFF:
java weka.core.converters.CSVLoader filename.csv > filename.arff
Here is the code you can use
package text.Classification;
import java.io.*;
import weka.core.*;
public class TextDirectoryToArff {
public Instances createDataset(String directoryPath) throws Exception {
FastVector atts;
FastVector attVals;
atts = new FastVector();
atts.addElement(new Attribute("contents", (FastVector) null));
String[] s = { "class1", "class2", "class3" };
attVals = new FastVector();
for (String p : s)
attVals.addElement(p);
atts.addElement(new Attribute("class", attVals));
Instances data = new Instances("MyRelation", atts, 0);
System.out.println(data);
InputStreamReader is = null;
File dir = new File(directoryPath);
String[] files = dir.list();
for (int i = 0; i < files.length; i++) {
if (files[i].endsWith(".txt")) {
double[] newInst = new double[2];
File txt = new File(directoryPath + File.separator + files[i]);
is = new InputStreamReader(new FileInputStream(txt));
StringBuffer txtStr = new StringBuffer();
int c;
while ((c = is.read()) != -1) {
txtStr.append((char) c);
}
newInst[0] = data.attribute(0).addStringValue(txtStr.toString());
int j=i%(s.length-1);
newInst[1] = attVals.indexOf(s[j]);
data.add(new Instance(1.0, newInst));
}
}
return data;
}
public static void main(String[] args) {
TextDirectoryToArff tdta = new TextDirectoryToArff();
try {
Instances dataset = tdta.createDataset("/home/asadul/Desktop/Downloads/text_example/class5");
PrintWriter fileWriter = new PrintWriter("/home/asadul/Desktop/Downloads/text_example/abc.arff", "UTF-8");
fileWriter.println(dataset);
fileWriter.close();
} catch (Exception e) {
System.err.println(e.getMessage());
e.printStackTrace();
}
}
}

Can I drag items from Outlook into my SWT application?

Background
Our Eclipse RCP 3.6-based application lets people drag files in for storage/processing. This works fine when the files are dragged from a filesystem, but not when people drag items (messages or attachments) directly from Outlook.
This appears to be because Outlook wants to feed our application the files via a FileGroupDescriptorW and FileContents, but SWT only includes a FileTransfer type. (In a FileTransfer, only the file paths are passed, with the assumption that the receiver can locate and read them. The FileGroupDescriptorW/FileContents approach can supply files directly application-to-application without writing temporary files out to disk.)
We have tried to produce a ByteArrayTransfer subclass that could accept FileGroupDescriptorW and FileContents. Based on some examples on the Web, we were able to receive and parse the FileGroupDescriptorW, which (as the name implies) describes the files available for transfer. (See code sketch below.) But we have been unable to accept the FileContents.
This seems to be because Outlook offers the FileContents data only as TYMED_ISTREAM or TYMED_ISTORAGE, but SWT only understands how to exchange data as TYMED_HGLOBAL. Of those, it appears that TYMED_ISTORAGE would be preferable, since it's not clear how TYMED_ISTREAM could provide access to multiple files' contents.
(We also have some concerns about SWT's desire to pick and convert only a single TransferData type, given that we need to process two, but we think we could probably hack around that in Java somehow: it seems that all the TransferDatas are available at other points of the process.)
Questions
Are we on the right track here? Has anyone managed to accept FileContents in SWT yet? Is there any chance that we could process the TYMED_ISTORAGE data without leaving Java (even if by creating a fragment-based patch to, or a derived version of, SWT), or would we have to build some new native support code too?
Relevant code snippets
Sketch code that extracts file names:
// THIS IS NOT PRODUCTION-QUALITY CODE - FOR ILLUSTRATION ONLY
final Transfer transfer = new ByteArrayTransfer() {
private final String[] typeNames = new String[] { "FileGroupDescriptorW", "FileContents" };
private final int[] typeIds = new int[] { registerType(typeNames[0]), registerType(typeNames[1]) };
#Override
protected String[] getTypeNames() {
return typeNames;
}
#Override
protected int[] getTypeIds() {
return typeIds;
}
#Override
protected Object nativeToJava(TransferData transferData) {
if (!isSupportedType(transferData))
return null;
final byte[] buffer = (byte[]) super.nativeToJava(transferData);
if (buffer == null)
return null;
try {
final DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer));
long count = 0;
for (int i = 0; i < 4; i++) {
count += in.readUnsignedByte() << i;
}
for (int i = 0; i < count; i++) {
final byte[] filenameBytes = new byte[260 * 2];
in.skipBytes(72); // probable architecture assumption(s) - may be wrong outside standard 32-bit Win XP
in.read(filenameBytes);
final String fileNameIncludingTrailingNulls = new String(filenameBytes, "UTF-16LE");
int stringLength = fileNameIncludingTrailingNulls.indexOf('\0');
if (stringLength == -1)
stringLength = 260;
final String fileName = fileNameIncludingTrailingNulls.substring(0, stringLength);
System.out.println("File " + i + ": " + fileName);
}
in.close();
return buffer;
}
catch (final Exception e) {
return null;
}
}
};
In the debugger, we see that ByteArrayTransfer's isSupportedType() ultimately returns false for the FileContents because the following test is not passed (since its tymed is TYMED_ISTREAM | TYMED_ISTORAGE):
if (format.cfFormat == types[i] &&
(format.dwAspect & COM.DVASPECT_CONTENT) == COM.DVASPECT_CONTENT &&
(format.tymed & COM.TYMED_HGLOBAL) == COM.TYMED_HGLOBAL )
return true;
This excerpt from org.eclipse.swt.internal.ole.win32.COM leaves us feeling less hope for an easy solution:
public static final int TYMED_HGLOBAL = 1;
//public static final int TYMED_ISTORAGE = 8;
//public static final int TYMED_ISTREAM = 4;
Thanks.
even if
//public static final int TYMED_ISTREAM = 4;
Try below code.. it should work
package com.nagarro.jsag.poc.swtdrag;
imports ...
public class MyTransfer extends ByteArrayTransfer {
private static int BYTES_COUNT = 592;
private static int SKIP_BYTES = 72;
private final String[] typeNames = new String[] { "FileGroupDescriptorW", "FileContents" };
private final int[] typeIds = new int[] { registerType(typeNames[0]), registerType(typeNames[1]) };
#Override
protected String[] getTypeNames() {
return typeNames;
}
#Override
protected int[] getTypeIds() {
return typeIds;
}
#Override
protected Object nativeToJava(TransferData transferData) {
String[] result = null;
if (!isSupportedType(transferData) || transferData.pIDataObject == 0)
return null;
IDataObject data = new IDataObject(transferData.pIDataObject);
data.AddRef();
// Check for descriptor format type
try {
FORMATETC formatetcFD = transferData.formatetc;
STGMEDIUM stgmediumFD = new STGMEDIUM();
stgmediumFD.tymed = COM.TYMED_HGLOBAL;
transferData.result = data.GetData(formatetcFD, stgmediumFD);
if (transferData.result == COM.S_OK) {
// Check for contents format type
long hMem = stgmediumFD.unionField;
long fileDiscriptorPtr = OS.GlobalLock(hMem);
int[] fileCount = new int[1];
try {
OS.MoveMemory(fileCount, fileDiscriptorPtr, 4);
fileDiscriptorPtr += 4;
result = new String[fileCount[0]];
for (int i = 0; i < fileCount[0]; i++) {
String fileName = handleFile(fileDiscriptorPtr, data);
System.out.println("FileName : = " + fileName);
result[i] = fileName;
fileDiscriptorPtr += BYTES_COUNT;
}
} catch (Exception e) {
e.printStackTrace();
} finally {
OS.GlobalFree(hMem);
}
}
} finally {
data.Release();
}
return result;
}
private String handleFile(long fileDiscriptorPtr, IDataObject data) throws Exception {
// GetFileName
char[] fileNameChars = new char[OS.MAX_PATH];
byte[] fileNameBytes = new byte[OS.MAX_PATH];
COM.MoveMemory(fileNameBytes, fileDiscriptorPtr, BYTES_COUNT);
// Skip some bytes.
fileNameBytes = Arrays.copyOfRange(fileNameBytes, SKIP_BYTES, fileNameBytes.length);
String fileNameIncludingTrailingNulls = new String(fileNameBytes, "UTF-16LE");
fileNameChars = fileNameIncludingTrailingNulls.toCharArray();
StringBuilder builder = new StringBuilder(OS.MAX_PATH);
for (int i = 0; fileNameChars[i] != 0 && i < fileNameChars.length; i++) {
builder.append(fileNameChars[i]);
}
String name = builder.toString();
try {
File file = saveFileContent(name, data);
if (file != null) {
System.out.println("File Saved # " + file.getAbsolutePath());
;
}
} catch (IOException e) {
System.out.println("Count not save file content");
;
}
return name;
}
private File saveFileContent(String fileName, IDataObject data) throws IOException {
File file = null;
FORMATETC formatetc = new FORMATETC();
formatetc.cfFormat = typeIds[1];
formatetc.dwAspect = COM.DVASPECT_CONTENT;
formatetc.lindex = 0;
formatetc.tymed = 4; // content.
STGMEDIUM stgmedium = new STGMEDIUM();
stgmedium.tymed = 4;
if (data.GetData(formatetc, stgmedium) == COM.S_OK) {
file = new File(fileName);
IStream iStream = new IStream(stgmedium.unionField);
iStream.AddRef();
try (FileOutputStream outputStream = new FileOutputStream(file)) {
int increment = 1024 * 4;
long pv = COM.CoTaskMemAlloc(increment);
int[] pcbWritten = new int[1];
while (iStream.Read(pv, increment, pcbWritten) == COM.S_OK && pcbWritten[0] > 0) {
byte[] buffer = new byte[pcbWritten[0]];
OS.MoveMemory(buffer, pv, pcbWritten[0]);
outputStream.write(buffer);
}
COM.CoTaskMemFree(pv);
} finally {
iStream.Release();
}
return file;
} else {
return null;
}
}
}
Have you looked at https://bugs.eclipse.org/bugs/show_bug.cgi?id=132514 ?
Attached to this bugzilla entry is an patch (against an rather old version of SWT) that might be of interest.
I had the same problem and created a small library providing a Drag'n Drop Transfer Class for JAVA SWT. It can be found here:
https://github.com/HendrikHoetker/OutlookItemTransfer
Currently it supports dropping Mail Items from Outlook to your Java SWT application and will provide a list of OutlookItems with the Filename and a byte array of the file contents.
All is pure Java and in-memory (no temp files).
Usage in your SWT java application:
if (OutlookItemTransfer.getInstance().isSupportedType(event.currentDataType)) {
Object o = OutlookItemTransfer.getInstance().nativeToJava(event.currentDataType);
if (o != null && o instanceof OutlookMessage[]) {
OutlookMessage[] outlookMessages = (OutlookMessage[])o;
for (OutlookMessage msg: outlookMessages) {
//...
}
}
}
The OutlookItem will then provide two elements: filename as String and file contents as array of byte.
From here on, one could write it to a file or further process the byte array.
To your question above:
- What you find in the file descriptor is the filename of the outlook item and a pointer to an IDataObject
- the IDataObject can be parsed and will provide an IStorage object
- The IStorageObject will be then a root container providing further sub-IStorageObjects or IStreams similar to a filesystem (directory = IStorage, file = IStream
You find those elements in the following lines of code:
Get File Contents, see OutlookItemTransfer.java, method nativeToJava:
FORMATETC format = new FORMATETC();
format.cfFormat = getTypeIds()[1];
format.dwAspect = COM.DVASPECT_CONTENT;
format.lindex = <fileIndex>;
format.ptd = 0;
format.tymed = TYMED_ISTORAGE | TYMED_ISTREAM | COM.TYMED_HGLOBAL;
STGMEDIUM medium = new STGMEDIUM();
if (data.GetData(format, medium) == COM.S_OK) {
// medium.tymed will now contain TYMED_ISTORAGE
// in medium.unionfield you will find the root IStorage
}
Read the root IStorage, see CompoundStorage, method readOutlookStorage:
// open IStorage object
IStorage storage = new IStorage(pIStorage);
storage.AddRef();
// walk through the content of the IStorage object
long[] pEnumStorage = new long[1];
if (storage.EnumElements(0, 0, 0, pEnumStorage) == COM.S_OK) {
// get storage iterator
IEnumSTATSTG enumStorage = new IEnumSTATSTG(pEnumStorage[0]);
enumStorage.AddRef();
enumStorage.Reset();
// prepare statstg structure which tells about the object found by the iterator
long pSTATSTG = OS.GlobalAlloc(OS.GMEM_FIXED | OS.GMEM_ZEROINIT, STATSTG.sizeof);
int[] fetched = new int[1];
while (enumStorage.Next(1, pSTATSTG, fetched) == COM.S_OK && fetched[0] == 1) {
// get the description of the the object found
STATSTG statstg = new STATSTG();
COM.MoveMemory(statstg, pSTATSTG, STATSTG.sizeof);
// get the name of the object found
String name = readPWCSName(statstg);
// depending on type of object
switch (statstg.type) {
case COM.STGTY_STREAM: { // load an IStream (=File)
long[] pIStream = new long[1];
// get the pointer to the IStream
if (storage.OpenStream(name, 0, COM.STGM_DIRECT | COM.STGM_READ | COM.STGM_SHARE_EXCLUSIVE, 0, pIStream) == COM.S_OK) {
// load the IStream
}
}
case COM.STGTY_STORAGE: { // load an IStorage (=SubDirectory) - requires recursion to traverse the sub dies
}
}
}
}
// close the iterator
enumStorage.Release();
}
// close the IStorage object
storage.Release();