Extract images using iTextSharp - itext

I have been using this code with great success to pull out the first image found in each page of a PDF. However, it is now not working with some new PDFs for an uknown reason. I have used other tools (Datalogics, etc) that do pull out the images fine with these new PDFs. However, I do not want to buy Datalogics or any tool if I can use iTextSharp. Can anybody tell me why this code is not finding the images in the PDF?
Knowns: my PDFs only have 1 image per page and nothing else.
using iTextSharp.text;
using iTextSharp.text.pdf;
...
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (PdfName.IMAGE.Equals(type))
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(#"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
break;
}
}
}
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
raf.Close();
}
}

I found that my problem was that I was not recursively searching inside of forms and groups for images. Basically, the original code would only find images that were embedded at the root of the pdf document. Here is the revised method plus a new method (FindImageInPDFDictionary) that recursively searches for images in the page. NOTE: the flaws of only supporting JPEG and non-compressed images still applies. See R Ubben's code for options to fix those flaws. HTH someone.
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
// recursively search pages, forms and groups for images.
PdfObject obj = FindImageInPDFDictionary(pg);
if (obj != null)
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(#"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
raf.Close();
}
}
private static PdfObject FindImageInPDFDictionary(PdfDictionary pg)
{
PdfDictionary res =
(PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj =
(PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type =
(PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
//image at the root of the pdf
if (PdfName.IMAGE.Equals(type))
{
return obj;
}// image inside a form
else if (PdfName.FORM.Equals(type))
{
return FindImageInPDFDictionary(tg);
} //image inside a group
else if (PdfName.GROUP.Equals(type))
{
return FindImageInPDFDictionary(tg);
}
}
}
}
return null;
}

Here is a simpler solution:
iTextSharp.text.pdf.parser.PdfImageObject pdfImage =
new iTextSharp.text.pdf.parser.PdfImageObject(imgPRStream);
System.Drawing.Image img = pdfImage.GetDrawingImage();

The following code incorporates all of Dave and R Ubben's ideas above, plus it returns a full list of all the images and also deals with multiple bit depths. I had to convert it to VB for the project I'm working on though, sorry about that...
Private Sub getAllImages(ByVal dict As pdf.PdfDictionary, ByVal images As List(Of Byte()), ByVal doc As pdf.PdfReader)
Dim res As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(dict.Get(pdf.PdfName.RESOURCES)), pdf.PdfDictionary)
Dim xobj As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(res.Get(pdf.PdfName.XOBJECT)), pdf.PdfDictionary)
If xobj IsNot Nothing Then
For Each name As pdf.PdfName In xobj.Keys
Dim obj As pdf.PdfObject = xobj.Get(name)
If (obj.IsIndirect) Then
Dim tg As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(obj), pdf.PdfDictionary)
Dim subtype As pdf.PdfName = CType(pdf.PdfReader.GetPdfObject(tg.Get(pdf.PdfName.SUBTYPE)), pdf.PdfName)
If pdf.PdfName.IMAGE.Equals(subtype) Then
Dim xrefIdx As Integer = CType(obj, pdf.PRIndirectReference).Number
Dim pdfObj As pdf.PdfObject = doc.GetPdfObject(xrefIdx)
Dim str As pdf.PdfStream = CType(pdfObj, pdf.PdfStream)
Dim bytes As Byte() = pdf.PdfReader.GetStreamBytesRaw(CType(str, pdf.PRStream))
Dim filter As String = tg.Get(pdf.PdfName.FILTER).ToString
Dim width As String = tg.Get(pdf.PdfName.WIDTH).ToString
Dim height As String = tg.Get(pdf.PdfName.HEIGHT).ToString
Dim bpp As String = tg.Get(pdf.PdfName.BITSPERCOMPONENT).ToString
If filter = "/FlateDecode" Then
bytes = pdf.PdfReader.FlateDecode(bytes, True)
Dim pixelFormat As System.Drawing.Imaging.PixelFormat
Select Case Integer.Parse(bpp)
Case 1
pixelFormat = Drawing.Imaging.PixelFormat.Format1bppIndexed
Case 24
pixelFormat = Drawing.Imaging.PixelFormat.Format24bppRgb
Case Else
Throw New Exception("Unknown pixel format " + bpp)
End Select
Dim bmp As New System.Drawing.Bitmap(Int32.Parse(width), Int32.Parse(height), pixelFormat)
Dim bmd As System.Drawing.Imaging.BitmapData = bmp.LockBits(New System.Drawing.Rectangle(0, 0, Int32.Parse(width), Int32.Parse(height)), System.Drawing.Imaging.ImageLockMode.WriteOnly, pixelFormat)
Marshal.Copy(bytes, 0, bmd.Scan0, bytes.Length)
bmp.UnlockBits(bmd)
Using ms As New MemoryStream
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png)
bytes = ms.GetBuffer
End Using
End If
images.Add(bytes)
ElseIf pdf.PdfName.FORM.Equals(subtype) Or pdf.PdfName.GROUP.Equals(subtype) Then
getAllImages(tg, images, doc)
End If
End If
Next
End If
End Sub

This is just another rehash of others' ideas, but the one that worked for me. Here I use #Malco's image grabbing snippet with R Ubben's looping:
private IList<System.Drawing.Image> GetImagesFromPdfDict(PdfDictionary dict, PdfReader doc)
{
List<System.Drawing.Image> images = new List<System.Drawing.Image>();
PdfDictionary res = (PdfDictionary)(PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES)));
PdfDictionary xobj = (PdfDictionary)(PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)(PdfReader.GetPdfObject(obj));
PdfName subtype = (PdfName)(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)));
if (PdfName.IMAGE.Equals(subtype))
{
int xrefIdx = ((PRIndirectReference)obj).Number;
PdfObject pdfObj = doc.GetPdfObject(xrefIdx);
PdfStream str = (PdfStream)(pdfObj);
iTextSharp.text.pdf.parser.PdfImageObject pdfImage =
new iTextSharp.text.pdf.parser.PdfImageObject((PRStream)str);
System.Drawing.Image img = pdfImage.GetDrawingImage();
images.Add(img);
}
else if (PdfName.FORM.Equals(subtype) || PdfName.GROUP.Equals(subtype))
{
images.AddRange(GetImagesFromPdfDict(tg, doc));
}
}
}
}
return images;
}

De c# version:
private IList<System.Drawing.Image> GetImagesFromPdfDict(PdfDictionary dict, PdfReader doc){
List<System.Drawing.Image> images = new List<System.Drawing.Image>();
PdfDictionary res = (PdfDictionary)(PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES)));
PdfDictionary xobj = (PdfDictionary)(PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)(PdfReader.GetPdfObject(obj));
pdf.PdfName subtype = (pdf.PdfName)(pdf.PdfReader.GetPdfObject(tg.Get(pdf.PdfName.SUBTYPE)));
if (pdf.PdfName.IMAGE.Equals(subtype))
{
int xrefIdx = ((pdf.PRIndirectReference)obj).Number;
pdf.PdfObject pdfObj = doc.GetPdfObject(xrefIdx);
pdf.PdfStream str = (pdf.PdfStream)(pdfObj);
byte[] bytes = pdf.PdfReader.GetStreamBytesRaw((pdf.PRStream)str);
string filter = tg.Get(pdf.PdfName.FILTER).ToString();
string width = tg.Get(pdf.PdfName.WIDTH).ToString();
string height = tg.Get(pdf.PdfName.HEIGHT).ToString();
string bpp = tg.Get(pdf.PdfName.BITSPERCOMPONENT).ToString();
if (filter == "/FlateDecode")
{
bytes = pdf.PdfReader.FlateDecode(bytes, true);
System.Drawing.Imaging.PixelFormat pixelFormat;
switch (int.Parse(bpp))
{
case 1:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format1bppIndexed;
break;
case 24:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bpp);
}
var bmp = new System.Drawing.Bitmap(Int32.Parse(width), Int32.Parse(height), pixelFormat);
System.Drawing.Imaging.BitmapData bmd = bmp.LockBits(new System.Drawing.Rectangle(0, 0, Int32.Parse(width),
Int32.Parse(height)), System.Drawing.Imaging.ImageLockMode.WriteOnly, pixelFormat);
Marshal.Copy(bytes, 0, bmd.Scan0, bytes.Length);
bmp.UnlockBits(bmd);
using (var ms = new MemoryStream())
{
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
bytes = ms.GetBuffer();
}
}
images.Add(System.Drawing.Image.FromStream(new MemoryStream(bytes)));
}
else if (pdf.PdfName.FORM.Equals(subtype) || pdf.PdfName.GROUP.Equals(subtype))
{
images.AddRange(GetImagesFromPdfDict(tg, doc));
}
}
}
}
return images;
}

The above will only work with JPEGs. Excluding inline images and embedded files, you need to go through the objects of subtype IMAGE, then look at the filter and take the appropriate action. Here's an example, assuming we have a PdfObject of subtype IMAGE:
PdfReader pdf = new PdfReader("c:\\temp\\exp0.pdf");
int xo=pdf.XrefSize;
for (int i=0;i<xo;i++)
{
PdfObject obj=pdf.GetPdfObject(i);
if (obj!=null && obj.IsStream())
{
PdfDictionary pd=(PdfDictionary)obj;
if (pd.Contains(PdfName.SUBTYPE) && pd.Get(PdfName.SUBTYPE).ToString()=="/Image")
{
string filter=pd.Get(PdfName.FILTER).ToString();
string width=pd.Get(PdfName.WIDTH).ToString();
string height=pd.Get(PdfName.HEIGHT).ToString();
string bpp=pd.Get(PdfName.BITSPERCOMPONENT).ToString();
string extent=".";
byte [] img=null;
switch (filter)
{
case "/FlateDecode":
byte[] arr=PdfReader.FlateDecode(PdfReader.GetStreamBytesRaw((PRStream)obj),true);
Bitmap bmp=new Bitmap(Int32.Parse(width),Int32.Parse(height),PixelFormat.Format24bppRgb);
BitmapData bmd=bmp.LockBits(new Rectangle(0,0,Int32.Parse(width),Int32.Parse(height)),ImageLockMode.WriteOnly,
PixelFormat.Format24bppRgb);
Marshal.Copy(arr,0,bmd.Scan0,arr.Length);
bmp.UnlockBits(bmd);
bmp.Save("c:\\temp\\bmp1.png",ImageFormat.Png);
break;
default:
break;
}
}
}
}
This will mess the color up because of the Microsoft BGR, of course, but I wanted to keep it short. Do something similar for "/CCITTFaxDecode", etc.

Related

How to replace text of Paragraph in Itext?

I want to set page number when I merge pdf files. The page number paragraph will defind by someone to custom style what he want. Now I can add text to the paragraph(like doc.add(paragraph.add(text))), but I can not replace it.
public static byte[] mergePdf(Map<String, PdfDocument> filesToMerge, Paragraph paragraph) {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
PdfDocument pdfDoc = new PdfDocument(new PdfWriter(baos));
Document doc = new Document(pdfDoc);
pdfDoc.initializeOutlines();
PdfPageFormCopier formCopier = new PdfPageFormCopier();
int page = 1;
for (Map.Entry<String, PdfDocument> entry : filesToMerge.entrySet()) {
String title = entry.getKey();
PdfDocument srcDoc = entry.getValue();
int numberOfPages = srcDoc.getNumberOfPages();
for (int i = 1; i <= numberOfPages; i++, page++) {
Text text = new Text(String.format("page %d", page));
srcDoc.copyPagesTo(i, i, pdfDoc, formCopier);
if (i == 1) {
text.setDestination("p" + page);
PdfOutline rootOutLine = pdfDoc.getOutlines(false);
PdfOutline outline = rootOutLine.addOutline(title);
outline.addDestination(PdfDestination.makeDestination(new PdfString("p" + page)));
}
// I want do like "doc.add(paragraph.set(text))";
// paragraph already have been set position,font,fontSize and so on. SO I dont want to "new Paragraph(text)"
doc.add(paragraph.add(text));
}
}
for (PdfDocument srcDoc : filesToMerge.values()) {
srcDoc.close();
}
doc.close();
return baos.toByteArray();
}
I am the questioner. At last I resolve the question do like this:
public class MyParagraph extends Paragraph {
public MyParagraph() {
}
public void setContent(Text text) {
List<IElement> children = this.getChildren();
if (!children.isEmpty()) {
children.clear();
}
children.add(text);
}
}

How to specify parameters in box-view API header?

I am using the following piece of C# code to upload, convert and download a .pptx file, via Box view API.
var boxViewID = "";
string boundary = "---------------------------" + DateTime.Now.Ticks.ToString("x");
byte[] boundarybytes = System.Text.Encoding.ASCII.GetBytes("\r\n--" + boundary + "\r\n");
String url_ = #"https://upload.view-api.box.com/1/documents";
HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url_);
wr.ContentType = "multipart/form-data; boundary=" + boundary;
wr.Headers.Add("Authorization:Token " + "MY_CODE"/*Configuration.BoxViewAPIKey*/);
wr.Method = "POST";
wr.KeepAlive = true;
wr.Credentials = System.Net.CredentialCache.DefaultCredentials;
wr.Timeout = 1000000;
wr.SendChunked = true;
DateTime start = DateTime.Now;
Exception exc = null;
Stream rs = wr.GetRequestStream();
try
{
rs.Write(boundarybytes, 0, boundarybytes.Length);
string headerTemplate = "Content-Disposition: form-data; name=\"{0}\"; non_svg=\"true\"; filename=\"{1}\"\r\nContent-Type: {2}\r\n\r\n";
string header = string.Format(headerTemplate,"file", file, contentType);
Console.WriteLine(header);
byte[] headerbytes = System.Text.Encoding.UTF8.GetBytes(header);
rs.Write(headerbytes, 0, headerbytes.Length);
FileStream fileStream = new FileStream(file, FileMode.Open, FileAccess.Read);
byte[] buffer = new byte[40960];
int bytesRead = 0;
int totalSent = 0;
int totalLength = (int)fileStream.Length;
while ((bytesRead = fileStream.Read(buffer, 0, buffer.Length)) != 0)
{
totalSent += bytesRead;
var percent = new decimal(100.0 * totalSent / totalLength);
if (progress != null)
{
progress("Box processing", percent);
}
rs.Write(buffer, 0, bytesRead);
}
fileStream.Close();
}
catch(Exception ex)
{
exc = ex;
}
DateTime end = DateTime.Now;
int seconds = (int)(end - start).TotalSeconds;
if(seconds>=0)
{
if(exc!=null)
{
throw exc;
}
}
byte[] trailer = System.Text.Encoding.ASCII.GetBytes("\r\n--" + boundary + "--\r\n");
rs.Write(trailer, 0, trailer.Length);
rs.Close();
WebResponse wresp = null;
try
{
wresp = wr.GetResponse();
Stream stream2 = wresp.GetResponseStream();
StreamReader reader2 = new StreamReader(stream2);
var res = reader2.ReadToEnd();
var docRes = Newtonsoft.Json.JsonConvert.DeserializeObject<Dictionary<string, string>>(res);
if (docRes["id"] != null)
boxViewID = docRes["id"];
}
catch (Exception ex)
{
if (wresp != null)
{
wresp.Close();
wresp = null;
}
}
finally
{
wr = null;
}
return boxViewID;
Specifying "non_svg" parameter should create .png images for every slide in the presentation (instead of .svg + .html pairs). However, the API seems to ignore this part of the request and I am always getting svg files.
Any idea on what am I doing wrong? Thanks!
The non_svg option causes PNG representations to be generated for each page, but the SVG representation is still generated. The viewer will only load the PNG files if SVG is not supported in the browser (basically only IE 8). Try changing page-1.svg to page-1.png in the browser (e.g., https://view-api.box.com/1/sessions/465c5d45caf04752a6113b0e5df593a5/assets/page-1.png vs https://view-api.box.com/1/sessions/465c5d45caf04752a6113b0e5df593a5/assets/page-1.svg). All of the assets will exist in content.zip if you use the documents content endpoint.

Copying fields in iTextSharp 5.4.5.0

I was under the impression that it is now possible to copy AcroFields using PdfCopy. In the release notes for iText 5.4.4.0 this is listed as possible now. However, when I try to do so it appears all the annotations (I think I am using that term correctly, still fairly new to iText...) for the fields are stripped out. It looks like the fields are there (meaning I can see the blue boxes that indicate an editable field), but they are not editable. If I try to bring the PDF up in Acrobat I get a message saying that "there are no fields, would you like Acrobat to discover them?" and most are found and marked and fields properly (check boxes aren't, but the text fields are).
I assume there is an additional step somewhere along the lines to re-add the annotations to the PdfCopy object, but I do not see a way to get the annotations from the PdfReader. I also cannot seem to find any documentation on how to do this (since AcroFields were for so long not supported in PdfCopy most of what I find is along that vein).
Due to sensitivity I cannot provide a copy of the PDF's in question, but using an altered version of a test program used earlier you can see the issue with the following code. It should generate a table with some check boxes in the four right columns. If I use the exact same code with PdfCopyFields in the MergePdfs method instead of PdfCopy it works as expected. This code does not produce any text fields, but in my main project they are part of the original parent PDF that is used as a template.
(Sorry for the long example, it has been cherry picked from a much larger application. You will need a PDF with a field named "TableStartPosition" somewhere in it and update RunTest with the correct paths for your local machine to get this to work.)
Has the PdfCopy functionality not made it into iTextSharp yet? I am using version 5.4.5.0.
class Program
{
Stream _pdfTemplateStream;
MemoryStream _pdfResultStream;
PdfReader _pdfTemplateReader;
PdfStamper _pdfResultStamper;
static void Main(string[] args)
{
Program p = new Program();
try
{
p.RunTest();
}
catch (Exception f)
{
Console.WriteLine(f.Message);
Console.ReadLine();
}
}
internal void RunTest()
{
FileStream fs = File.OpenRead(#"C:\temp\a\RenameFieldTest\RenameFieldTest\Library\CoverPage.pdf");
_pdfTemplateStream = fs;
_pdfResultStream = new MemoryStream();
//PDFTemplateStream = new FileStream(_templatePath, FileMode.Open);
_pdfTemplateReader = new PdfReader(_pdfTemplateStream);
_pdfResultStamper = new PdfStamper(_pdfTemplateReader, _pdfResultStream);
#region setup objects
List<CustomCategory> Categories = new List<CustomCategory>();
CustomCategory c1 = new CustomCategory();
c1.CategorySizesInUse.Add(CustomCategory.AvailableSizes[1]);
c1.CategorySizesInUse.Add(CustomCategory.AvailableSizes[2]);
Categories.Add(c1);
CustomCategory c2 = new CustomCategory();
c2.CategorySizesInUse.Add(CustomCategory.AvailableSizes[0]);
c2.CategorySizesInUse.Add(CustomCategory.AvailableSizes[1]);
Categories.Add(c2);
List<CustomObject> Items = new List<CustomObject>();
CustomObject co1 = new CustomObject();
co1.Category = c1;
co1.Title = "Object 1";
Items.Add(co1);
CustomObject co2 = new CustomObject();
co2.Category = c2;
co2.Title = "Object 2";
Items.Add(co2);
#endregion
FillCoverPage(Items);
_pdfResultStamper.Close();
_pdfTemplateReader.Close();
List<MemoryStream> pdfStreams = new List<MemoryStream>();
pdfStreams.Add(new MemoryStream(_pdfResultStream.ToArray()));
MergePdfs(#"C:\temp\a\RenameFieldTest\RenameFieldTest\Library\Outfile.pdf", pdfStreams);
_pdfResultStream.Dispose();
_pdfTemplateStream.Dispose();
}
internal void FillCoverPage(List<CustomObject> Items)
{
//Before we start we need to figure out where to start adding the table
var fieldPositions = _pdfResultStamper.AcroFields.GetFieldPositions("TableStartPosition");
if (fieldPositions == null)
{ throw new Exception("Could not find the TableStartPosition field. Unable to determine point of origin for the table!"); }
_pdfResultStamper.AcroFields.RemoveField("TableStartPosition");
var fieldPosition = fieldPositions[0];
// Get the position of the field
var targetPosition = fieldPosition.position;
//First, get all the available card sizes
List<string> availableSizes = CustomCategory.AvailableSizes;
//Generate a table with the number of available card sizes + 1 for the device name
PdfPTable table = new PdfPTable(availableSizes.Count + 1);
float[] columnWidth = new float[availableSizes.Count + 1];
for (int y = 0; y < columnWidth.Length; y++)
{
if (y == 0)
{ columnWidth[y] = 320; }
else
{ columnWidth[y] = 120; }
}
table.SetTotalWidth(columnWidth);
table.WidthPercentage = 100;
PdfContentByte canvas;
List<PdfFormField> checkboxes = new List<PdfFormField>();
//Build the header row
table.Rows.Add(new PdfPRow(this.GetTableHeaderRow(availableSizes)));
//Insert the global check boxes
PdfPCell[] globalRow = new PdfPCell[availableSizes.Count + 1];
Phrase tPhrase = new Phrase("Select/Unselect All");
PdfPCell tCell = new PdfPCell();
tCell.BackgroundColor = BaseColor.LIGHT_GRAY;
tCell.AddElement(tPhrase);
globalRow[0] = tCell;
for (int x = 0; x < availableSizes.Count; x++)
{
tCell = new PdfPCell();
tCell.BackgroundColor = BaseColor.LIGHT_GRAY;
PdfFormField f = PdfFormField.CreateCheckBox(_pdfResultStamper.Writer);
string fieldName = string.Format("InkSaver.Global.chk{0}", availableSizes[x].Replace(".", ""));
//f.FieldName = fieldName;
string js = string.Format("hideAll(event.target, '{0}');", availableSizes[x].Replace(".", ""));
f.Action = PdfAction.JavaScript(js, _pdfResultStamper.Writer);
tCell.CellEvent = new ChildFieldEvent(_pdfResultStamper.Writer, f, fieldName);
globalRow[x + 1] = tCell;
checkboxes.Add(f);
}
table.Rows.Add(new PdfPRow(globalRow));
int status = 0;
int pageNum = 1;
for (int itemIndex = 0; itemIndex < Items.Count; itemIndex++)
{
tCell = new PdfPCell();
Phrase p = new Phrase(Items[itemIndex].Title);
tCell.AddElement(p);
tCell.HorizontalAlignment = Element.ALIGN_LEFT;
PdfPCell[] cells = new PdfPCell[availableSizes.Count + 1];
cells[0] = tCell;
for (int availCardSizeIndex = 0; availCardSizeIndex < availableSizes.Count; availCardSizeIndex++)
{
if (Items[itemIndex].Category.CategorySizesInUse.Contains(availableSizes[availCardSizeIndex]))
{
string str = availableSizes[availCardSizeIndex];
tCell = new PdfPCell();
tCell.PaddingLeft = 10f;
tCell.PaddingRight = 10f;
cells[availCardSizeIndex + 1] = tCell;
cells[availCardSizeIndex].HorizontalAlignment = Element.ALIGN_CENTER;
PdfFormField f = PdfFormField.CreateCheckBox(_pdfResultStamper.Writer);
string fieldName = string.Format("InkSaver.chk{0}.{1}", availableSizes[availCardSizeIndex].Replace(".", ""), itemIndex + 1);
//f.FieldName = fieldName; <-- This causes the checkbox to be double-named (i.e. InkSaver.Global.chk0.InkSaver.Global.chk0
string js = string.Format("hideCardSize(event.target, {0}, '{1}');", itemIndex + 1, availableSizes[availCardSizeIndex]);
f.Action = PdfAction.JavaScript(js, _pdfResultStamper.Writer);
tCell.CellEvent = new ChildFieldEvent(_pdfResultStamper.Writer, f, fieldName);
checkboxes.Add(f);
}
else
{
//Add a blank cell
tCell = new PdfPCell();
cells[availCardSizeIndex + 1] = tCell;
}
}
//Test if the column text will fit
table.Rows.Add(new PdfPRow(cells));
canvas = _pdfResultStamper.GetUnderContent(pageNum);
ColumnText ct2 = new ColumnText(canvas);
ct2.AddElement(new PdfPTable(table));
ct2.Alignment = Element.ALIGN_LEFT;
ct2.SetSimpleColumn(targetPosition.Left, 0, targetPosition.Right, targetPosition.Top, 0, 0);
status = ct2.Go(true);
if ((status != ColumnText.NO_MORE_TEXT) || (itemIndex == (Items.Count - 1)))
{
ColumnText ct3 = new ColumnText(canvas);
ct3.AddElement(table);
ct3.Alignment = Element.ALIGN_LEFT;
ct3.SetSimpleColumn(targetPosition.Left, 0, targetPosition.Right, targetPosition.Top, 0, 0);
ct3.Go();
foreach (PdfFormField f in checkboxes)
{
_pdfResultStamper.AddAnnotation(f, pageNum);
}
checkboxes.Clear();
if (itemIndex < (Items.Count - 1))
{
pageNum++;
_pdfResultStamper.InsertPage(pageNum, _pdfTemplateReader.GetPageSize(1));
table = new PdfPTable(availableSizes.Count + 1);
table.SetTotalWidth(columnWidth);
table.WidthPercentage = 100;
table.Rows.Add(new PdfPRow(this.GetTableHeaderRow(availableSizes)));
}
}
}
}
private PdfPCell[] GetTableHeaderRow(List<string> AvailableSizes)
{
PdfPCell[] sizeHeaders = new PdfPCell[AvailableSizes.Count + 1];
Phrase devName = new Phrase("Device Name");
PdfPCell deviceHeader = new PdfPCell(devName);
deviceHeader.HorizontalAlignment = Element.ALIGN_CENTER;
deviceHeader.BackgroundColor = BaseColor.GRAY;
sizeHeaders[0] = deviceHeader;
for (int x = 0; x < AvailableSizes.Count; x++)
{
PdfPCell hCell = new PdfPCell(new Phrase(AvailableSizes[x]));
hCell.HorizontalAlignment = Element.ALIGN_CENTER;
hCell.BackgroundColor = BaseColor.GRAY;
sizeHeaders[x + 1] = hCell;
}
return sizeHeaders;
}
public void MergePdfs(string filePath, List<MemoryStream> pdfStreams)
{
//Create output stream
FileStream outStream = new FileStream(filePath, FileMode.Create);
Document document = null;
if (pdfStreams.Count > 0)
{
try
{
int PageCounter = 0;
//Create Main reader
PdfReader reader = new PdfReader(pdfStreams[0]);
PageCounter = reader.NumberOfPages;//This is if we have multiple pages in the cover page, we need to adjust the offset.
//rename fields in the PDF. This is required because PDF's cannot have more than one field with the same name
RenameFields(reader, PageCounter++);
//Create Main Doc
document = new Document(reader.GetPageSizeWithRotation(1));
//Create main writer
PdfCopy Writer = new PdfCopy(document, outStream);
//PdfCopyFields Writer = new PdfCopyFields(outStream);
//Open document for writing
document.Open();
////Add pages
Writer.AddDocument(reader);
//For each additional pdf after first combine them into main document
foreach (var PdfStream in pdfStreams.Skip(1))
{
PdfReader reader2 = new PdfReader(PdfStream);
//rename PDF fields
RenameFields(reader2, PageCounter++);
// Add content
Writer.AddDocument(reader);
}
//Writer.AddJavaScript(PostProcessing.GetSuperscriptJavaScript());
Writer.Close();
}
catch (Exception ex)
{
Console.WriteLine(ex.ToString());
}
finally
{
if (document != null)
document.Close();
foreach (var Strm in pdfStreams)
{
try { if (null != Strm) Strm.Dispose(); }
catch { }
}
//pdfStamper.Close();
outStream.Close();
}
}
}
private void RenameFields(PdfReader reader, int PageNum)
{
int tempPageNum = 1;
//rename all fields
foreach (string field in reader.AcroFields.Fields.Keys)
{
if (((reader.AcroFields.GetFieldType(field) == 1) || (reader.AcroFields.GetFieldType(field) == 2)) && (field.StartsWith("InkSaver")))
{
//This is a InkSaver button, set the name so its subclassed
string classPath;
if (reader.AcroFields.GetFieldType(field) == 2)
{
classPath = field.Substring(0, field.LastIndexOf("."));
if (field.StartsWith("InkSaver.chk"))
{
int a = field.LastIndexOf(".");
string sub = field.Substring(a + 1, (field.Length - a - 1));
int pageNum = int.Parse(sub);
int realPageNum = pageNum + tempPageNum;//PostProcessing.Instance.CoverPageLength;
PageNum = realPageNum;
}
}
else
{
classPath = field.Substring(0, field.LastIndexOf("."));
}
string newID = classPath + ".page" + PageNum.ToString();
bool ret = reader.AcroFields.RenameField(field, newID);
}
else
{
reader.AcroFields.RenameField(field, field + "_" + PageNum.ToString());// field + Guid.NewGuid().ToString("N"));
}
}
}
}
public class ChildFieldEvent : IPdfPCellEvent
{
protected PdfWriter writer;
protected PdfFormField parent;
protected string checkBoxName;
internal ChildFieldEvent(PdfWriter writer, PdfFormField parent, string CheckBoxName)
{
this.writer = writer;
this.parent = parent;
this.checkBoxName = CheckBoxName;
}
public void CellLayout(PdfPCell cell, Rectangle rect, PdfContentByte[] cb)
{
createCheckboxField(rect);
}
private void createCheckboxField(Rectangle rect)
{
RadioCheckField bt = new RadioCheckField(this.writer, rect, this.checkBoxName, "Yes");
bt.CheckType = RadioCheckField.TYPE_SQUARE;
bt.Checked = true;
this.parent.AddKid(bt.CheckField);
}
}
internal class CustomCategory
{
internal static List<string> AvailableSizes
{
get
{
List<string> retVal = new List<string>();
retVal.Add("1");
retVal.Add("2");
retVal.Add("3");
retVal.Add("4");
return retVal;
}
}
internal CustomCategory()
{
CategorySizesInUse = new List<string>();
}
internal List<string> CategorySizesInUse { get; set; }
}
internal class CustomObject
{
internal string Title { get; set; }
internal CustomCategory Category { get;set; }
}
Please take a look at the MergeForms example. Your example is too long for me to read, but at first sight, I'm missing the following line:
copy.setMergeFields();
By the way, in MergeForms2, the fields are also renamed before the form is merged.

How to implement boolean retrieval using hitcollector in below scenario

I am running my code on TREC documents and right now implementing scoring scheme to get number of relevant documents. However now i want to implement boolean retrieval, I am trying to use HitCollector.
below is my code..
public class BatchSearch {
private BatchSearch() {}
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
String usage =
"Usage:\tjava BatchSearch [-index dir] [-simfn similarity] [-field f] [-queries file]";
if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.out.println("Supported similarity functions:\ndefault: DefaultSimilary (tfidf)\n");
System.exit(0);
}
String index = "index";
String field = "contents";
String queries = null;
String simstring = "default";
for(int i = 0;i < args.length;i++) {
if ("-index".equals(args[i])) {
index = args[i+1];
i++;
} else if ("-field".equals(args[i])) {
field = args[i+1];
i++;
} else if ("-queries".equals(args[i])) {
queries = args[i+1];
i++;
} else if ("-simfn".equals(args[i])) {
simstring = args[i+1];
i++;
}
}
Similarity simfn = null;
if ("default".equals(simstring)) {
simfn = new DefaultSimilarity();
} else if ("bm25".equals(simstring)) {
simfn = new BM25Similarity();
} else if ("dfr".equals(simstring)) {
simfn = new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH2());
} else if ("lm".equals(simstring)) {
simfn = new LMDirichletSimilarity();
}
if (simfn == null) {
System.out.println(usage);
System.out.println("Supported similarity functions:\ndefault: DefaultSimilary (tfidf)");
System.out.println("bm25: BM25Similarity (standard parameters)");
System.out.println("dfr: Divergence from Randomness model (PL2 variant)");
System.out.println("lm: Language model, Dirichlet smoothing");
System.exit(0);
}
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(simfn);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41);
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
} else {
in = new BufferedReader(new InputStreamReader(new FileInputStream("queries"), "UTF-8"));
}
QueryParser parser = new QueryParser(Version.LUCENE_41, field, analyzer);
while (true) {
String line = in.readLine();
if (line == null || line.length() == -1) {
break;
}
line = line.trim();
if (line.length() == 0) {
break;
}
String[] pair = line.split(" ", 2);
Query query = parser.parse(pair[1]);
doBatchSearch(in, searcher, pair[0], query, simstring);
}
reader.close();
}
/**
* This function performs a top-1000 search for the query as a basic TREC run.
*/
public static void doBatchSearch(BufferedReader in, IndexSearcher searcher, String qid, Query query, String runtag)
throws IOException {
// Collect enough docs to show 5 pages
TopDocs results = searcher.search(query, 1000);
ScoreDoc[] hits = results.scoreDocs;
HashMap<String, String> seen = new HashMap<String, String>(1000);
int numTotalHits = results.totalHits;
int start = 0;
int end = Math.min(numTotalHits, 1000);
for (int i = start; i < end; i++) {
Document doc = searcher.doc(hits[i].doc);
String docno = doc.get("docno");
// There are duplicate document numbers in the FR collection, so only output a given
// docno once.
if (seen.containsKey(docno)) {
continue;
}
seen.put(docno, docno);
System.out.println(qid+" Q0 "+docno+" "+i+" "+hits[i].score+" "+runtag);
}
}
}
The scoring is done in doBatchSearch and now i want to implement HitCollector here.

Can I drag items from Outlook into my SWT application?

Background
Our Eclipse RCP 3.6-based application lets people drag files in for storage/processing. This works fine when the files are dragged from a filesystem, but not when people drag items (messages or attachments) directly from Outlook.
This appears to be because Outlook wants to feed our application the files via a FileGroupDescriptorW and FileContents, but SWT only includes a FileTransfer type. (In a FileTransfer, only the file paths are passed, with the assumption that the receiver can locate and read them. The FileGroupDescriptorW/FileContents approach can supply files directly application-to-application without writing temporary files out to disk.)
We have tried to produce a ByteArrayTransfer subclass that could accept FileGroupDescriptorW and FileContents. Based on some examples on the Web, we were able to receive and parse the FileGroupDescriptorW, which (as the name implies) describes the files available for transfer. (See code sketch below.) But we have been unable to accept the FileContents.
This seems to be because Outlook offers the FileContents data only as TYMED_ISTREAM or TYMED_ISTORAGE, but SWT only understands how to exchange data as TYMED_HGLOBAL. Of those, it appears that TYMED_ISTORAGE would be preferable, since it's not clear how TYMED_ISTREAM could provide access to multiple files' contents.
(We also have some concerns about SWT's desire to pick and convert only a single TransferData type, given that we need to process two, but we think we could probably hack around that in Java somehow: it seems that all the TransferDatas are available at other points of the process.)
Questions
Are we on the right track here? Has anyone managed to accept FileContents in SWT yet? Is there any chance that we could process the TYMED_ISTORAGE data without leaving Java (even if by creating a fragment-based patch to, or a derived version of, SWT), or would we have to build some new native support code too?
Relevant code snippets
Sketch code that extracts file names:
// THIS IS NOT PRODUCTION-QUALITY CODE - FOR ILLUSTRATION ONLY
final Transfer transfer = new ByteArrayTransfer() {
private final String[] typeNames = new String[] { "FileGroupDescriptorW", "FileContents" };
private final int[] typeIds = new int[] { registerType(typeNames[0]), registerType(typeNames[1]) };
#Override
protected String[] getTypeNames() {
return typeNames;
}
#Override
protected int[] getTypeIds() {
return typeIds;
}
#Override
protected Object nativeToJava(TransferData transferData) {
if (!isSupportedType(transferData))
return null;
final byte[] buffer = (byte[]) super.nativeToJava(transferData);
if (buffer == null)
return null;
try {
final DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer));
long count = 0;
for (int i = 0; i < 4; i++) {
count += in.readUnsignedByte() << i;
}
for (int i = 0; i < count; i++) {
final byte[] filenameBytes = new byte[260 * 2];
in.skipBytes(72); // probable architecture assumption(s) - may be wrong outside standard 32-bit Win XP
in.read(filenameBytes);
final String fileNameIncludingTrailingNulls = new String(filenameBytes, "UTF-16LE");
int stringLength = fileNameIncludingTrailingNulls.indexOf('\0');
if (stringLength == -1)
stringLength = 260;
final String fileName = fileNameIncludingTrailingNulls.substring(0, stringLength);
System.out.println("File " + i + ": " + fileName);
}
in.close();
return buffer;
}
catch (final Exception e) {
return null;
}
}
};
In the debugger, we see that ByteArrayTransfer's isSupportedType() ultimately returns false for the FileContents because the following test is not passed (since its tymed is TYMED_ISTREAM | TYMED_ISTORAGE):
if (format.cfFormat == types[i] &&
(format.dwAspect & COM.DVASPECT_CONTENT) == COM.DVASPECT_CONTENT &&
(format.tymed & COM.TYMED_HGLOBAL) == COM.TYMED_HGLOBAL )
return true;
This excerpt from org.eclipse.swt.internal.ole.win32.COM leaves us feeling less hope for an easy solution:
public static final int TYMED_HGLOBAL = 1;
//public static final int TYMED_ISTORAGE = 8;
//public static final int TYMED_ISTREAM = 4;
Thanks.
even if
//public static final int TYMED_ISTREAM = 4;
Try below code.. it should work
package com.nagarro.jsag.poc.swtdrag;
imports ...
public class MyTransfer extends ByteArrayTransfer {
private static int BYTES_COUNT = 592;
private static int SKIP_BYTES = 72;
private final String[] typeNames = new String[] { "FileGroupDescriptorW", "FileContents" };
private final int[] typeIds = new int[] { registerType(typeNames[0]), registerType(typeNames[1]) };
#Override
protected String[] getTypeNames() {
return typeNames;
}
#Override
protected int[] getTypeIds() {
return typeIds;
}
#Override
protected Object nativeToJava(TransferData transferData) {
String[] result = null;
if (!isSupportedType(transferData) || transferData.pIDataObject == 0)
return null;
IDataObject data = new IDataObject(transferData.pIDataObject);
data.AddRef();
// Check for descriptor format type
try {
FORMATETC formatetcFD = transferData.formatetc;
STGMEDIUM stgmediumFD = new STGMEDIUM();
stgmediumFD.tymed = COM.TYMED_HGLOBAL;
transferData.result = data.GetData(formatetcFD, stgmediumFD);
if (transferData.result == COM.S_OK) {
// Check for contents format type
long hMem = stgmediumFD.unionField;
long fileDiscriptorPtr = OS.GlobalLock(hMem);
int[] fileCount = new int[1];
try {
OS.MoveMemory(fileCount, fileDiscriptorPtr, 4);
fileDiscriptorPtr += 4;
result = new String[fileCount[0]];
for (int i = 0; i < fileCount[0]; i++) {
String fileName = handleFile(fileDiscriptorPtr, data);
System.out.println("FileName : = " + fileName);
result[i] = fileName;
fileDiscriptorPtr += BYTES_COUNT;
}
} catch (Exception e) {
e.printStackTrace();
} finally {
OS.GlobalFree(hMem);
}
}
} finally {
data.Release();
}
return result;
}
private String handleFile(long fileDiscriptorPtr, IDataObject data) throws Exception {
// GetFileName
char[] fileNameChars = new char[OS.MAX_PATH];
byte[] fileNameBytes = new byte[OS.MAX_PATH];
COM.MoveMemory(fileNameBytes, fileDiscriptorPtr, BYTES_COUNT);
// Skip some bytes.
fileNameBytes = Arrays.copyOfRange(fileNameBytes, SKIP_BYTES, fileNameBytes.length);
String fileNameIncludingTrailingNulls = new String(fileNameBytes, "UTF-16LE");
fileNameChars = fileNameIncludingTrailingNulls.toCharArray();
StringBuilder builder = new StringBuilder(OS.MAX_PATH);
for (int i = 0; fileNameChars[i] != 0 && i < fileNameChars.length; i++) {
builder.append(fileNameChars[i]);
}
String name = builder.toString();
try {
File file = saveFileContent(name, data);
if (file != null) {
System.out.println("File Saved # " + file.getAbsolutePath());
;
}
} catch (IOException e) {
System.out.println("Count not save file content");
;
}
return name;
}
private File saveFileContent(String fileName, IDataObject data) throws IOException {
File file = null;
FORMATETC formatetc = new FORMATETC();
formatetc.cfFormat = typeIds[1];
formatetc.dwAspect = COM.DVASPECT_CONTENT;
formatetc.lindex = 0;
formatetc.tymed = 4; // content.
STGMEDIUM stgmedium = new STGMEDIUM();
stgmedium.tymed = 4;
if (data.GetData(formatetc, stgmedium) == COM.S_OK) {
file = new File(fileName);
IStream iStream = new IStream(stgmedium.unionField);
iStream.AddRef();
try (FileOutputStream outputStream = new FileOutputStream(file)) {
int increment = 1024 * 4;
long pv = COM.CoTaskMemAlloc(increment);
int[] pcbWritten = new int[1];
while (iStream.Read(pv, increment, pcbWritten) == COM.S_OK && pcbWritten[0] > 0) {
byte[] buffer = new byte[pcbWritten[0]];
OS.MoveMemory(buffer, pv, pcbWritten[0]);
outputStream.write(buffer);
}
COM.CoTaskMemFree(pv);
} finally {
iStream.Release();
}
return file;
} else {
return null;
}
}
}
Have you looked at https://bugs.eclipse.org/bugs/show_bug.cgi?id=132514 ?
Attached to this bugzilla entry is an patch (against an rather old version of SWT) that might be of interest.
I had the same problem and created a small library providing a Drag'n Drop Transfer Class for JAVA SWT. It can be found here:
https://github.com/HendrikHoetker/OutlookItemTransfer
Currently it supports dropping Mail Items from Outlook to your Java SWT application and will provide a list of OutlookItems with the Filename and a byte array of the file contents.
All is pure Java and in-memory (no temp files).
Usage in your SWT java application:
if (OutlookItemTransfer.getInstance().isSupportedType(event.currentDataType)) {
Object o = OutlookItemTransfer.getInstance().nativeToJava(event.currentDataType);
if (o != null && o instanceof OutlookMessage[]) {
OutlookMessage[] outlookMessages = (OutlookMessage[])o;
for (OutlookMessage msg: outlookMessages) {
//...
}
}
}
The OutlookItem will then provide two elements: filename as String and file contents as array of byte.
From here on, one could write it to a file or further process the byte array.
To your question above:
- What you find in the file descriptor is the filename of the outlook item and a pointer to an IDataObject
- the IDataObject can be parsed and will provide an IStorage object
- The IStorageObject will be then a root container providing further sub-IStorageObjects or IStreams similar to a filesystem (directory = IStorage, file = IStream
You find those elements in the following lines of code:
Get File Contents, see OutlookItemTransfer.java, method nativeToJava:
FORMATETC format = new FORMATETC();
format.cfFormat = getTypeIds()[1];
format.dwAspect = COM.DVASPECT_CONTENT;
format.lindex = <fileIndex>;
format.ptd = 0;
format.tymed = TYMED_ISTORAGE | TYMED_ISTREAM | COM.TYMED_HGLOBAL;
STGMEDIUM medium = new STGMEDIUM();
if (data.GetData(format, medium) == COM.S_OK) {
// medium.tymed will now contain TYMED_ISTORAGE
// in medium.unionfield you will find the root IStorage
}
Read the root IStorage, see CompoundStorage, method readOutlookStorage:
// open IStorage object
IStorage storage = new IStorage(pIStorage);
storage.AddRef();
// walk through the content of the IStorage object
long[] pEnumStorage = new long[1];
if (storage.EnumElements(0, 0, 0, pEnumStorage) == COM.S_OK) {
// get storage iterator
IEnumSTATSTG enumStorage = new IEnumSTATSTG(pEnumStorage[0]);
enumStorage.AddRef();
enumStorage.Reset();
// prepare statstg structure which tells about the object found by the iterator
long pSTATSTG = OS.GlobalAlloc(OS.GMEM_FIXED | OS.GMEM_ZEROINIT, STATSTG.sizeof);
int[] fetched = new int[1];
while (enumStorage.Next(1, pSTATSTG, fetched) == COM.S_OK && fetched[0] == 1) {
// get the description of the the object found
STATSTG statstg = new STATSTG();
COM.MoveMemory(statstg, pSTATSTG, STATSTG.sizeof);
// get the name of the object found
String name = readPWCSName(statstg);
// depending on type of object
switch (statstg.type) {
case COM.STGTY_STREAM: { // load an IStream (=File)
long[] pIStream = new long[1];
// get the pointer to the IStream
if (storage.OpenStream(name, 0, COM.STGM_DIRECT | COM.STGM_READ | COM.STGM_SHARE_EXCLUSIVE, 0, pIStream) == COM.S_OK) {
// load the IStream
}
}
case COM.STGTY_STORAGE: { // load an IStorage (=SubDirectory) - requires recursion to traverse the sub dies
}
}
}
}
// close the iterator
enumStorage.Release();
}
// close the IStorage object
storage.Release();