Elastic Search bulk need to add headers - perl

Do we have the option to add header in the Search::Elasticsearch module.
use Search::Elasticsearch;
my $esloc='abc.com';
my $es = Search::Elasticsearch->new(nodes => [
"$esloc"],
timeout => 60,
no_refresh => 0,
userinfo => 'username:password'
);

You should be able to do it like this:
my $es = Search::Elasticsearch->new(nodes => [
"$esloc"],
timeout => 60,
no_refresh => 0,
userinfo => 'username:password',
default_headers => { 'Content-Type' => 'application/ndjson' }
);

Related

ElasticSearch (search_context_missing_exception) with Search::ElasticSearch::Scroll

I'm using Search::Elasticsearch and Search::Elasticsearch::Scroll for search and scroll into my elasticsearch server.
In scrolling process, for some querys, I'm seeing the next errors while I'm scrolling the search results:
2016/03/22 11:03:38 - 265885 FATAL: [Daemon.pm][8221]: Something gone wrong, error $VAR1 = bless( {
'msg' => '[Missing] ** [http://localhost:9200]-[404] Not Found, called from sub Search::Elasticsearch::Scroll::next at searcher.pl line 92. With vars: {\'body\' => {\'hits\' => {\'hits\' => [],\'max_score\' => \'0\',\'total\' => 5215},\'timed_out\' => bless( do{\\(my $o = 0)}, \'JSON::XS::Boolean\' ),\'_shards\' => {\'failures\' => [{\'index\' => undef,\'reason\' => {\'reason\' => \'No search context found for id [4920053]\',\'type\' => \'search_context_missing_exception\'},\'shard\' => -1},{\'index\' => undef,\'reason\' => {\'reason\' => \'No search context found for id [5051485]\',\'type\' => \'search_context_missing_exception\'},\'shard\' => -1},{\'index\' => undef,\'reason\' => {\'reason\' => \'No search context found for id [4920059]\',\'type\' => \'search_context_missing_exception\'},\'shard\' => -1},{\'index\' => undef,\'reason\' => {\'reason\' => \'No search context found for id [5051496]\',\'type\' => \'search_context_missing_exception\'},\'shard\' => -1},{\'index\' => undef,\'reason\' => {\'reason\' => \'No search context found for id [5051500]\',\'type\' => \'search_context_missing_exception\'},\'shard\' => -1}],\'failed\' => 5,\'successful\' => 0,\'total\' => 5},\'_scroll_id\' => \'c2NhbjswOzE7dG90YWxfaGl0czo1MjE1Ow==\',\'took\' => 2},\'request\' => {\'serialize\' => \'std\',\'path\' => \'/_search/scroll\',\'ignore\' => [],\'mime_type\' => \'application/json\',\'body\' => \'c2Nhbjs1OzQ5MjAwNTM6bHExbENzRDVReEc0OV9UMUgzd3Vkdzs1MDUxNDg1OnJrQ3lsUkRKVHRxRWRWeURoOTB4WVE7NDkyMDA1OTpscTFsQ3NENVF4RzQ5X1QxSDN3dWR3OzUwNTE0OTY6cmtDeWxSREpUdHFFZFZ5RGg5MHhZUTs1MDUxNTAwOnJrQ3lsUkRKVHRxRWRWeURoOTB4WVE7MTt0b3RhbF9oaXRzOjUyMTU7\',\'qs\' => {\'scroll\' => \'1m\'},\'method\' => \'GET\'},\'status_code\' => 404}
',
'stack' => [
[
'searcher.pl',
92,
'Search::Elasticsearch::Scroll::next'
]
],
'text' => '[http://localhost:9200]-[404] Not Found',
'vars' => {
'body' => {
'hits' => {
'hits' => [],
'max_score' => '0',
'total' => 5215
},
'timed_out' => bless( do{\(my $o = 0)}, 'JSON::XS::Boolean' ),
'_shards' => {
'failures' => [
{
'index' => undef,
'reason' => {
'reason' => 'No search context found for id [4920053]',
'type' => 'search_context_missing_exception'
},
'shard' => -1
},
{
'index' => undef,
'reason' => {
'reason' => 'No search context found for id [5051485]',
'type' => 'search_context_missing_exception'
},
'shard' => -1
},
{
'index' => undef,
'reason' => {
'reason' => 'No search context found for id [4920059]',
'type' => 'search_context_missing_exception'
},
'shard' => -1
},
{
'index' => undef,
'reason' => {
'reason' => 'No search context found for id [5051496]',
'type' => 'search_context_missing_exception'
},
'shard' => -1
},
{
'index' => undef,
'reason' => {
'reason' => 'No search context found for id [5051500]',
'type' => 'search_context_missing_exception'
},
'shard' => -1
}
],
'failed' => 5,
'successful' => 0,
'total' => 5
},
'_scroll_id' => 'c2NhbjswOzE7dG90YWxfaGl0czo1MjE1Ow==',
'took' => 2
},
'request' => {
'serialize' => 'std',
'path' => '/_search/scroll',
'ignore' => [],
'mime_type' => 'application/json',
'body' => 'c2Nhbjs1OzQ5MjAwNTM6bHExbENzRDVReEc0OV9UMUgzd3Vkdzs1MDUxNDg1OnJrQ3lsUkRKVHRxRWRWeURoOTB4WVE7NDkyMDA1OTpscTFsQ3NENVF4RzQ5X1QxSDN3dWR3OzUwNTE0OTY6cmtDeWxSREpUdHFFZFZ5RGg5MHhZUTs1MDUxNTAwOnJrQ3lsUkRKVHRxRWRWeURoOTB4WVE7MTt0b3RhbF9oaXRzOjUyMTU7',
'qs' => {
'scroll' => '1m'
},
'method' => 'GET'
},
'status_code' => 404
},
'type' => 'Missing'
}, 'Search::Elasticsearch::Error::Missing' );
The code I'm using is the next one (simplified) :
# Retrieve scroll
my $scroll = $self->getScrollBySignature($item);
# Retrieve all affected documents ids
while (my #docs = $scroll->next(500)) {
# Do stuff with #docs
}
The function getScrollBySignature have the next code in order to call to elasticSearch
my $scroll = $self->{ELASTIC}->scroll_helper(
index => $self->{INDEXES},
search_type => 'scan',
ignore_unavailable => 1,
body => {
size => $self->{PAGINATION},
query => {
filtered => {
filter => {
bool => {
must => [{term => {signature_id => $item->{profileId}}}, {terms => {channel_type_id => $type}}]
}
}
}
}
}
);
As you can see, I'm doing the scroll without passing scroll parameter then as documentation says, the time that scroll is alive is 1 min.
The elasticSearch is a cluster of 3 servers, and the query that ends with that error retrieves a bit more than 5000 docs.
My first solution was to update the life time for scroll to 5 minutes and the error didn't appear.
The question is, as I understand every time I'm calling $scroll->next() the life time off scroll affected is upgraded 1m more, then how is possible to receive those context related errors?
I'm doing something in a bad manner?
Thank you all.
The first thing that comes to mind is that the timer is not updated. Have you checked this? You can do a query every 10 seconds for example and see if at the 6th query it gives you the error ...
Well, a good rule of thumb is inside a ->next() block, don't stay by iteration more than time that you've configured in scroll.
Between each call of ->next() you cannot stay more than that time configured. If you stay more, the scroll may be not be there and the error earch_context_missing_exception will appear.
My solution for this problem was inside next block only store data into array/hash structure and once the scroll process ended work with all data.
The solution of the question example:
# Retrieve scroll
my $scroll = $self->getScrollBySignature($item);
# Retrieve all affected documents ids
my #allDocs;
while (my #docs = $scroll->next(500)) {
push #allDocs, map {$_->{_id}} #docs
}
foreach (#allDocs) {
# Do stuff with doc
}

How do I set the HTTP User-Agent when using Search::Elasticsearch?

I'm using Search::Elasticsearch to query MetaCPAN.
my $es = Search::Elasticsearch->new(
cxn_pool => 'Static::NoPing',
nodes => 'api.metacpan.org:80',
);
my $scroller = $es->scroll_helper(
index => 'v0',
type => 'release',
search_type => 'scan',
scroll => '2m',
size => $size,
body => {
fields => [qw(author archive date)],
query => { range => { date => { gte => $date } } },
},
);
This works ok, but I'd like to set the HTTP User-Agent header to a custom value so my requests can be identified if there's a problem. How do I do that with Search::Elasticsearch?
You can pass arguments to the handle constructor using handle_args. So for the default HTTP::Tiny you would use agent:
my $es = Search::Elasticsearch->new(
cxn_pool => 'Static::NoPing',
nodes => 'api.metacpan.org:80',
handle_args => { agent => "youragent/0.1" },
);

Perl Mongo find object Id

You would think it is a simple thing. I have a list of object id's that are in my collection. I would like to get a single record based on an object id. Have Googled, but nothing helpful.
So I have object id: 5106c7703abc120a04070b34
my $client = MongoDB::MongoClient->new;
my $db = $client->get_database( 'myDatabase' );
my $id_find = $db->get_collection('mycollection')->find({},{_id => MongoDB::OID->new(value => "5106c7703abc120a04070b34")});
print Dumper $id_find;
This prints:
$VAR1 = bless( {
'_skip' => 0,
'_ns' => 'MindCrowd_test.Users',
'_grrrr' => 0,
'partial' => 0,
'_query' => {},
'_tailable' => 0,
'_client' => bless( {
'w' => 1,
'query_timeout' => 30000,
'find_master' => 0,
'_servers' => {},
'sasl' => 0,
'wtimeout' => 1000,
'j' => 0,
'timeout' => 20000,
'sasl_mechanism' => 'GSSAPI',
'auto_connect' => 1,
'auto_reconnect' => 1,
'db_name' => 'admin',
'ssl' => 0,
'ts' => 0,
'inflate_dbrefs' => 1,
'port' => 27017,
'host' => 'mongodb://localhost:27017',
'dt_type' => 'DateTime',
'max_bson_size' => 16777216
}, 'MongoDB::MongoClient' ),
'_limit' => 0,
'slave_okay' => 0,
'_request_id' => 0,
'immortal' => 0,
'started_iterating' => 0
}, 'MongoDB::Cursor' );
I have tried different verions of the above find. All of them fail to compile:
$mongo->my_db->my_collection(find({_id => "ObjectId(4d2a0fae9e0a3b4b32f70000"}));
$mongo->my_db->my_collection(
find({ _id => MongoDB::OID->new(value => "4d2a0fae9e0a3b4b32f70000")})
);
NONE of them work. How do I find (findone) a single record using the object id??
the find methods returns a Cursor object for iterating through. If you only want one record use the find_one method which returns a value.
my $client = MongoDB::MongoClient->new;
my $db = $client->get_database( 'myDatabase' );
my $id_find = $db->get_collection('mycollection')->find_one({_id => MongoDB::OID->new(value => "5106c7703abc120a04070b34")});
print Dumper $id_find;
The answer to this has changed. MongoDB::OID has been deprecated, replaced by BSON::OID, which does not have a method that allows you to pass in the 24-byte hex string that you have. Here's what you have to do these days:
my $id = "5c7463277fc2198b64654feb";
my $oid = BSON::OID->new(oid => pack('H24', $id));
my $result = $db->get_collection('mycollection')->find_id($oid);
pack creates a 12-byte binary sequence from the 24-bytes of hexadecimal data you have in $id. This is what BSON::OID is expecting, and then the perl driver constructs the correct filter for you in the background.

LWP Get Large File Download Headers Missing

This post is follow on work related to LWP GET large file download. That post was regarding an error from LWP when trying to pass arguments in the header incorrectly. Now I am posting the changes I made and how I am trying to debug the approach. This discussion should be very informative for those interested in POST vs GET header formation, and what the server receives while using the CGI package. It is not information easily found on the net.
Here is my client code snip:
my $bytes_received = 0; # vars used below are set prior to this point
my $filename = $opt{t}."/$srcfile";
open (FH, ">", "$filename") or $logger->error( "Couldn't open $filename for writing: $!" );
my $ua = LWP::UserAgent->new();
my $target = $srcfile;
my $res = $ua->get(
$url,
':content_cb' => \&callback,
'api' => 'olfs', # Note attempted use of different types of quotes had no impact
"cmd" => 'rfile',
"target" => $target,
"bs" => $bs
);
print $logger->info("$bytes_received bytes received");
sub callback{
my($chunk, $res) = #_;
$bytes_received += length($chunk);
print FH $chunk;
}
Here is the server snip (cgi script):
my $query = new CGI;
my $rcvd_data = Dumper($query);
print $rcvd_data;
Here is the output from a GET:
$VAR1 = bless( {
'.parameters' => [],
'use_tempfile' => 1,
'.charset' => 'ISO-8859-1',
'.fieldnames' => {},
'param' => {},
'.header_printed' => 1,
'escape' => 1
}, 'CGI' );
Here is a client with a POST request:
my $ua = new LWP::UserAgent();
local $HTTP::Request::Common::DYNAMIC_FILE_UPLOAD = 1;
my $req =
POST
$url,
'Content_Type' => 'form-data',
'Content' => {
"api" => 'olfs',
"cmd" => 'wfile',
"target" => $target,
"tsize" => $file_size,
"bs" => $bs,
"filename" => [ $file ] };
# HTTP::Message calls set_content, which appears to set the subroutine for content
# LWP::UserAgent
# LWP::Protocol::file::request sends content in chunks
#
$req->content( $req->content() );
$logger->info("Uploading: $file");
my $resp = $ua->request($req);
Here is the output on the server, just like before but now from the POST:
'.parameters' => [
'cmd',
'bs',
'api',
'target',
'filename',
'tsize'
],
'use_tempfile' => 1,
'.tmpfiles' => {
'*Fh::fh00001random23' => {
'info' => {
'Content-Type' => 'text/plain',
'Content-Disposition' => 'form-data; name="filename"; filename="random23"'
},
'name' => bless( do{\(my $o = '/usr/tmp/CGItemp33113')}, 'CGITempFile' ),
'hndl' => bless( \*Fh::fh00001random23, 'Fh' )
}
},
'.charset' => 'ISO-8859-1',
'.fieldnames' => {},
'param' => {
'cmd' => [
'wfile'
],
'bs' => [
'buffer1'
],
'api' => [
'olfs'
],
'target' => [
'random23'
],
'tsize' => [
'1073741824'
],
'filename' => [
$VAR1->{'.tmpfiles'}{'*Fh::fh00001random23'}{'hndl'}
},
'escape' => 1,
'.header_printed' => 1
}, 'CGI' );
In short, you can see in the POST dump the "key" / "value" pairs, ie "target => random23". In the GET dump I do not find any keys or values from what I submitted on the client side. Can that be explained, or what do I need to do so as to extract key / value pairs in the CGI script?
You're passing your form variables as HTTP headers.
Like I previously mentioned, if you want to build a url, you can use URI.
$url = URI->new($url);
$url->query_form(
api => 'olfs',
cmd => 'rfile',
target => $target,
bs => $bs,
);

trying to parse text and html from email over IMAP using MAIL::IMAPClient but text is hidden in parts and multi-parts

I can connect to the IMAP mail server easy enough:
use Mail::IMAPClient;
use MIME::Base64;
use MIME::Parser;
my $imap = Mail::IMAPClient->new(
Server => '192.168.2.2',
User => 'xxxxxx',
Password => 'yyyyyy',
Ssl => 1,
Uid => 1,
);
my $folders = $imap->folders
or die "List folders error: ", $imap->LastError, "\n";
print "Folders: #$folders\n";
$sfolder="INBOX.2012";
$imap->select( $sfolder )
or die "Select '$Opt{sfolder}' error: ", $imap->LastError, "\n";
my #msgs = $imap->messages or die "Could not messages: $#\n";
However, the text and html I want is not easily parsed due to codes like this:
Content-Transfer-Encoding:base64
Content-Type:text/html; charset=utf-8
Content-Transfer-Encoding:base64
Content-Type:text/html; charset=utf-8
Content-Transfer-Encoding:
Content-Type:multipart/mixed; boundary="----------=_4F0F4830.7079357A"
Multipart
Content-Transfer-Encoding:
Content-Type:multipart/mixed; boundary="----=_Part_4487195_1184536749.1326753403034"
Multipart
Content-Transfer-Encoding:
Content-Type:multipart/alternative; boundary=--boundary_164442_d184e417-739f-
46d6-824a-6ea1846e79de
Multipart
Content-Transfer-Encoding:
Content-Type:multipart/mixed; boundary="----=_Part_3882878_23916831.1326509484032"
Multipart
Content-Transfer-Encoding:
I tried this but it only works on a tiny number of different encodings.
if ($imap->get_header($msg,"Content-Transfer-Encoding")=~ /base64/i) {
print "\nMatch base64";
if ($imap->get_header($msg,"Content-Type")=~m/text/i ) {
push(#mail,decode_base64($imap->body_string($msg)));
}
elsif ($imap->get_header($msg,"Content-Type")=~m/image/i )
{ print "\nImage detected"; }
elsif ($imap->get_header($msg,"Content-Type")=~m/application/i )
{ print "\nApplication detected"; }
There are 7bit and 8bit variants and other encoding methods that contain the html or text I want for later use. I successfully use decode_base64() to decode base64. The worse ones to decode are the ones that contain multi-part codes. I feel like I am re-inventing the wheel and there must be a library or module that can do all the heavy lifting for me.
Other content types such as .jpg,.gif, and .pdf should simply be ignored. The multi-part emails contain at least 1 part that I an interested but many that are useless to me.
After further research this structure has some of the information I need but don't know how to get it out efficiently is another matter.
Dumping:$VAR1 = bless( {
'bodyparms' => {
'boundary' => '----=_NextPart_002_BC64_7D688C1F.A2FF9BE0'
},
'bodyextra' => undef,
'_top' => 1,
'bodydisp' => 'NIL',
'_id' => 'HEAD',
'bodysubtype' => 'mixed',
'PartsIndex' => {
'1.3' => bless( {
'bodyparms' => 'NIL',
'bodyid' => '<d9e26cc0-019c-4ac0-9b1e-9c9ac8424f52>',
'bodyextra' => 'NIL',
'bodydisp' => 'NIL',
'_id' => '1.3',
'bodysubtype' => 'jpeg',
'_prefix' => '1.3',
'bodysize' => '4808',
'bodytype' => 'image',
'bodyMD5' => 'NIL',
'bodylang' => 'NIL',
'bodydesc' => 'NIL',
'bodyenc' => 'base64'
}, 'Mail::IMAPClient::BodyStructure' ),
'1.1' => bless( {
'bodyparms' => {
'boundary' => '----=_NextPart_000_36AE_880DDD08.0A776E35'
},
'bodyextra' => undef,
'bodydisp' => 'NIL',
'_id' => '1.1',
'bodysubtype' => 'alternative',
'_prefix' => '1.1',
'bodytype' => 'MULTIPART',
'bodystructure' => [
bless( {
'bodyparms' => {
'charset' => 'utf-8'
},
'bodyextra' => 'NIL',
'bodyid' => 'NIL',
'bodydisp' => 'NIL',
'_id' => '1.1.1',
'bodysubtype' => 'PLAIN',
'_prefix' => '1.1.1',
'bodysize' => '1971',
'bodytype' => 'TEXT',
'bodyMD5' => 'NIL',
'textlines' => '74',
'bodylang' => 'NIL',
'bodydesc' => 'NIL',
'bodyenc' => 'quoted-printable'
}, 'Mail::IMAPClient::BodyStructure' ),
bless( {
'bodyparms' => {
'charset' => 'utf-8'
},
'bodyextra' => 'NIL',
'bodyid' => 'NIL',
'bodydisp' => 'NIL',
'_id' => '1.1.2',
'bodysubtype' => 'HTML',
'_prefix' => '1.1.2',
'bodysize' => '23364',
'bodytype' => 'TEXT',
'bodyMD5' => 'NIL',
'textlines' => '331',
'bodylang' => 'NIL',
'bodydesc' => 'NIL',
'bodyenc' => 'quoted-printable'
}, 'Mail::IMAPClient::BodyStructure' )
],
'bodyloc' => 'NIL',
'bodylang' => 'NIL'
}, 'Mail::IMAPClient::BodyStructure' ),
'1' => bless( {
'bodyparms' => {
'boundary' => '----=_NextPart_001_EA96_2BF8DEDE.32622D51'
},
'bodyextra' => undef,
'bodydisp' => 'NIL',
'_id' => 1,
'bodysubtype' => 'related',
'_prefix' => 1,
'bodytype' => 'MULTIPART',
'bodystructure' => [
$VAR1->{'PartsIndex'}{'1.1'},
bless( {
'bodyparms' => 'NIL',
'bodyid' => '<5dff39db-e81c-4410-be75-8662564fd328>',
'bodyextra' => 'NIL',
'bodydisp' => 'NIL',
'_id' => '1.2',
'bodysubtype' => 'jpeg',
'_prefix' => '1.2',
'bodysize' => '14406',
'bodytype' => 'image',
'bodyMD5' => 'NIL',
'bodylang' => 'NIL',
'bodydesc' => 'NIL',
'bodyenc' => 'base64'
}, 'Mail::IMAPClient::BodyStructure' ),
$VAR1->{'PartsIndex'}{'1.3'},
bless( {
'bodyparms' => 'NIL',
'bodyid' => '<717f2ef4-f795-4d1c-87cc-283c9b0a59b0>',
'bodyextra' => 'NIL',
'bodydisp' => 'NIL',
'_id' => '1.4',
'bodysubtype' => 'gif',
'_prefix' => '1.4',
'bodysize' => '2912',
'bodytype' => 'image',
'bodyMD5' => 'NIL',
'bodylang' => 'NIL',
'bodydesc' => 'NIL',
'bodyenc' => 'base64'
}, 'Mail::IMAPClient::BodyStructure' )
],
'bodyloc' => 'NIL',
'bodylang' => 'NIL'
}, 'Mail::IMAPClient::BodyStructure' ),
'1.2' => $VAR1->{'PartsIndex'}{'1'}{'bodystructure'}[1],
'1.1.2' => $VAR1->{'PartsIndex'}{'1.1'}{'bodystructure'}[1],
'2' => bless( {
'bodyparms' => {
'name' => 'BKD-7361945220.pdf'
},
'bodyid' => 'NIL',
'bodyextra' => 'NIL',
'bodydisp' => {
'attachment' => {
'filename' => 'BKD-7361945220.pdf'
}
},
'_id' => 2,
'bodysubtype' => 'octetstream',
'_prefix' => 2,
'bodysize' => '47540',
'bodytype' => 'application',
'bodyMD5' => 'NIL',
'bodystructure' => [],
'bodylang' => 'NIL',
'bodydesc' => 'NIL',
'bodyenc' => 'base64'
}, 'Mail::IMAPClient::BodyStructure' ),
'1.4' => $VAR1->{'PartsIndex'}{'1'}{'bodystructure'}[3],
'1.1.1' => $VAR1->{'PartsIndex'}{'1.1'}{'bodystructure'}[0]
},
'_prefix' => 'HEAD',
'PartsList' => [
1,
'1.1',
'1.1.1',
'1.1.2',
'1.2',
'1.3',
'1.4',
2
],
'bodytype' => 'MULTIPART',
'bodystructure' => [
$VAR1->{'PartsIndex'}{'1'},
$VAR1->{'PartsIndex'}{'2'}
],
'bodyloc' => 'NIL',
'bodylang' => 'NIL'
}, 'Mail::IMAPClient::BodyStructure' );
As you can see none of the values are guaranteed to be part of every part on the PartsIndex and some them are nested.
variable of interest for each PartsIndex item:
bodytype
bodysubtype
bodyenc
Parse mail messages with Courriel:
use strictures;
use Mail::IMAPClient qw();
use Courriel qw();
sub walk_parts {
my ($obj, $callback) = #_;
if ($obj->is_multipart) {
for my $part ($obj->parts) {
walk_parts($part, $callback);
}
} else {
$callback->($obj);
}
}
my $imap = Mail::IMAPClient->new(
…
) or die $#;
my $folders = $imap->folders
or die $imap->LastError;
$imap->select('INBOX')
or die $imap->LastError;
my #messages = $imap->messages
or die $imap->LastError;
for my $id (#messages) {
my $raw = $imap->message_string($id)
or die $imap->LastError;
my $email = Courriel->parse(text => $raw);
walk_parts $email, sub {
my ($part) = #_;
my $content = $part->content;
my $type = $part->mime_type;
}
}
I tried using a couple of prebuilt modules but they had too many dependencies and was hard to work with. This solution adds no dependencies beyond the original. I also had issues with the dependencies for libMagic, see above, and I did not want anyone who uses my program to have to deal with that issue either.
You have to call decode twice once for the main parent, and again for each child. Since this $imap->get_bodystructure($msg); contains all the information you need why add dependencies where none are needed. It took many many hours to figure out how to decode it manually, but it was worth it.
You can add whatever decoders you want to the decode() subroutine. I only need to decode the text/html and base64 encoded versions there of. The IMAPClient functions give you a list of all parents and children so you don't have to go making a list by yourself. The tricky part is you can have any number of parent each with any number of children, but only the children contain useful data. The parents can be ignored, since many of their values are blank,undef, or 'NIL' (literally). In fact a vast number of variables have the value of 'NIL'. Even ones that the email client could have answered for the user like bodyMD5 and bodylang are USUALLY equal 'NIL'. Due to the overwhelming use of 'NIL' parsing and using other fields may prove futile. Depend on your imap server and the people you recieve email from you mileage may vary.
If you have further questions leave a comment.
use Mail::IMAPClient;
use MIME::Base64;
use MIME::Parser;
sub decode {
($process, $imap) =#_;
if ($process->bodytype eq "TEXT") {
print "\n Text SubType:".$process->bodysubtype;
if ($process->bodyenc eq "base64") {
return decode_base64($imap->bodypart_string($msg,$process->id));
}
elsif (index(" -7bit- -8bit- -quoted-printable- ",lc($process->bodyenc)) !=-1 ) {
return $imap->bodypart_string($msg,$process->id);
}
print "\n==========Insert new decoder here============";
print "\n".$imap->bodypart_string($msg,$process->id);
print "\n=================================================";
}
return "";
}
#insert your login code with credentials here
$imap->select( $sfolder )
or die "Select '$Opt{sfolder}' error: ", $imap->LastError, "\n";
my #msgs = $imap->messages or die "Could not messages: $#\n";
foreach $msg (#msgs) {
my $raw = $imap->message_string($msg)
or die $imap->LastError;
$struct = $imap->get_bodystructure($msg);
#MULTIPART is a container designation and does not contain anything useful by itself.
#However it will still process all of the children that have content
if ($struct->bodytype ne "MULTIPART") { print "\n BodyEnc:".$struct->bodyenc();}
$rDecode=decode($struct,$imap);
#do not insert blanks.
if ($rDecode ne "" && (length($rDecode)>2)) {push(#mail,$rDecode); }
foreach $dumpme ($struct->bodystructure()) {
if ($dumpme->bodytype() eq "MULTIPART") {next;}
$rDecode="";
$rDecode=decode($dumpme,$imap);
#do not insert blanks.
if (($rDecode ne "") && (length($rDecode)>2) ) {
push(#mail,$rDecode); }
}
}
You need a MIME parser. Unfortunately, even then, you will need some normalization of your own, because there are multiple ways to represent the same data in MIME.