I need help importing about 25 million items from Elasticsearch into MongoDB. I wrote a PHP script to do it, but when the script reaches roughly 16 million items it runs out of memory and fails with this error:

VirtualAlloc() failed: [0x000005af] The paging file is too small for this operation to complete.

I changed the system settings and increased the virtual memory (paging file) to 100 000, following an online guide, but it is still not enough. I don't understand why the script allocates so much memory. To fetch the data from Elasticsearch I use the scroll API. Here is the script:
<?php
error_reporting( E_ALL );
ini_set( 'memory_limit', -1 );
ini_set( 'max_execution_time', -1 );

/** @var \Nette\DI\Container $container */
$container = require( __DIR__ . '/../app/bootstrap.php' );

echo "----------------------------------------------------------------\n";
echo "--------------------- EVENT INDEX IMPORT -----------------------\n";
echo "----------------------------------------------------------------\n";
echo 'memory_limit: ' . ini_get( 'memory_limit' ) . "\n";

/** @var MongoConnect $mongo */
$mongo = $container->getService( 'mongo' );
/** @var \MongoDB\Collection $eventsCollection */
$eventsCollection = $mongo->selectCollection( 'Events', 'events' );

/** @var Elastica\Client $elastic */
$elastic = new Elastica\Client();
/** @var array $elasticScrollData */
$elasticScrollData = $elastic->getIndex( 'event' )->request( '_search?scroll=10s', 'GET', ['size' => 250, 'sort' => ['_doc']] )->getData();

$countAll = $elasticScrollData['hits']['total'];
echo 'ES ALL ITEMS COUNT ' . $countAll . "\n";

$offset = 0;
saveToMongo( $elasticScrollData, $countAll, $offset, $elastic, $eventsCollection );

function saveToMongo( $scrollData, $countAll, $offset, \Elastica\Client $elastic, \MongoDB\Collection $mongoCollection )
{
    $documents = [];
    foreach ( $scrollData['hits']['hits'] as $item )
    {
        $doc = [];
        $doc['ico'] = (array)$item['_source']['ico'];
        $doc['data'] = $item['_source'];
        if( isset( $item['_type'] ) ) $doc['type'] = $item['_type'];
        if( isset( $item['_source']['key'] ) ) $doc['key'] = $item['_source']['key'];
        if( isset( $item['_source']['action'] ) ) $doc['action'] = $item['_source']['action'];
        if( isset( $item['_source']['publishDate'] ) ) $doc['publishDate'] = stringToDate( $item['_source']['publishDate'] );
        if( isset( $item['_source']['generateDate'] ) ) $doc['generateDate'] = stringToDate( $item['_source']['generateDate'] );
        if( isset( $item['_source']['eventDate'] ) ) $doc['eventDate'] = stringToDate( $item['_source']['eventDate'] );
        $documents[] = $doc;
        $offset++;
    }
    try
    {
        $mongoCollection->insertMany( $documents, ['ordered' => FALSE] );
        echo '--- offset ' . ( $offset ) . ' OK' . "\n";
    }
    catch( \Exception $e )
    {
        echo '+++ insert exception: ' . $e->getMessage() . "\n";
    }
    if( $offset < $countAll )
    {
        $scrollData = $elastic->request( '_search/scroll', 'GET', ['scroll' => '10s', 'scroll_id' => $scrollData['_scroll_id']] )->getData();
        saveToMongo( $scrollData, $countAll, $offset, $elastic, $mongoCollection );
    }
}

function stringToDate( $string )
{
    if( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+[\d:]+$/', $string ) ) $format = 'Y-m-d\TH:i:s.uT';
    elseif( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+$/', $string ) ) $format = 'Y-m-d\TH:i:s.u';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+[\d:]+$/', $string ) ) $format = 'Y-m-d\TH:i:sT';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/', $string ) ) $format = 'Y-m-d\TH:i:s';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}\+[\d:]+$/', $string ) ) $format = 'Y-m-dT';
    elseif ( preg_match( '/^\d{4}-\d{2}-\d{2}$/', $string ) ) $format = 'Y-m-d';
    return DateTime::createFromFormat( $format, $string );
}

echo "------------------------------------------------------------------------\n";
echo "------------------------- EVERYTHING IS DONE ---------------------------\n";
echo "------------------------------------------------------------------------\n";
I'm trying to write a macro for destructuring BSON data which looks like this:
let bson: Document = ...;
let (id, hash, name, path, modification_time, size, metadata, commit_data) = bson_destructure! {
    get id = from (bson), optional, name ("_id"), as ObjectId;
    get hash = from (bson), as String, through (|s| ContentHash::from_str(&s));
    get name = from (bson), as String;
    get path = from (bson), as Bson, through (PathBuf::from_bson);
    get modification_time = from (bson), as UtcDatetime, through (FileTime);
    get size = from (bson), as I64, through (|n| n as u64);
    get metadata = from (bson), as Document, through (Metadata::from_bson);
    get commit_data = from (bson), optional, as Document, through (CommitData::from_bson);
    ret (id, hash, name, path, modification_time, size, metadata, commit_data)
};
I've written the following macro (pretty large) for it:
macro_rules! bson_destructure {
    // required field
    (
        #collect req,
        [$target:ident, $source:expr, $field:expr, Bson, $f:expr],
        [];
        $($rest:tt)*
    ) => {{
        let $target = try!(match $source.remove($field) {
            Some(v) => $f(v),
            None => Err(BsonDestructureError::MissingField {
                field_name: $field,
                expected: "Bson"
            }),
        });
        bson_destructure!($($rest)*)
    }};
    (
        #collect req,
        [$target:ident, $source:expr, $field:expr, $variant:ident, $f:expr],
        [];
        $($rest:tt)*
    ) => {{
        let $target = try!(match $source.remove($field) {
            Some(v) => match v {
                ::ejdb::bson::Bson::$variant(v) => $f(v),
                v => Err(BsonDestructureError::InvalidType {
                    field_name: $field,
                    expected: stringify!($variant),
                    actual: v
                })
            },
            None => Err(BsonDestructureError::MissingField {
                field_name: $field,
                expected: stringify!($variant)
            }),
        });
        bson_destructure!($($rest)*)
    }};
    // optional field
    (
        #collect opt,
        [$target:ident, $source:expr, $field:expr, Bson, $f:expr],
        [];
        $($rest:tt)*
    ) => {{
        let $target = try!(match $source.remove($field) {
            Some(v) => $f(v).map(Some),
            None => Ok(None),
        });
        bson_destructure!($($rest)*)
    }};
    (
        #collect opt,
        [$target:ident, $source:expr, $field:expr, $variant:ident, $f:expr],
        [];
        $($rest:tt)*
    ) => {{
        let $target = try!(match $source.remove($field) {
            Some(v) => match v {
                ::ejdb::bson::Bson::$variant(v) => $f(v).map(Some),
                v => Err(BsonDestructureError::InvalidType {
                    field_name: $field,
                    expected: stringify!($variant),
                    actual: v
                })
            },
            None => Ok(None),
        });
        bson_destructure!($($rest)*)
    }};
    // change variant name
    (
        #collect $k:tt,
        [$target:ident, $source:expr, $field:expr, $variant:ident, $f:expr],
        [as $nv:ident, $($word:ident $arg:tt),*];
        $($rest:tt)*
    ) => {
        bson_destructure!(
            #collect $k,
            [$target, $source, $field, $nv, $f],
            [$($word $arg),*];
            $($rest)*
        )
    };
    // change final mapping function
    (
        #collect $k:tt,
        [$target:ident, $source:expr, $field:expr, $variant:ident, $f:expr],
        [through ($nf:expr), $($word:ident $arg:tt),*];
        $($rest:tt)*
    ) => {
        bson_destructure!(
            #collect $k,
            [$target, $source, $field, $variant, $nf],
            [$($word $arg),*];
            $($rest)*
        )
    };
    // change field name
    (
        #collect $k:tt,
        [$target:ident, $source:expr, $field:expr, $variant:ident, $f:expr],
        [name ($nn:expr), $($word:ident $arg:tt),*];
        $($rest:tt)*
    ) => {
        bson_destructure!(
            #collect $k,
            [$target, $source, $nn, $variant, $f],
            [$($word $arg),*];
            $($rest)*
        )
    };
    // main forms
    (get $target:ident = from ($source:expr), $($word:ident $arg:tt),*; $($rest:tt)*) => {
        bson_destructure!(
            #collect req,
            [$target, $source, stringify!($target), Bson, Ok],
            [$($word $arg),*];
            $($rest)*
        )
    };
    (get $target:ident = from ($source:expr), optional, $($word:ident $arg:tt),*; $($rest:tt)*) => {
        bson_destructure!(
            #collect opt,
            [$target, $source, stringify!($target), Bson, Ok],
            [$($word $arg),*];
            $($rest)*
        )
    };
    // final form
    (ret $e:expr) => { $e }
}
However, the first example above results in the following compilation error:
src/db/data.rs:345:22: 345:25 error: no rules expected the token `opt`
src/db/data.rs:345 #collect opt,
                            ^~~
I'm somewhat surprised that it doesn't show the error location as usual (there is no indication of where the expansion happens), and the error vanishes when I comment out the piece of code that uses the macro.
I can't see why it says that no rules expected this token, because there is such a rule; maybe I'm misunderstanding something.
I'm pretty sure this is possible, because it's roughly what the quick_error crate does, but it seems my macro-writing skills are still lacking.
How should I fix the macro so it would work as I expect?
For completeness, the following is the definition of BsonDestructureError:
#[derive(Debug, Clone)]
pub enum BsonDestructureError {
    InvalidType {
        field_name: &'static str,
        expected: &'static str,
        actual: Bson
    },
    InvalidArrayItemType {
        index: usize,
        expected: &'static str,
        actual: Bson
    },
    MissingField {
        field_name: &'static str,
        expected: &'static str
    }
}
I'm also using the bson crate re-exported from the ejdb crate. Here is a minimal example, runnable with cargo script on stable Rust.
cargo script, a recursive muncher, and my favourite internal rule syntax all in one question; how can I not answer?
First, the exact problem can be identified by running cargo rustc -- -Z trace-macros. This will output each rule as it gets expanded, giving us a "backtrace" which, after some manual reformatting, comes out looking like so:
bson_destructure! {
    get id = from ( bson ) , optional , name ( "_id" ) , as ObjectId ;
    get hash = from ( bson ) , as String ;
    get name = from ( bson ) , as String ;
    get path = from ( bson ) , as Bson ;
    get modification_time = from ( bson ) , as UtcDatetime ;
    get size = from ( bson ) , as I64 , through ( | n | n as u64 ) ;
    get metadata = from ( bson ) , as Document ;
    get commit_data = from ( bson ) , optional , as Document ;
    ret ( id , hash , name , path , modification_time , size , metadata , commit_data )
}

bson_destructure! {
    # collect opt ,
    [ id , bson , stringify ! ( id ) , Bson , Ok ] ,
    [ name ( "_id" ) , as ObjectId ] ;
    get hash = from ( bson ) , as String ;
    get name = from ( bson ) , as String ;
    get path = from ( bson ) , as Bson ;
    get modification_time = from ( bson ) , as UtcDatetime ;
    get size = from ( bson ) , as I64 , through ( | n | n as u64 ) ;
    get metadata = from ( bson ) , as Document ;
    get commit_data = from ( bson ) , optional , as Document ;
    ret ( id , hash , name , path , modification_time , size , metadata , commit_data )
}

bson_destructure! {
    # collect opt ,
    [ id , bson , "_id" , Bson , Ok ] , [ as ObjectId ] ;
    get hash = from ( bson ) , as String ;
    get name = from ( bson ) , as String ;
    get path = from ( bson ) , as Bson ;
    get modification_time = from ( bson ) , as UtcDatetime ;
    get size = from ( bson ) , as I64 , through ( | n | n as u64 ) ;
    get metadata = from ( bson ) , as Document ;
    get commit_data = from ( bson ) , optional , as Document ;
    ret ( id , hash , name , path , modification_time , size , metadata , commit_data )
}
A careful perusal of the rules in bson_destructure! shows the issue: there is no rule which matches the third expansion. macro_rules! is, frankly, rubbish at reporting sane error locations when it comes to recursive rules; that it's pointing to the opt token is irrelevant. The real problem is that it couldn't find a matching rule.
In particular, the offending rule is this one:
// change variant name
(
    #collect $k:tt,
    [$target:ident, $source:expr, $field:expr, $variant:ident, $f:expr],
    [as $nv:ident, $($word:ident $arg:tt),*];
    $($rest:tt)*
) => {
    ...
};
Note the presence of a comma immediately after $nv:ident. Also note that there is no such comma in the input. This can be solved by moving the comma inside the repetition, like so:
// change variant name
(
    #collect $k:tt,
    [$target:ident, $source:expr, $field:expr, $variant:ident, $f:expr],
    [as $nv:ident $(, $word:ident $arg:tt)*];
    $($rest:tt)*
) => {
    ...
};
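To see the two comma placements in isolation, here is a self-contained toy (with made-up macro names, nothing from the code above) that you can run on stable Rust:

macro_rules! needs_comma {
    // the comma after `$nv:ident` is mandatory, even when the repetition is empty
    ([as $nv:ident, $($word:ident $arg:tt),*]) => { stringify!($nv) };
}

macro_rules! comma_inside {
    // the comma is part of the repetition, so zero modifiers need zero commas
    ([as $nv:ident $(, $word:ident $arg:tt)*]) => { stringify!($nv) };
}

fn main() {
    // needs_comma!([as ObjectId]);  // fails to match: the rule demands a comma after the ident
    assert_eq!(needs_comma!([as ObjectId,]), "ObjectId");   // only matches with the comma present
    assert_eq!(comma_inside!([as ObjectId]), "ObjectId");   // no comma required
    assert_eq!(comma_inside!([as ObjectId, name ("_id")]), "ObjectId");
}

The first macro only matches when the comma is physically present, which is exactly the situation the third expansion above runs into.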
Another alternative (and the one I usually go with) is to simply mutate the input when it is first encountered, to make sure there is always a trailing comma in place.
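As a minimal sketch of that trick (again a made-up toy macro, using @ as the internal-rule marker in the same spirit as the #collect rules above): the public rule re-emits every item with a comma attached, so the internal rules can always rely on a trailing comma being present:

macro_rules! count_words {
    // internal rule: consume one word (plus its guaranteed comma) at a time
    (@internal $acc:expr; $head:ident, $($tail:ident,)*) => {
        count_words!(@internal $acc + 1; $($tail,)*)
    };
    // internal rule: the normalised list has been exhausted
    (@internal $acc:expr;) => { $acc };
    // public entry point: rewrite `a, b, c` as `a, b, c,` before recursing
    ($($word:ident),*) => { count_words!(@internal 0; $($word,)*) };
}

fn main() {
    // the internal rules only ever see `a, b, c,` with the comma guaranteed
    assert_eq!(count_words!(a, b, c), 3);
}

Applied to bson_destructure!, the main forms would emit [$($word $arg,)*] instead of [$($word $arg),*], and every modifier rule would then match against the comma-terminated form.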
The original code won't actually compile on my machine, due to a native dependency, but I did verify that making this change (both here and to the other rules with the same issue) allows macro expansion to complete. You can check that the output looks correct using cargo rustc -- -Z unstable-options --pretty=expanded.