Count words of nested subschema field of mongoDB collecction - mongodb

I am working on data analysis of CV data of a large mongoDB collection. I try to count the absolute frequencey of words in the job title (jobs.jobTitle field in below schema).
The documents are structured like this:
{
firstName: String,
lastName: String,
jobs: [{jobTitle: 'software architect', company: String, ...}, {jobTitle: 'full stack software engineer', company: String, ...}, {jobTitle: 'javascript developer', company: String, ...}],
...
}
I would like to iterate over the entire collection and get an outcome like this:
[{word: 'manager', count: 3245},{word: 'engineer', count: 3102}, {word: 'software', count: 3021}, ..]
I tried the following aggregation:
db.cvs.aggregate([
{
$project: {
words: { $split: ["$jobs.jobTitle", " "] }
}
},
{
$unwind: {
path: "$words"
}
},
{
$group: {
_id: "$words",
count: { $sum: 1 }
}
},
{ $sort: { "count": -1 } }
])
Which results to the following error message:
$split requires an expression that evaluates to a string as a first argument, found: array
Can I concat the string values of jobs.jobTitle first to a string by using an aggregation? Or is there any other way to achive the expected result?

Thanks for the quick comment #NeilLunn
I would like to share the corrected query with everyone:
db.cvs.aggregate([
{ "$unwind": "$jobs" },
{
$project: {
words: { $split: ["$jobs.jobTitle", " "] }
}
},
{
$unwind: {
path: "$words"
}
},
{
$group: {
_id: "$words",
count: { $sum: 1 }
}
},
{ $sort: { "count": -1 } }
])

Related

Mongoose - filter matched documents and assign the resultant length to a field

I have this collection(some irrelevant fields were omitted for brevity):
clients: {
userId: ObjectId,
clientSalesValue: Number,
currentDebt: Number,
}
Then I have this query that matches all the clients for a specific user, then calculates the sum of all debts and sales and put those results in a separate field each of them:
await clientsCollection.aggregate([
{
$match: { userId: new ObjectId(userId) }
},
{
$group: {
_id: null,
totalSalesValue: { $sum: '$clientSalesValue' },
totalDebts: { $sum: '$currentDebt' },
}
},
{
$unset: ['_id']
}
]).exec();
This works as expected, it returns an array with only one item which is an object, but now I need to also include in that resultant object a field for the amount of debtors, that is for the amount of clients that have currentDebt > 0, how can I do that is the same query? is it possible?
PD: I cannot modify the $match condition, it need to always return all the clients for the corresponding users.
To include a count of how many matching documents have a positive currentDebt, you can use the $sum and $cond operators like so:
await clientsCollection.aggregate([
{
$match: { userId: new ObjectId(userId) }
},
{
$group: {
_id: null,
totalSalesValue: { $sum: '$clientSalesValue' },
totalDebts: { $sum: '$currentDebt' },
numDebtors: {
$sum: {
$cond: [{ $gt: ['$currentDebt', 0] }, 1, 0]
}
},
}
},
{
$unset: ['_id']
}
]).exec();

categraji data by using MongoDb aggregation

Payload in excel sheets that consist of 4 columns i.e Date, status, amount, orderId.You need to structure the data / categorize the columns according to months and in each month orders are categorized as per status.
Umbrella Status:
INTRANSIT - ‘intransit’, ‘at hub’, ‘out for delivery’
RTO - ‘RTO Intransit’, ‘RTO Delivered’
PROCESSING - ‘processing’
For example:
Response should look like: -
May :
1.INTRANSIT
2. RTO
3.PROCESSING
June:
1.INTRANSIT
2. RTO
3.PROCESSING
You can use different aggregation operators provided in MongoDB.For example: -group, facet, Match, unwind, bucket, project, lookup, etc.
I tried it with this:
const pipeline = [{
$facet:
{
"INTRANSIT": [{ $match: { Status: { $in: ['INTRANSIT', 'AT HUB', 'OUT FOR
DELIVERY'] } } }, { $group: { _id: "$Date", numberofbookings: { $sum: 1 } }
}],
"RTO": [{ $match: { Status: { $in: ['RTO INTRANSIT', 'RTO DELIVERED'] } } },
{ $group: { _id: "$Date", numberofbookings: { $sum: 1 } } }],
"PROCESSING": [{ $match: { Status: { $in: ['PROCESSING'] } } }, {
$group: {
_id: date.getMonth("$Date"),
numberofbookings: { $sum: 1 }
}
}]
}
}];
const aggCursor = coll.aggregate(pipeline);

MongoDB: How to speed up my data reorganisation query/operation?

I'm trying to analyse some data and I thought my queries would be faster ultimately by storing a relationship between my collections instead. So I wrote something to do the data normalisation, which is as follows:
var count = 0;
db.Interest.find({'PersonID':{$exists: false}, 'Data.DateOfBirth': {$ne: null}})
.toArray()
.forEach(function (x) {
if (null != x.Data.DateOfBirth) {
var peep = { 'Name': x.Data.Name, 'BirthMonth' :x.Data.DateOfBirth.Month, 'BirthYear' :x.Data.DateOfBirth.Year};
var person = db.People.findOne(peep);
if (null == person) {
peep._id = db.People.insertOne(peep).insertedId;
//print(peep._id);
}
db.Interest.updateOne({ '_id': x._id }, {$set: { 'PersonID':peep._id }})
++count;
if ((count % 1000) == 0) {
print(count + ' updated');
}
}
})
This script is just passed to mongo.exe.
Basically, I attempt to find an existing person, if they don't exist create them. In either case, link the originating record with the individual person.
However this is very slow! There's about 10 million documents and at the current rate it will take about 5 days to complete.
Can I speed this up simply? I know I can multithread it to cut it down, but have I missed something?
In order to insert new persons into People collection, use this one:
db.Interest.aggregate([
{
$project: {
Name: "$Data.Name",
BirthMonth: "$Data.DateOfBirth.Month",
BirthYear: "$Data.DateOfBirth.Year",
_id: 0
}
},
{
$merge: {
into: "People",
// requires an unique index on {Name: 1, BirthMonth: 1, BirthYear: 1}
on: ["Name", "BirthMonth", "BirthYear"]
}
}
])
For updating PersonID in Interest collection use this pipeline:
db.Interest.aggregate([
{
$lookup: {
from: "People",
let: {
name: "$Data.Name",
month: "$Data.DateOfBirth.Month",
year: "$Data.DateOfBirth.Year"
},
pipeline: [
{
$match: {
$expr: {
$and: [
{ $eq: ["$Name", "$$name"] },
{ $eq: ["$BirthMonth", "$$month"] },
{ $eq: ["$BirthYear", "$$year"] }
]
}
}
},
{ $project: { _id: 1 } }
],
as: "interests"
}
},
{
$set: {
PersonID: { $first: "$interests._id" },
interests: "$$REMOVE"
}
},
{ $merge: { into: "Interest" } }
])
Mongo Playground

translate sql query into mongodb query

I'm trying to grasp the mongodb concepts by translating some of our sql queries into mongo aggregation framework.
I have an sql code:
select dbo.VisitNo(u.id) as visitNo , o.id, o.PatientId, u.VisitDate
from dbo.Observation o
join sbo.ProspectiveFollowUp u on u.rootid = o.Id
order by o.PatientId
The dbo.VisitNo is implemented as:
CREATE FUNCTION dbo.VisitNo(#Id int)
RETURNS INT
AS
BEGIN
DECLARE #VisitDate date, #RootId int
SELECT #VisitDate=VisitDate, #RootId=RootId FROM dbo.ProspectiveFollowUp WHERE Id=#Id
RETURN (SELECT COUNT(1) FROM dbo.ProspectiveFollowUp WHERE RootId = #RootId AND VisitDate <= #VisitDate)
END
result:
My document in Mongo has following structure:
{
"_id",
"values":[
{
"Id",
"PatientId",
"ProspectiveFollowUp":[
"Id",
"RootId",
"VisitDate"
]
}
]
}
The values array has always one element, but that's how the data was imported. ProspectiveFollowUp has at least one record.
Creating query for retrieving the data was rather easy:
db.dbo_ObservationJSON.aggregate([
{ $unwind: '$values' },
{
$project: {
_id: 0,
Id: '$values.Id',
PatientId: '$values.PatientId',
VisitDate: '$values.ProspectiveFollowUp.VisitDate'
}
},
{ $unwind: '$VisitDate' },
{ $sort: { PatientId: 1 } }
])
The harder part is the custom function itself. I can't think outside od tsql world yet, so I have hard time getting this to work. I have translated the function into mongo the following way:
var id = 4
var result = db.dbo.ObservationJSON.aggregate([
{ $unwind: '$values' },
{ $unwind: '$values.ProspectiveFollowUp' },
{ $project: { Id: '$values.ProspectiveFollowUp.Id', RootId: '$values.ProspectiveFollowUp.RootId', VisitDate: '$values.ProspectiveFollowUp.VisitDate', _id:0 }},
{ $match: { Id: id }}
]).toArray()[0]
var totalResult = db.dbo_ObservationJSON.aggregate([{
$unwind: {
path: '$values'
}
}, {
$unwind: {
path: '$values.ProspectiveFollowUp'
}
}, {
$project: {
Id: '$values.ProspectiveFollowUp.Id',
RootId: '$values.ProspectiveFollowUp.RootId',
VisitDate: '$values.ProspectiveFollowUp.VisitDate'
}
}, {
$match: {
RootId: result.RootId,
VisitDate: {
$lte: result.VisitDate
}
}
},{$count: 'total'}]).toArray()[0]
But don't know how to integrate it into the aggregation function above.
Can I write the entire sql query equivalent into one mongo aggregate expression?
I finally got it to work.
db.dbo_ObservationJSON.aggregate([
{ $unwind: '$values' },
{ $unwind: { path: '$values.ProspectiveFollowUp', "includeArrayIndex": "index" } },
{
$project: {
_id: 0,
VisitNo: { $add: ['$index', 1] },
RootId: '$values.ProspectiveFollowUp.RootId',
PatientId: '$values.PatientId',
VisitDate: '$values.ProspectiveFollowUp.VisitDate'
}
},
{
$sort: {
PatientId: 1
}
}
]);

MongoDB sum with match

I have a collection with the following data structure:
{
_id: ObjectId,
text: 'This contains some text',
type: 'one',
category: {
name: 'Testing',
slug: 'test'
},
state: 'active'
}
What I'm ultimately trying to do is get a list of categories and counts. I'm using the following:
const query = [
{
$match: {
state: 'active'
}
},
{
$project: {
_id: 0,
categories: 1
}
},
{
$unwind: '$categories'
},
{
$group: {
_id: { category: '$categories.name', slug: '$categories.slug' },
count: { $sum: 1 }
}
}
]
This returns all categories (that are active) and the total counts for documents matching each category.
The problem is that I want to introduce two additional $match that should still return all the unique categories, but only affect the counts. For example, I'm trying to add a text search (which is indexed on the text field) and also a match for type.
I can't do this at the top of the pipeline because it would then only return categories that match, not only affect the $sum. So basically it would be like being able to add a $match within the $group only for the $sum. Haven't been able to find a solution for this and any help would be greatly appreciated. Thank you!
You can use $cond inside of your $group statement:
{
$group: {
_id: { category: '$categories.name', slug: '$categories.slug' },
count: { $sum: { $cond: [ { $eq: [ "$categories.type", "one" ] }, 1, 0 ] } }
}
}