What default id should I use such that all the ids in my collection are greater than this id? - mongodb

I intend to fetch 100 ids at once in a sorted manner.
I find the ids greater than skip where skip can be set to a default value at the beginning. I need to sort the ids generated in the find() and the limit set is 100.
So, my query is:
db['Organization'].find({"_id":{"$gt":ObjectId(skip)}},{"_id":1}).sort([("_id",1)]).limit(100)
As of now, I have set skip to str(0). I intend to update it with the last id fetched in the iteration.
The complete endpoint is:
#hug.get('/organization/collect_pricing')
def get_organizations():
start_time = datetime.strptime('2016-11-01', '%Y-%m-%d')
org_ids = []
org_pricing_plans = []
counter = 0
skip = str(0)
result_check = True
pricing_response = []
ob_toolbox = Toolbox()
while(result_check is True):
print(counter)
try:
if
organizations = db['Organization'].find({"_id":{"$gt":ObjectId(skip)}},{"_id":1}).sort([("_id",1)]).limit(100)
except Exception as e:
print(e)
if organizations.count(True) == 0:
result_check = False
continue
counter += 100
for org in organizations:
org_ids.append("Organization$"+org["_id"])
try:
pricing_plans = ob_toolbox.bulk_fetch(collection="PricingPlan", identifier="_p_organization", ids=org_ids)
except Exception as e:
print(e)
currDict = {}
for i in range(0, organizations.count(True)):
currDict["id"] = org_ids[i]
currDict["expiresAt"] = pricing_plans[i]["expiresAt"]
currDict["resources"] = pricing_plans[i]["resources"]
currDict["_created_at"] = pricing_plans[i]["_created_at"]
org_pricing_plans.append(currDict)
print(currDict["id"])
skip = currDict["id"]
if organizations.count(True) < 100:
result_check = False
return (org_pricing_plans)

If you want the default "minimal" value, then null object id is better. It's the same type (ObjectId) and will sort lowest.
ObjectId('000000000000000000000000')
Alternatively, you could branch when doing a query. Is it first query? If yes, don't include the skip part. If no, use last id from previous results.

Related

cur.execute() psycopg2.ProgrammingError: can't call .execute() on named cursors more than once

I'm trying to get this code to run but I get this error above. Can someone please help? I've tried reading about this in other posts but I don't really know how to apply it here. I'm trying to iterate over lines of this database and select 1400 random ones. It blocks on the error above.
def paragraph_generator(test=True, itersize=5000, year=None, state=None):
con, cur = database_connection.connect(cursor_type="server")
cur.itersize = itersize
while True:
sql = f"""
SELECT
text_id,
lccn_sn,
date,
ed,
seq,
chroniclingamerica_meta.statefp,
chroniclingamerica_meta.countyfp,
text_ocr
FROM
chroniclingamerica natural join chroniclingamerica_meta
WHERE date_part('year',date) BETWEEN 1860 AND 1920
ORDER BY RANDOM()
LIMIT 1400
"""
if test:
sql = (
sql + " limit 10000"
) # limit 1000 means it only goes through 1000 lines of the database
else:
pass
print(sql)
cur.execute(sql)
for p in cur.fetchall():
tokens = stem_text(p[-1]) # Stem
# print(tokens)
tokens = (
p[-1]
.translate(str.maketrans("", "", punct))
.replace("\n", " ")
.lower()
.split(" ")
)
tokens_3 = [
a for a in tokens if len(a) == 3 if a in wn_lemmas
] # For 3-letter words, only keep WordNet recognized tokens
tokens = gensim.parsing.preprocessing.remove_short_tokens(
tokens, minsize=4
) # Remove 1-, 2-, and 3-letter words
tokens = tokens + tokens_3 # Add back in 3-letter WordNet-recognized tokens
tokens = gensim.parsing.preprocessing.remove_stopword_tokens(
tokens, stopwords=stop_words
) # Remove stopwords in stopword list above
print("THIS IS THE LENGTH OF TOKENS")
a = len(tokens)
print(a)
if len(tokens) != 0:
ocr_2 = 1 - (
len([a for a in tokens if a in wn_lemmas]) / len(tokens)
) # Generate a measure for proportion of OCR errors in a page
else:
ocr_2 = float("nan")
print("THIS IS OCR")
print(ocr_2)
ocr = ocr_2
if ocr < 0.75 and ~np.isnan(
ocr
): # If the % of OCR in a page is less than 75%, then keep the page and all tokens
tokens = tokens
else:
tokens = [] # Otherwise, give it an empty list (i.e. drop the page)
yield tokens
con.close()
Error:
cur.execute(sql)
psycopg2.ProgrammingError: can't call .execute() on named cursors more than once

PowerBI Cumulative Distinctcount by Date, Condition and through Dimension

I have the following Table:
It represents cases on which a certain Team is working on over the Time until the case is closed.
And there is also a Date Table over column Date.
I would like to cumulative count the open cases until the selected date.
So I used this measure:
CountOpen =
VAR CurrentDate = MAX('Date'[Date])
VAR Closed =
CALCULATE(
DISTINCTCOUNT(Tabelle1[case]),
ALL('Date'),'Date'[Date]<=CurrentDate,Tabelle1[Status_Open]="0")
VAR OpenAll =
CALCULATE(
DISTINCTCOUNT(Tabelle1[case]),
ALL('Date'),'Date'[Date]<=CurrentDate,Tabelle1[Status_Open]="1")
RETURN OpenAll-Closed
And it works for the overall view. But for the view within the Dimension CurrentTeam it's not correct:
It should be:
a = 0
b = 1
c = 0
So... this is actually quite tricky, you have to pick the latest status per case up to the selected date. In my solution I create a table, with a column R which ranks the cases per date, then in the result I filter for those depending on which team you have selected.
Measure is below:
VAR CurrentDate = MAX('Date'[Date])
VAR CurrentTeam = SELECTEDVALUE(Tabelle1[CurrentTeam])
VAR tbl =
SUMMARIZE(
FILTER(ALL('Tabelle1'), 'Tabelle1'[Date] <= CurrentDate),
Tabelle1[case],
Tabelle1[CurrentTeam],
Tabelle1[Status_Open],
Tabelle1[Date],
"R",
VAR c = MAX(Tabelle1[case])
VAR d = LASTDATE(Tabelle1[Date])
RETURN
CALCULATE(DISTINCTCOUNT(Tabelle1[Date]),
ALLSELECTED(Tabelle1),
Tabelle1[case] = c,
Tabelle1[Date] >= d)
)
RETURN SUMX(
FILTER(tbl,
[R] = 1 &&
(ISBLANK(CurrentTeam) || [CurrentTeam] = CurrentTeam) &&
[Status_Open])
, 1) + 0 //+0 is here to show 0 where it would be blank

stress centrality in social network

i got the error of this code which is:
path[index][4] += 1
IndexError: list index out of range
why this happened?how can i remove this error ?
Code:
def stress_centrality(g):
stress = defaultdict(int)
for a in nx.nodes_iter(g):
for b in nx.nodes_iter(g):
if a==b:
continue
pred = nx.predecessor(G,b) # for unweighted graphs
#pred, distance = nx.dijkstra_predecessor_and_distance(g,b) # for weighted graphs
if a not in pred:
return []
path = [[a,0]]
path_length = 1
index = 0
while index >= 0:
n,i = path[index]
if n == b:
for vertex in list(map(lambda x:x[0], path[:index+1]))[1:-1]:
stress[vertex] += 1
if len(pred[n]) >i:
index += 1
if index == path_length:
path.append([pred[n][i],0])
path_length += 1
else:
path[index] = [pred[n][i],0]
else:
index -= 1
if index >= 0:
path[index][4] += 1
return stress
Without the data it's hard to give you anything more than an indicative answer.
This line
path[index][4] += 1
assumes there are 5 elements in path[index] but there are fewer than that. It seems to me that your code only assigns or appends to path lists of length 2. As in
path = [[a,0]]
path.append([pred[n][i],0])
path[index] = [pred[n][i],0]
So it's hard to see how accessing the 5th element of one of those lists could ever be correct.
This is a complete guess, but I think you might have meant
path[index][1] += 4

remove duplicates in a table (rexx language)

I have a question about removing duplicates in a table (rexx language), I am on netphantom applications that are using the rexx language.
I need a sample on how to remove the duplicates in a table.
I do have a thoughts on how to do it though, like using two loops for these two tables which are A and B, but I am not familiar with this.
My situation is:
rc = PanlistInsertData('A',0,SAMPLE)
TABLE A (this table having duplicate data)
123
1
1234
12
123
1234
I need to filter out those duplicates data into TABLE B like this:
123
1234
1
12
You can use lookup stem variables to test if you have already found a value.
This should work (note I have not tested so there could be syntax errors)
no=0;
yes=1
lookup. = no /* initialize the stem to no, not strictly needed */
j=0
do i = 1 to in.0
v = in.i
if lookup.v <> yes then do
j = j + 1
out.j = v
lookup.v = yes
end
end
out.0 = j
You can eliminate the duplicates by :
If InStem first element, Move the element to OutStem Else check all the OutStem elements for the current InStem element
If element is found, Iterate to the next InStem element Else add InStem element to OutStem
Code Snippet :
/*Input Stem - InStem.
Output Stem - OutStem.
Array Counters - I, J, K */
J = 1
DO I = 1 TO InStem.0
IF I = 1 THEN
OutStem.I = InStem.I
ELSE
DO K = 1 TO J
IF (InStem.I ?= OutStem.K) & (K = J) THEN
DO
J = J + 1
OutStem.J = InStem.I
END
ELSE
DO
IF (InStem.I == OutStem.K) THEN
ITERATE I
END
END
END
OutStem.0 = J
Hope this helps.

How do you order annotations by offset in brat?

When using the rapid annotator tool brat, it appears that the created annotations file will present the annotation in the order that the annotations were performed by the user. If you start at the beginning of a document and go the end performing annotation, then the annotations will naturally be in the correct offset order. However, if you need to go earlier in the document and add another annotation, the offset order of the annotations in the output .ann file will be out of order.
How then can you rearrange the .ann file such that the annotations are in offset order when you are done? Is there some option within brat that allows you to do this or is it something that one has to write their own script to perform?
Hearing nothing, I did write a python script to accomplish what I had set out to do. First, I reorder all annotations by begin index. Secondly, I resequence the label numbers so that they are once again in ascending order.
import optparse, sys
splitchar1 = '\t'
splitchar2 = ' '
# for brat, overlapped is not permitted (or at least a warning is generated)
# we could use this simplification in sorting by simply sorting on begin. it is
# probably a good idea anyway.
class AnnotationRecord:
label = 'T0'
type = ''
begin = -1
end = -1
text = ''
def __repr__(self):
return self.label + splitchar1
+ self.type + splitchar2
+ str(self.begin) + splitchar2
+ str(self.end) + splitchar1 + self.text
def create_record(parts):
record = AnnotationRecord()
record.label = parts[0]
middle_parts = parts[1].split(splitchar2)
record.type = middle_parts[0]
record.begin = middle_parts[1]
record.end = middle_parts[2]
record.text = parts[2]
return record
def main(filename, out_filename):
fo = open(filename, 'r')
lines = fo.readlines()
fo.close()
annotation_records = []
for line in lines:
parts = line.split(splitchar1)
annotation_records.append(create_record(parts))
# sort based upon begin
sorted_records = sorted(annotation_records, key=lambda a: int(a.begin))
# now relabel based upon the sorted order
label_value = 1
for sorted_record in sorted_records:
sorted_record.label = 'T' + str(label_value)
label_value += 1
# now write the resulting file to disk
fo = open(out_filename, 'w')
for sorted_record in sorted_records:
fo.write(sorted_record.__repr__())
fo.close()
#format of .ann file is T# Type Start End Text
#args are input file, output file
if __name__ == '__main__':
parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
usage=globals()['__doc__'],
version='$Id$')
parser.add_option ('-v', '--verbose', action='store_true',
default=False, help='verbose output')
(options, args) = parser.parse_args()
if len(args) < 2:
parser.error ('missing argument')
main(args[0], args[1])
sys.exit(0)