Calculate bigrams co-occurrence matrix - find-occurrences

I have a follow-up question about nathandrake's answer to the post "How do I calculate a word-word co-occurrence matrix with sklearn?":
import pandas as pd
def co_occurance_matrix(input_text, top_words, window_size):
    """Build a word-word co-occurrence matrix over a sliding window.

    Each document in ``input_text`` is split on single spaces. For every
    ordered pair ``(row, colm)`` of distinct entries of ``top_words`` the
    result holds how many times ``colm`` appears within ``window_size``
    tokens to the left or right of an occurrence of ``row``, summed over
    all documents. Cells where ``row == colm`` are 0.

    Tokens are compared by exact equality on both sides, so the matrix is
    symmetric. Entries of ``top_words`` that contain a space (e.g. bigrams)
    can never equal a single token and therefore always count 0.

    Args:
        input_text: iterable of documents (strings).
        top_words: sequence of words used as both row and column labels.
        window_size: number of tokens inspected on each side of a match.

    Returns:
        pandas.DataFrame indexed and labelled by ``top_words``.
    """
    top_words = list(top_words)
    # Tokenise once instead of once per (row, colm) pair.
    tokenized = [essay.split(" ") for essay in input_text]
    counts = [[0] * len(top_words) for _ in top_words]

    for nrow, row in enumerate(top_words):
        for ncolm, colm in enumerate(top_words):
            if row == colm:
                continue
            total = 0
            for tokens in tokenized:
                for index, token in enumerate(tokens):
                    if token != row:
                        continue
                    # Clamp the left edge at 0; slicing already clamps the
                    # right edge, so first/last positions need no special case.
                    left = tokens[max(0, index - window_size):index]
                    right = tokens[index + 1:index + window_size + 1]
                    total += left.count(colm) + right.count(colm)
            counts[nrow][ncolm] = total

    return pd.DataFrame(counts, index=top_words, columns=top_words)
My question is: what if the entries are not single words but bigrams? For example:
# Example from the question: every entry of `words` is a two-word bigram.
# co_occurance_matrix splits each document on " " and compares single tokens,
# so an entry that contains a space never matches a token and every cell of
# `result` comes out as 0.
corpus = ['ABC DEF IJK PQR','PQR KLM OPQ','LMN PQR XYZ DEF ABC']
words = ['ABC PQR','PQR DEF']
window_size =100
result = co_occurance_matrix(corpus,words,window_size)
result
When I change the word list into a bigram list, the co_occurance_matrix function no longer works: every cell shows 0.

Related

Talend - split a string to n rows

I would like to split a string in a column to n rows in Talend.
For example :
column
2aabbccdd
The leading number is the "n" that defines the length of each row, so the expected result is:
row 1 = aa
row 2 = bb
row 3 = cc
row 4 = dd
The idea here is to iterate on the string and cut it every 2 characters.
Any idea please ?
I would use a tJavaFlex to split the string, with a trick to have n rows coming out of it.
tJavaFlex's main code:
// tJavaFlex main code: splits the payload of row1.str into chunks of n
// characters and emits one output row (row2.str) per chunk.
// The for loop opened here is intentionally left open; it is closed by the
// component's end code so that each iteration produces one output row.

// Width of the numeric prefix that holds n.
// NOTE(review): the sample input in the question ("2aabbccdd") has a
// 1-character prefix - set prefixLen to 1 for that format.
final int prefixLen = 4;
int n = Integer.parseInt(row1.str.substring(0, prefixLen)); //get n from the prefix
String str2 = row1.str.substring(prefixLen); //get the string after n
// Ceiling division so that a shorter trailing chunk is not dropped for any n;
// a non-positive n yields no parts instead of a division by zero.
int nbParts = (n > 0) ? (str2.length() + n - 1) / n : 0;
System.out.println("number of parts = " + nbParts);
for (int i = 0; i < nbParts; i++)
{
    int start = i * n;
    // The last chunk may be shorter than n characters.
    String part = str2.substring(start, Math.min(start + n, str2.length()));
    row2.str = part;
And tJavaFlex's end code is just a closing brace:
}
The trick is to use a for loop in the main code, but only close it in the end code.
tFixedFlowInput contains just one column holding the input string.

How to remove a single number from a list with multiples of that number

As I'm a beginner in coding, I wanted to try to find the first three repeated numbers in a list. My problem is that my code breaks when a number is repeated three times.
The usual remove, pop, and del don't work, as they delete only one element from the list.
import random
# Alias for random.randint; the loop at the bottom uses it to draw digits 0-9.
r = random.randint
# Digit string built up by the loop at the bottom and passed to first_repeat.
string = ""
def first_repeat(myList):
# Sorts the elements of myList (the script below passes a string of digits),
# then compares neighbouring entries and appends equal ones to final_numbers.
# Returns the first three collected values as a tuple once more than three
# have been collected; the final statement returns the final_numbers list.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the branches below cannot be confirmed from the text alone.
myList = sorted(list(myList))
print(myList)
# number is used as a flag: bumped to 1 when it is 0 and reset to 0 further down.
number = 0
final_numbers = []
# loop is the index into myList; the while loop runs as long as it is below 2.
loop = 0
while loop < 2:
try:
if number == 0:
number += 1
else:
# Compare the entry at `loop` with the one before it.
if myList[loop] == myList[loop-1]:
final_numbers.append(myList[loop])
else:
# Drop the two compared entries from the working list.
myList.pop(loop)
myList.pop (loop-1)
number = 0
if loop == 0 :
loop += 1
else:
loop -= 1
# Only returns once MORE than three values were collected.
if len(final_numbers) > 3:
return final_numbers[0], final_numbers[1], final_numbers[2]
# Pushes loop to 2 or above, which ends the while loop.
if len(myList) <=1:
loop += 2
# Bare except: any exception raised above (e.g. an IndexError from indexing or
# pop) is silently discarded and the while loop restarts.
except:
continue
return final_numbers
# Build a 20-digit random string and run first_repeat on it.
for n in range(20):
string = string+str(r(0,9))
print(first_repeat(string))
The expected result should be the first three repeated numbers.
I added some print statements so you can go through your program and find out where the logic is wrong with your code.
import random
# Alias for random.randint; the loop at the bottom uses it to draw digits 0-9.
r = random.randint
# Digit string built up by the loop at the bottom and passed to first_repeat.
string = ""
def first_repeat(myList):
# Same function as in the question, with print statements added so the
# control flow can be followed step by step; input() pauses for the user.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the branches below cannot be confirmed from the text alone.
myList = sorted(list(myList))
print(myList)
# number is used as a flag: bumped to 1 when it is 0 and reset to 0 further down.
number = 0
final_numbers = []
# loop is the index into myList; the while loop runs as long as it is below 2.
loop = 0
while loop < 2:
print( 'inside while loop: loop = {}'.format( loop ))
try:
if number == 0:
number += 1
else:
# Compare the entry at `loop` with the one before it.
if myList[loop] == myList[loop-1]:
print( 'in -> if myList[loop] == myList[loop-1]' )
final_numbers.append(myList[loop])
# ','.join needs string elements; the script below passes a string, so the
# sorted list holds single-character strings.
print( 'final_numbers: [{}]'.format( ','.join( final_numbers )))
else:
print( 'in first -> else' )
# Drop the two compared entries from the working list.
myList.pop(loop)
myList.pop (loop-1)
number = 0
print( 'myList: [{}]'.format( ','.join( myList ) ))
if loop == 0 :
loop += 1
else:
loop -= 1
# Only returns once MORE than three values were collected.
if len(final_numbers) > 3:
print( 'returning final numbers' )
print( final_numbers )
return final_numbers[0], final_numbers[1], final_numbers[2]
# Pushes loop to 2 or above, which ends the while loop.
if len(myList) <=1:
loop += 2
# Bare except: any exception raised above is silently discarded and the
# while loop restarts.
except:
continue
print( 'at end of this loop final numbers is: [{}]'.format( ','.join( final_numbers)))
print( 'press any key to continue loop: ')
# Blocks until the user presses Enter.
input()
return final_numbers
# Build a 20-digit random string and run first_repeat on it.
for n in range(20):
string = string+str(r(0,9))
print(first_repeat(string))
The following approach takes advantage of Python's defaultdict:
https://docs.python.org/2/library/collections.html#collections.defaultdict
#import defaultdict to keep track of number counts
from collections import defaultdict
#changed parameter name since you are passing in a string, not a list
def first_repeat(numbers_string):
    """Return the first three digits that occur a second time.

    The digits of ``numbers_string`` are scanned from left to right. A digit
    is recorded the moment its second occurrence is seen; later occurrences
    of the same digit are ignored.

    Args:
        numbers_string: string made up of decimal digits, e.g. "1123345567".

    Returns:
        A list with the first three repeated digits (as ints) in the order
        their repeats were encountered, or an empty list when fewer than
        three digits repeat.

    Raises:
        ValueError: if a character cannot be converted with ``int``.
    """
    # defaultdict(int) starts every missing key at 0 instead of raising
    # KeyError, so counts can be incremented without an existence check.
    number_count = defaultdict(int)
    first_three_repeats = []
    for char in numbers_string:
        number = int(char)
        number_count[number] += 1
        # A count of exactly 2 marks the first repeat of this digit.
        if number_count[number] == 2:
            first_three_repeats.append(number)
            if len(first_three_repeats) == 3:
                return first_three_repeats
    # Fewer than three digits were repeated.
    return []
# Append 20 random digits to the module-level `string`, then report the first
# three repeated digits.
for n in range(20):
    string = string + str(r(0, 9))
# The original call used the undefined name findFirstThreeNumbers, which
# raises NameError; the function is called first_repeat.
print(first_repeat(string))

stress centrality in social network

I get the following error from this code:
path[index][4] += 1
IndexError: list index out of range
Why does this happen, and how can I fix it?
Code:
def stress_centrality(g):
    """Count, for every node, the shortest paths that run through it.

    For each ordered pair of distinct nodes ``(a, b)`` all shortest paths
    between them are enumerated by walking the predecessor lists returned
    by ``nx.predecessor(g, b)``. Every interior node of such a path (both
    endpoints excluded) has its counter incremented by one.

    Pairs with no connecting path are skipped.

    Args:
        g: a networkx graph (treated as unweighted).

    Returns:
        ``defaultdict(int)`` mapping node -> number of shortest paths that
        pass through it. Nodes that lie on no path are absent.
    """
    stress = defaultdict(int)
    for b in g:
        # The predecessor map depends only on b, so compute it once per b.
        pred = nx.predecessor(g, b)  # for unweighted graphs
        # pred, distance = nx.dijkstra_predecessor_and_distance(g, b)  # for weighted graphs
        for a in g:
            if a == b:
                continue
            if a not in pred:
                # b is not reachable from a: nothing to count for this pair.
                continue
            # Depth-first walk over the predecessor lists. Each path entry is
            # [node, i], where i is the index of the next predecessor of node
            # to try. Entries beyond `index` are stale and get overwritten
            # instead of being popped.
            path = [[a, 0]]
            path_length = 1
            index = 0
            while index >= 0:
                n, i = path[index]
                if n == b:
                    # path[0..index] is a complete shortest path; credit its
                    # interior nodes.
                    for vertex, _ in path[1:index]:
                        stress[vertex] += 1
                if len(pred[n]) > i:
                    index += 1
                    if index == path_length:
                        path.append([pred[n][i], 0])
                        path_length += 1
                    else:
                        path[index] = [pred[n][i], 0]
                else:
                    # All predecessors of n are exhausted: backtrack and move
                    # the parent on to its next predecessor. Entries have only
                    # two slots, so the counter lives at position 1.
                    index -= 1
                    if index >= 0:
                        path[index][1] += 1
    return stress
Without the data it's hard to give you anything more than an indicative answer.
This line
path[index][4] += 1
assumes there are 5 elements in path[index] but there are fewer than that. It seems to me that your code only assigns or appends to path lists of length 2. As in
path = [[a,0]]
path.append([pred[n][i],0])
path[index] = [pred[n][i],0]
So it's hard to see how accessing the 5th element of one of those lists could ever be correct.
This is a complete guess, but I think you might have meant
path[index][1] += 4

Matlab - preprocess CSV file

I have a CSV file in a format similar to the following one:
title1
index columnA1 columnA2 columnA3
1 2 3 6
2 23 23 1
3 2 3 45
4 2 2 101
title2
index columnB1 columnB2 columnB3
1 23 53 6
2 22 13 1
3 5 4 43
4 8 6 102
I want to build a function readCustomCSV which receives a CSV file in the format illustrated above and a row index i, and returns an output file with (for, let's say, i = 3) the following content:
title1
index columnA1 columnA2 columnA3
3 2 3 45
title2
index columnB1 columnB2 columnB3
3 5 4 43
Do you know how to use the csvread function in order to obtain this type of functionality?
It confuses me that there are two sections of different types. I was thinking of reading the whole thing as a string, splitting it into two .csv files, and then reading the corresponding line from each.
try using this function :
I assumed that all tables have equal number of columns/rows. The code can definitely be shortened / improved / extended ;)
function multi_table_csvread (row_index)
% MULTI_TABLE_CSVREAD  Extract one data row from every table of a multi-table CSV.
%   multi_table_csvread(ROW_INDEX) reads 'multi_table.csv', which holds
%   several comma-separated tables one after another. A line with a single
%   non-empty field starts a new table (its title); the next line is taken
%   as the column headers and the following lines as data rows. For every
%   table the title, the header line and data row ROW_INDEX are written to
%   'selected_row.csv'.
%
%   Assumes all tables have the same number of rows and columns.
filename_INPUT = 'multi_table.csv' ;
filename_OUTPUT = 'selected_row.csv' ;

if row_index < 1
    error('The row index must be a positive integer!')
end

fIN = fopen(filename_INPUT,'r');
if fIN == -1
    error('Cannot open input file %s', filename_INPUT)
end

tableIndex = 0;
tableLine = 0;
csvTable = {};

% Read the csv file line by line. fgetl returns -1 (numeric) at end of
% file, so ischar is the reliable loop test: comparing a char vector with
% -1 yields an empty result for an empty line and would stop too early.
nextLine = fgetl(fIN);
while ischar(nextLine)
    lineStr = strtrim(strsplit(nextLine,',')) ;
    % remove empty cells
    lineStr(cellfun('isempty',lineStr)) = [] ;

    if isempty(lineStr)
        % blank line: nothing to store
    elseif numel(lineStr) == 1
        % a single element starts a new table
        tableIndex = tableIndex + 1;
        tableLine = 1;
        csvTable{tableIndex,tableLine} = lineStr ;
    elseif tableIndex == 0
        fclose(fIN);
        error('Data line found before the first table title.')
    else
        tableLine = tableLine + 1 ;
        lineStr = add_comas(lineStr) ;
        csvTable{tableIndex,tableLine} = lineStr ;
    end
    nextLine = fgetl(fIN);
end
fclose(fIN);

% Columns 1 and 2 of csvTable hold the title and the headers, so the data
% rows start at column 3. Validate before creating the output file.
if row_index > size(csvTable,2) -2
    error('The row index exceeds the maximum number of rows!')
end

fOUT = fopen(filename_OUTPUT,'w');
if fOUT == -1
    error('Cannot open output file %s', filename_OUTPUT)
end

for k = 1 : size(csvTable,1)
    tableTitle = csvTable{k,1};
    columnHeaders = csvTable{k,2};
    selected_row = csvTable{k,row_index+2};
    fprintf(fOUT,'%s\n',tableTitle{:});
    fprintf(fOUT,'%s',columnHeaders{:});
    fprintf(fOUT,'\n');
    fprintf(fOUT,'%s',selected_row{:});
    fprintf(fOUT,'\n');
end
fclose(fOUT);
function line_with_comas = add_comas(this_line)
% ADD_COMAS  Append a comma to every cell of THIS_LINE except the last one.
%   THIS_LINE is a cell array of character vectors; the result has the same
%   size, with ',' concatenated onto each element but the final one.
line_with_comas = this_line ;
lastWithComma = length(this_line) - 1 ;
idx = 1 ;
while idx <= lastWithComma
    line_with_comas{idx} = strcat(this_line{idx}, ',') ;
    idx = idx + 1 ;
end

Run-time error 9 when using an array in a macro

I have been using the following macro and it works fine:
Sub PremTable()
    ' Fills the "Premium Tables" sheet by iterating over every combination of
    ' division, plan, band, sex and class. For each combination and each issue
    ' age from 18 upwards the inputs are written to their named ranges, the
    ' results in input!J4:J76 are copied and pasted transposed as values next
    ' to the age, and the combination is recorded in the *2 named ranges.
    '
    ' Array() is zero-based unless the module declares "Option Base 1", so the
    ' loops run from LBound to UBound instead of hard-coded 1..n bounds. With
    ' fixed bounds the last index is out of range and raises run-time error 9.

    ' Each variable needs its own "As" clause; "Dim a, b, c As Integer" types
    ' only c and leaves the others as Variant.
    Dim i As Long, m As Long, j As Long
    Dim PDFDiv As Variant, PDFClass As Variant, PDFSex As Variant
    Dim PDFPlan As Variant, LimAge As Variant
    Dim FlagD As Long, FlagC As Long, FlagS As Long, FlagP As Long, Band As Long

    PDFClass = Array("N", "S")
    PDFSex = Array("M", "F")
    PDFDiv = Array("G", "E")
    PDFPlan = Array(10, 20, 30)
    ' NOTE(review): this array is never read; the age limit below comes from
    ' the named range "LimAge" on the sheet.
    LimAge = Array(70, 60, 50)

    ' j is the number of output rows already written.
    j = 0
    For FlagD = LBound(PDFDiv) To UBound(PDFDiv)
        Range("div").Value = PDFDiv(FlagD)
        For FlagP = LBound(PDFPlan) To UBound(PDFPlan)
            Range("plan").Value = PDFPlan(FlagP)
            For Band = 1 To 3
                Range("band").Value = Band
                For FlagS = LBound(PDFSex) To UBound(PDFSex)
                    Range("sex").Value = PDFSex(FlagS)
                    For FlagC = LBound(PDFClass) To UBound(PDFClass)
                        Range("class").Value = PDFClass(FlagC)
                        m = 18
                        For i = 1 To Range("LimAge").Value - 17
                            Range("IssAge").Offset(i + j, 0) = m
                            Range("age").Value = Range("IssAge").Offset(i + j, 0)
                            Worksheets("input").Range("J4:J76").Copy
                            Worksheets("Premium Tables").Range("M1").Offset(i + j, 0).PasteSpecial xlPasteValues, Transpose:=True
                            Range("DIV2").Offset(i + j, 0) = Range("Div")
                            Range("PLAN2").Offset(i + j, 0) = Range("plan")
                            Range("BAND2").Offset(i + j, 0) = Range("band")
                            Range("SEX2").Offset(i + j, 0) = Range("sex")
                            Range("CLASS2").Offset(i + j, 0) = Range("class")
                            m = m + 1
                        Next i
                        ' After the loop i is one past the last row written.
                        j = j + i - 1
                    Next FlagC
                Next FlagS
            Next Band
        Next FlagP
    Next FlagD
End Sub
Now I have another, very similar spreadsheet in which I want to use this macro to create tables, but it always gives me "run-time error 9" for all of the arrays holding text values (for example, Range("class").Value = PDFClass(FlagC) raises run-time error 9).
Please advise! Thanks very much!