Pyspark automatically rename repeated columns

I want to automatically rename repeated columns of a df. For example:
df
Out[4]: DataFrame[norep1: string, num1: string, num1: bigint, norep2: bigint, num1: bigint, norep3: bigint]
Apply some function to end up with a df like:
f_rename_repcol(df)
Out[4]: DataFrame[norep1: string, num1_1: string, num1_2: bigint, norep2: bigint, num1_3: bigint, norep3: bigint]
I've already created my own function, and it works, but I'm sure there is a shorter and better way of doing it:
def f_df_col_renombra_rep(df):
    from collections import Counter
    from itertools import chain
    import numpy as np
    import pandas as pd
    columnas_original = np.array(df.columns)
    d1 = Counter(df.columns)
    i_corrige = [a > 1 for a in d1.values()]
    var_corrige = np.array(list(d1.keys()))[i_corrige]
    var_corrige_2 = [a for a in columnas_original if a in var_corrige]
    columnas_nuevas = []
    for var in var_corrige:
        aux_corr = [a for a in var_corrige_2 if a in var]
        i = 0
        columnas_nuevas_aux = []
        for valor in aux_corr:
            i += 1
            nombre_nuevo = valor + "_" + str(i)
            columnas_nuevas_aux.append(nombre_nuevo)
        columnas_nuevas.append(columnas_nuevas_aux)
    columnas_nuevas = list(chain.from_iterable(columnas_nuevas))
    indice_cambio = pd.Series(columnas_original).isin(var_corrige)
    i = 0
    j = 0
    colsalida = [None] * len(df.columns)
    for col in df.columns:
        if indice_cambio[i]:
            colsalida[i] = columnas_nuevas[j]
            j += 1
        else:
            colsalida[i] = col  # keep the original name
        i += 1
    df_out = df.toDF(*colsalida)
    return df_out

You can modify the renaming logic here to suit your needs, but broadly I find this the best way to rename all the duplicate columns:
old_col = df.schema.names
running_list = []
new_col = []
i = 0
for column in old_col:
    if column in running_list:
        new_col.append(column + "_" + str(i))
        i = i + 1
    else:
        new_col.append(column)
        running_list.append(column)
print(new_col)
This is the conversion I do; the exact suffix assigned to the duplicate columns doesn't matter much, as long as the name (prefix) stays the same and I can save the file.
To update the columns you can simply run:
df = df.toDF(*new_col)
This should update the column names and get rid of all the duplicate names.
If you want to keep the numbering as _1, _2, _3:
You can use a dictionary and a try/except block:
dict = {}
new_col = []
for column in old_col:
    try:
        i = dict[column] + 1
        new_col.append(column + "_" + str(i))
        dict[column] = i
    except KeyError:
        dict[column] = 1
        new_col.append(column + "_" + str(1))
print(new_col)
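For reference, a more compact sketch of the same idea using collections.Counter; the helper name dedup_columns is mine, and it only suffixes names that actually repeat, which matches the _1, _2, _3 numbering shown in the question:
from collections import Counter

def dedup_columns(df):
    '''Suffix repeated column names with _1, _2, ... and leave unique names untouched.'''
    totals = Counter(df.columns)   # how many times each name appears in total
    seen = Counter()               # how many times we have met each name so far
    new_cols = []
    for c in df.columns:
        if totals[c] > 1:
            seen[c] += 1
            new_cols.append(c + "_" + str(seen[c]))
        else:
            new_cols.append(c)
    return df.toDF(*new_cols)

df = dedup_columns(df)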

The easy way I am doing it is:
def col_duplicates(self):
    '''rename a dataframe with duplicated column names'''
    columnas = self.columns.copy()
    for i in range(len(columnas) - 1):
        for j in range(i + 1, len(columnas), 1):
            if columnas[i] == columnas[j]:
                columnas[j] = columnas[i] + '_dup_' + str(j)  # this line controls how to rename
    return self.toDF(*columnas)
use as:
new_df_without_duplicates = col_duplicates(df_with_duplicates)

Related

Pyspark / Databricks. Kolmogorov - Smirnov over time. Efficiently. In parallel

Hello StackOverflowers.
I have a pyspark dataframe that consists of a time_column and a column with values.
E.g.
+----------+--------------------+
| snapshot| values|
+----------+--------------------+
|2005-01-31| 0.19120256617637743|
|2005-01-31| 0.7972692479278891|
|2005-02-28|0.005236883665445502|
|2005-02-28| 0.5474099672222935|
|2005-02-28| 0.13077227571485905|
+----------+--------------------+
I would like to perform a KS test of each snapshot value with the previous one.
I tried to do it with a for loop.
import numpy as np
from scipy.stats import ks_2samp
import pyspark.sql.functions as F
def KS_for_one_snapshot(temp_df, snapshots_list, j, var="values"):
    sample1 = temp_df.filter(F.col("snapshot") == snapshots_list[j])
    sample2 = temp_df.filter(F.col("snapshot") == snapshots_list[j - 1])  # pick the previous snapshot as the one to compare with
    if sample1.count() == 0 or sample2.count() == 0:
        ks_value = -1  # previously "0 observations", which gave a type error
    else:
        ks_value, p_value = ks_2samp(np.array(sample1.select(var).collect()).reshape(-1),
                                     np.array(sample2.select(var).collect()).reshape(-1),
                                     alternative="two-sided",
                                     mode="auto")
    return ks_value

results = []
snapshots_list = df.select('snapshot').dropDuplicates().sort('snapshot').rdd.flatMap(lambda x: x).collect()
for j in range(len(snapshots_list) - 1):
    results.append(KS_for_one_snapshot(df, snapshots_list, j + 1))
results
But the data in reality is huge so it takes forever. I am using databricks and pyspark, so I wonder what would be a more efficient way to run it by avoiding the for loop and utilizing the available workers.
I tried to do it by using a udf but in vain.
Any ideas?
PS. you can generate the data with the following code.
from random import randint
df = (spark.createDataFrame(range(1, 1000), T.IntegerType())
      .withColumn('snapshot', F.array(F.lit("2005-01-31"), F.lit("2005-02-28"), F.lit("2005-03-30")).getItem((F.rand() * 3).cast("int")))
      .withColumn('values', F.rand()).drop('value')
      )
Update:
I tried the following using a UDF.
var_used = 'values'
data_input_1 = df.groupBy('snapshot').agg(F.collect_list(var_used).alias('value_list'))
data_input_2 = df.groupBy('snapshot').agg(F.collect_list(var_used).alias("value_list_2"))
windowSpec = Window.orderBy("snapshot")
data_input_2 = data_input_2.withColumn('snapshot_2', F.lag("snapshot", 1).over(Window.orderBy('snapshot'))).filter('snapshot_2 is not NULL')
data_input_final = data_input_1.join(data_input_2, data_input_1.snapshot == data_input_2.snapshot_2)

def KS_one_snapshot_general(sample_in_list_1, sample_in_list_2):
    if len(sample_in_list_1) == 0 or len(sample_in_list_2) == 0:
        ks_value = -1  # previously "0 observations", which gave a type error
    else:
        ks_value, p_value = ks_2samp(sample_in_list_1,
                                     sample_in_list_2,
                                     alternative="two-sided",
                                     mode="auto")
    return ks_value

import pyspark.sql.types as T
KS_one_snapshot_general_udf = udf(KS_one_snapshot_general, T.FloatType())
data_input_final.select(KS_one_snapshot_general_udf('value_list', 'value_list_2')).display()
This works fine if the dataset (per snapshot) is small, but if I increase the number of rows I end up with an error:
PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
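That PickleException typically means the UDF is returning a NumPy scalar (ks_2samp returns numpy.float64) where Spark expects a plain Python float for FloatType. A minimal sketch of the same UDF with the result cast before returning, assuming the data_input_final frame built above (ks_udf is just my name for it):
import pyspark.sql.functions as F
import pyspark.sql.types as T
from scipy.stats import ks_2samp

@F.udf(returnType=T.FloatType())
def ks_udf(sample_1, sample_2):
    # Return a plain Python float so Spark can serialize the result
    if not sample_1 or not sample_2:
        return -1.0
    ks_value, p_value = ks_2samp(sample_1, sample_2, alternative="two-sided", mode="auto")
    return float(ks_value)

data_input_final.select(ks_udf('value_list', 'value_list_2')).display()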

Talend - split a string to n rows

I would like to split a string in a column to n rows in Talend.
For example:
column
2aabbccdd
The first number is the "n" which I use to define the row lenght, so the expected result should be :
row 1 = aa
row 2 = bb
row 3 = cc
row 4 = dd
The idea here is to iterate on the string and cut it every 2 characters.
Any ideas, please?
I would use a tJavaFlex to split the string, with a trick to have n rows coming out of it.
tJavaFlex's main code:
int n = Integer.parseInt(row1.str.substring(0, 1)); // get n from the leading digit (e.g. "2" in "2aabbccdd")
String str2 = row1.str.substring(1); // get the string after n
int nbParts = (str2.length() + n - 1) / n; // ceiling division: number of chunks
System.out.println("number of parts = " + nbParts);

for (int i = 0; i < nbParts; i++)
{
    String part = str2.substring(i * n);
    if (part.length() > n)
    {
        part = part.substring(0, n);
    }
    row2.str = part;
And tJavaFlex's end code is just a closing brace:
}
The trick is to use a for loop in the main code, but only close it in the end code.
tFixedFlowInput contains just one column holding the input string.
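Outside of Talend, the splitting logic itself is easy to check in isolation; here is a minimal Python sketch of the same chunking, assuming a single leading digit for n as in the example:
def split_by_prefix_length(s):
    # "2aabbccdd" -> ["aa", "bb", "cc", "dd"]: the leading digit is the chunk length
    n = int(s[0])
    rest = s[1:]
    return [rest[i:i + n] for i in range(0, len(rest), n)]

print(split_by_prefix_length("2aabbccdd"))  # ['aa', 'bb', 'cc', 'dd']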

Deep learning chatbot specific Index error list index out of range

I am trying to follow a tutorial on how to make a deep learning chatbot with PyTorch. However, this code is quite complex for me and it has stopped with an "IndexError: list index out of range". I looked the error up and got the gist of what it usually means, but since this code is very complex for me I can't figure out how to solve the error.
This is the source tutorial: https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/chatbot_tutorial.ipynb#scrollTo=LTzdbPF-OBL9
Line 198 seems to be causing the error
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
This is the error log
Start preparing training data ...
Reading lines...
Traceback (most recent call last):
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 221, in <module>
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 209, in loadPrepareData
pairs = filterPairs(pairs)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in filterPairs
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in <listcomp>
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 198, in filterPair
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
IndexError: list index out of range
Read 442563 sentence pairs
Process finished with exit code 1
And this is my code, copied from my PyCharm project up to the block with the error. Since the code is huge, I could not copy all of it; the rest can be found in the GitHub source link above.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("D:\Documents\Python\intents", corpus_name)
def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "movie_lines.txt"))

# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
            lines[lineObj['lineID']] = lineObj
    return lines

# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]
            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
            lineIds = eval(convObj["utteranceIDs"])
            # Reassemble lines
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
    return conversations

# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
                                  lines, MOVIE_CONVERSATIONS_FIELDS)
# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)
# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count default tokens
        for word in keep_words:
            self.addWord(word)
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs
# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs

# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

MIN_COUNT = 3  # Minimum word count threshold for trimming
I really hope someone can help me fix this problem and help me understand why it happens.
In the end I changed
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
to
try:
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
except:
    return False
And now the code seems to be working.
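For what it's worth, the IndexError most likely means that some lines of formatted_movie_lines.txt split into fewer than two tab-separated fields (blank or malformed lines), so p[1] does not exist for those pairs. Instead of a bare except, a slightly more targeted fix is to drop malformed pairs explicitly; a minimal sketch using the same MAX_LENGTH as above:
def filterPair(p):
    # Keep only well-formed pairs; input sequences need to preserve the last word for the EOS token
    return (len(p) == 2
            and len(p[0].split(' ')) < MAX_LENGTH
            and len(p[1].split(' ')) < MAX_LENGTH)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]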

remove duplicates in a table (rexx language)

I have a question about removing duplicates from a table in REXX. I am working on NetPhantom applications, which use the REXX language.
I need a sample of how to remove the duplicates in a table.
I do have some thoughts on how to do it, such as using two loops over the two tables A and B, but I am not familiar with this.
My situation is:
rc = PanlistInsertData('A',0,SAMPLE)
TABLE A (this table has duplicate data)
123
1
1234
12
123
1234
I need to filter that duplicate data out into TABLE B, like this:
123
1234
1
12
You can use lookup stem variables to test whether you have already seen a value.
This should work (note: I have not tested it, so there could be syntax errors):
no = 0
yes = 1
lookup. = no   /* initialize the stem to no, not strictly needed */
j = 0
do i = 1 to in.0
   v = in.i
   if lookup.v <> yes then do
      j = j + 1
      out.j = v
      lookup.v = yes
   end
end
out.0 = j
You can eliminate the duplicates as follows:
If the current InStem element is the first element, move it to OutStem; otherwise check all the OutStem elements for the current InStem element.
If the element is found, iterate to the next InStem element; otherwise add the InStem element to OutStem.
Code snippet:
/* Input stem     - InStem.
   Output stem    - OutStem.
   Array counters - I, J, K */
J = 1
DO I = 1 TO InStem.0
   IF I = 1 THEN
      OutStem.I = InStem.I
   ELSE
      DO K = 1 TO J
         IF (InStem.I \= OutStem.K) & (K = J) THEN
            DO
               J = J + 1
               OutStem.J = InStem.I
            END
         ELSE
            DO
               IF (InStem.I == OutStem.K) THEN
                  ITERATE I
            END
      END
END
OutStem.0 = J
Hope this helps.

How do you order annotations by offset in brat?

When using the rapid annotation tool brat, it appears that the created annotations file presents the annotations in the order in which they were made by the user. If you start at the beginning of a document and go to the end performing annotation, then the annotations will naturally be in the correct offset order. However, if you need to go back earlier in the document and add another annotation, the offset order of the annotations in the output .ann file will be out of order.
How then can you rearrange the .ann file such that the annotations are in offset order when you are done? Is there some option within brat that allows you to do this or is it something that one has to write their own script to perform?
Hearing nothing, I wrote a Python script to accomplish what I had set out to do. First, I reorder all annotations by begin index. Second, I resequence the label numbers so that they are once again in ascending order.
import optparse
import sys

splitchar1 = '\t'
splitchar2 = ' '

# for brat, overlapped is not permitted (or at least a warning is generated)
# we could use this simplification in sorting by simply sorting on begin. it is
# probably a good idea anyway.
class AnnotationRecord:
    label = 'T0'
    type = ''
    begin = -1
    end = -1
    text = ''

    def __repr__(self):
        return (self.label + splitchar1
                + self.type + splitchar2
                + str(self.begin) + splitchar2
                + str(self.end) + splitchar1 + self.text)

def create_record(parts):
    record = AnnotationRecord()
    record.label = parts[0]
    middle_parts = parts[1].split(splitchar2)
    record.type = middle_parts[0]
    record.begin = middle_parts[1]
    record.end = middle_parts[2]
    record.text = parts[2]
    return record

def main(filename, out_filename):
    fo = open(filename, 'r')
    lines = fo.readlines()
    fo.close()
    annotation_records = []
    for line in lines:
        parts = line.split(splitchar1)
        annotation_records.append(create_record(parts))
    # sort based upon begin
    sorted_records = sorted(annotation_records, key=lambda a: int(a.begin))
    # now relabel based upon the sorted order
    label_value = 1
    for sorted_record in sorted_records:
        sorted_record.label = 'T' + str(label_value)
        label_value += 1
    # now write the resulting file to disk
    fo = open(out_filename, 'w')
    for sorted_record in sorted_records:
        fo.write(sorted_record.__repr__())
    fo.close()

# format of .ann file is T# Type Start End Text
# args are input file, output file
if __name__ == '__main__':
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   usage=globals()['__doc__'],
                                   version='$Id$')
    parser.add_option('-v', '--verbose', action='store_true',
                      default=False, help='verbose output')
    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error('missing argument')
    main(args[0], args[1])
    sys.exit(0)