I am trying to add a custom function (to remove names from a document), and I want this UDF to be incorporated into the Spark NLP pipeline. Can you please tell me where I am going wrong?
Here is the UDF I created to remove names:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def remove_names(text):
    # All_names = list(map(lambda x: x.lower(), All_names))
    All_names = ['josh', 'alice']
    split_str = text.split()
    found_names = [name for name in split_str if name in All_names]
    for name in found_names:
        text = text.replace(name, '')
    text = ' '.join(text.split())
    return text

udf_txt_clean = udf(lambda x: remove_names(x), StringType())
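Called directly on a plain string, the function behaves as expected (a quick sanity check against the hard-coded name list above):

remove_names('josh and sam are buddies')   # -> 'and sam are buddies'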
My pipeline looks like this:
documentAssembler = DocumentAssembler()\
    .setInputCol("combined_text")\
    .setOutputCol("document")

remove_names = udf_txt_clean('document') \
    .setInputCols(["document"]) \
    .setOutputCol("noname")

tokenizer = Tokenizer() \
    .setInputCols(["noname"]) \
    .setOutputCol("token")

spellModel = ContextSpellCheckerModel\
    .pretrained()\
    .setInputCols("token")\
    .setOutputCol("checked")

normalizer = Normalizer() \
    .setInputCols(["checked"]) \
    .setOutputCol("normalized")\
    .setLowercase(True)

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

tokenassembler = TokenAssembler()\
    .setInputCols(["document", "cleanTokens"]) \
    .setOutputCol("clean_text")

nlpPipeline = Pipeline(stages=[
    documentAssembler,
    remove_names,
    tokenizer,
    spellModel,
    normalizer,
    stopwords_cleaner,
    tokenassembler
])
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)
result = pipelineModel.transform(sdf)
result.select('text', explode('clean_text.result').alias('clean_text')).display()
Sample data to recreate the issue:

import pandas as pd

text = 'josh and sam are buddies. They (both) like <b>running</b>. They got better at it over the weekend'
test = pd.DataFrame({"test": [text]})
sdf = spark.createDataFrame(test)
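For context, a plain Spark SQL UDF is not a pyspark.ml Transformer, so it cannot be used directly as a stage in the Pipeline above. One possible workaround (a minimal sketch, not Spark NLP's own annotator mechanism, and assuming the input column is named combined_text as in the pipeline) is to apply the UDF to the DataFrame before the DocumentAssembler:

from pyspark.sql.functions import col

# Hypothetical pre-processing step: strip the names before any Spark NLP stage
sdf_clean = sdf.withColumn("combined_text", udf_txt_clean(col("combined_text")))

# documentAssembler (setInputCol("combined_text")) would then read the already
# cleaned column, and the remove_names entry would be dropped from the stages list.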
Related
I am trying to upsert in Databricks using a merge statement in PySpark. I wanted to know whether expressions (e.g. adding two columns, case when) are allowed in the whenMatchedUpdate part. For example, I want to do something like this:
deltaTableTarget = DeltaTable.forPath(spark, delta_table_path)

deltaTableTarget.alias('TargetTable') \
    .merge(
        broadcast(df_transformed.alias('DeltaSource')),
        "DeltaSource.primary_key == TargetTable.primary_key"
    ) \
    .whenMatchedUpdate(set =
        {
            "aggcount": "DeltaSource.count + TargetTable.count",
            "max_date": "case when DeltaSource.max_date > TargetTable.max_date then DeltaSource.max_date else TargetTable.max_date end"
        }
    ) \
    .whenNotMatchedInsertAll() \
    .execute()
If I understand your logic correctly, you can just take the maximum of the two columns, right?
deltaTableTarget = DeltaTable.forPath(spark, delta_table_path)

deltaTableTarget.alias('TargetTable') \
    .merge(
        broadcast(df_transformed.alias('DeltaSource')),
        "DeltaSource.primary_key == TargetTable.primary_key"
    ) \
    .whenMatchedUpdate(set =
        {
            "aggcount": "DeltaSource.count + TargetTable.count",
            "max_date": "GREATEST(DeltaSource.max_date, TargetTable.max_date)"
        }
    ) \
    .whenNotMatchedInsertAll() \
    .execute()
If that is not what you need, you could instead use multiple whenMatchedUpdate() clauses with a condition:
deltaTableTarget = DeltaTable.forPath(spark, delta_table_path)

deltaTableTarget.alias('TargetTable') \
    .merge(
        broadcast(df_transformed.alias('DeltaSource')),
        "DeltaSource.primary_key == TargetTable.primary_key"
    ) \
    .whenMatchedUpdate(
        condition = 'DeltaSource.max_date > TargetTable.max_date',
        set =
        {
            "aggcount": "DeltaSource.count + TargetTable.count",
            "max_date": "DeltaSource.max_date"
        }
    ) \
    .whenMatchedUpdate(set =
        {
            "aggcount": "DeltaSource.count + TargetTable.count",
            "max_date": "TargetTable.max_date"
        }
    ) \
    .whenNotMatchedInsertAll() \
    .execute()
I am trying to run a code repository downloaded from GitHub, following its instructions, but I am getting the following error:
TypeError: __init__() missing 1 required positional argument: 'image_paths'
I get this error at code line 63 (preprocessing=preprocessing).
When I start the program in debug mode, it shows the following error:
Unable to get repr for <class 'albumentations.core.composition.Compose'>
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import torch
from skimage import io
from utils import adjust_sar_contrast, compute_building_score, plot_images
sys.path.append('/home/salman/Downloads/SpaceNet_SAR_Buildings_Solutions-master/4-motokimura/tmp/work')
from spacenet6_model.configs.load_config import get_config_with_previous_experiment
from spacenet6_model.datasets import SpaceNet6TestDataset
from spacenet6_model.models import get_model
from spacenet6_model.transforms import get_augmentation, get_preprocess
# select previous experiment to load
exp_id = 14
exp_log_dir = "/home/salman/Downloads/SpaceNet_SAR_Buildings_Solutions-master/4-motokimura/tmp/logs" # None: use default
# select device to which the model is loaded
cuda = True
if cuda:
    device = 'cuda'
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
else:
    device = 'cpu'
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
# overwrite default config with previous experiment
config = get_config_with_previous_experiment(exp_id=exp_id, exp_log_dir=exp_log_dir)
# overwrite additional hyper parameters
config.MODEL.DEVICE = device
config.WEIGHT_ROOT = "/home/salman/Downloads/SpaceNet_SAR_Buildings_Solutions-master/4-motokimura/tmp/weights/"
config.MODEL.WEIGHT = f"/home/salman/Downloads/SpaceNet_SAR_Buildings_Solutions-master/4-motokimura/tmp/weights/exp_{exp_id:04d}/model_best.pth"
config.INPUT.MEAN_STD_DIR = "/home/salman/Downloads/SpaceNet_SAR_Buildings_Solutions-master/4-motokimura/tmp/work/models/image_mean_std/"
config.INPUT.TEST_IMAGE_DIR = "/home/salman/data/SN6_buildings_AOI_11_Rotterdam_test_public/test_public/AOI_11_Rotterdam/SAR-Intensity"
config.INPUT.SAR_ORIENTATION="/home/salman/Downloads/SpaceNet_SAR_Buildings_Solutions-master/4-motokimura/tmp/work/static/SAR_orientations.txt"
config.TRAIN_VAL_SPLIT_DIR="/home/salman/Downloads/data/spacenet6/split"
config.PREDICTION_ROOT="/home/salman/Downloads/data/spacenet6/predictions"
config.POLY_CSV_ROOT="/home/salman/Downloads/data/spacenet6/polygons"
config.CHECKPOINT_ROOT="/home/salman/Downloads/data/spacenet6/ceckpoints"
config.POLY_OUTPUT_PATH="/home/salman/Downloads/data/spacenet6/val_polygons"
config.freeze()
print(config)
model = get_model(config)
model.eval();
from glob import glob
image_paths = glob(os.path.join(config.INPUT.TEST_IMAGE_DIR, "*.tif"))
#print(image_paths)
preprocessing = get_preprocess(config, is_test=True)
augmentation = get_augmentation(config, is_train=False)
test_dataset = SpaceNet6TestDataset(
    config,
    augmentation=augmentation,
    preprocessing=preprocessing
)
test_dataset_vis = SpaceNet6TestDataset(
    config,
    augmentation=augmentation,
    preprocessing=None
)
channel_footprint = config.INPUT.CLASSES.index('building_footprint')
channel_boundary = config.INPUT.CLASSES.index('building_boundary')
score_thresh = 0.5
alpha = 1.0
start_index = 900
N = 20
for i in range(start_index, start_index + N):
    image_vis = test_dataset_vis[i]['image']
    image = test_dataset[i]['image']
    x_tensor = image.unsqueeze(0).to(config.MODEL.DEVICE)
    pr_score = model.module.predict(x_tensor)
    pr_score = pr_score.squeeze().cpu().numpy()
    pr_score_building = compute_building_score(
        pr_score[channel_footprint],
        pr_score[channel_boundary],
        alpha=alpha
    )
    pr_mask = pr_score_building > score_thresh
    rotated = test_dataset[i]['rotated']
    if rotated:
        image_vis = np.flipud(np.fliplr(image_vis))
        pr_mask = np.flipud(np.fliplr(pr_mask))
    plot_images(
        SAR_intensity_0=(adjust_sar_contrast(image_vis[:, :, 0]), 'gray'),
        building_mask_pr=(pr_mask, 'viridis')
    )
The function which this code calls is given below:
def get_spacenet6_preprocess(config, is_test):
    """
    """
    mean_path = os.path.join(
        config.INPUT.MEAN_STD_DIR,
        config.INPUT.IMAGE_TYPE,
        'mean.npy'
    )
    mean = np.load(mean_path)
    mean = mean[np.newaxis, np.newaxis, :]
    std_path = os.path.join(
        config.INPUT.MEAN_STD_DIR,
        config.INPUT.IMAGE_TYPE,
        'std.npy'
    )
    std = np.load(std_path)
    std = std[np.newaxis, np.newaxis, :]
    if is_test:
        to_tensor = albu.Lambda(
            image=functools.partial(_to_tensor)
        )
    else:
        to_tensor = albu.Lambda(
            image=functools.partial(_to_tensor),
            mask=functools.partial(_to_tensor)
        )
    preprocess = [
        albu.Lambda(
            image=functools.partial(
                _normalize_image,
                mean=mean,
                std=std
            )
        ),
        to_tensor,
    ]
    return albu.Compose(preprocess)
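For what it's worth, the TypeError says that SpaceNet6TestDataset's __init__ was not given image_paths, and the script above builds an image_paths list from the test directory but never passes it on. A hedged guess, assuming the constructor signature is (config, image_paths, ...), which the repository would confirm, is:

# Hypothetical fix: pass the globbed .tif paths into the dataset constructors
test_dataset = SpaceNet6TestDataset(
    config,
    image_paths,
    augmentation=augmentation,
    preprocessing=preprocessing
)
test_dataset_vis = SpaceNet6TestDataset(
    config,
    image_paths,
    augmentation=augmentation,
    preprocessing=None
)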
I followed this document: https://docs.aws.amazon.com/dynamodb-encryption-client/latest/devguide/java-examples.html and set up the encryption client and mapper to encrypt items and batchSave them into a table, but it is not working and throws the errors given below.
Stack trace details:
ERROR Client: Application diagnostics message: User class threw exception: java.lang.IllegalAccessError: tried to access class com.amazonaws.services.dynamodbv2.datamodeling.DynamoDBMappingsRegistry from class com.amazonaws.services.dynamodbv2.datamodeling.AttributeEncryptor
    at com.amazonaws.services.dynamodbv2.datamodeling.AttributeEncryptor.getModelClassMetadata(AttributeEncryptor.java:156)
    at com.amazonaws.services.dynamodbv2.datamodeling.AttributeEncryptor.transform(AttributeEncryptor.java:65)
    at com.amazonaws.services.dynamodbv2.datamodeling.DynamoDBMapper.transformAttributes(DynamoDBMapper.java:2180)
    at com.amazonaws.services.dynamodbv2.datamodeling.DynamoDBMapper.batchWrite(DynamoDBMapper.java:1229)
    at com.amazonaws.services.dynamodbv2.datamodeling.AbstractDynamoDBMapper.batchSave(AbstractDynamoDBMapper.java:193)
    at com.amazon.payrolldatalakeemr.awsoperations.DDBOperations$.batchSaveInDDB(DDBOperations.scala:40)
Config details:
AWSJavaSDKExternalRelease = 1.11.x;
# Spark dependencies
Spark-core = 2.2.1;
Spark-sql = 2.2.1;
DaxJavaClient = 1.0;
ANTLR-Runtime = 3.5.x;
DynamoDbGrammar = 1.0;
Lombok = 1.16.x;
LombokUtils = 1.1;
Maven-com-amazonaws_aws-dynamodb-encryption-java = 1.x;
Mapper code:
def getDynamoDBMapper(region: String): DynamoDBMapper = {
  val cmkArn = "*****************************"
  val kms: AWSKMS = AWSKMSClientBuilder.standard().withRegion(region).build()
  val cmp: DirectKmsMaterialProvider = new DirectKmsMaterialProvider(kms, cmkArn)
  val encryptor: DynamoDBEncryptor = DynamoDBEncryptor.getInstance(cmp)
  val mapperConfig: DynamoDBMapperConfig = DynamoDBMapperConfig.builder.withSaveBehavior(DynamoDBMapperConfig.SaveBehavior.CLOBBER).build
  new DynamoDBMapper(ddclient, mapperConfig, new AttributeEncryptor(encryptor))
}
Resolved after adding the Spark property:
--conf spark.driver.userClassPathFirst=true
I am facing this error while running a Spark job in standalone cluster mode.
I have the following code in PySpark:
def join_client_info(self, cur_df):
    raw_clrr_df = sqlContext.read.parquet(hdfs_path+'/data/life400/CLRRPF')\
        .selectExpr(['CLNTNUM as cliNum',
                     'CLRRROLE'])
    salh_df = sqlContext.read.parquet(hdfs_path+'/data/life400/SALHPF')\
        .selectExpr(['CLNTNUM as cliNum',
                     'DECL_GR_SALARY as proposerSalary'])
    spaceDeleteUDF = udf(lambda s: re.sub('[^A-Za-z0-9]+', "", s), StringType())
    clrr_df = raw_clrr_df.withColumn('clientRole',
                                     spaceDeleteUDF(raw_clrr_df['CLRRROLE'])).drop('CLRRROLE')
    cli_num = cur_df.select(['cliNum']).collect()[0]['cliNum']
    number_of_pols_lf = clrr_df.filter('cliNum='+cli_num)\
        .where(clrr_df['clientRole']=='LF')\
        .count()
    number_of_pols_ow = clrr_df.filter('cliNum='+cli_num)\
        .where(clrr_df['clientRole']=='OW')\
        .count()
    with_lf_num_of_policies = cur_df.withColumn('numberOfPolsIn',
                                                lit(number_of_pols_lf))
    with_lf_ow_num_of_policies = with_lf_num_of_policies.withColumn(
        'numberOfPolsOw',
        lit(number_of_pols_ow))
    # print(cur_df)
    with_proposer_sal = salh_df.filter('cliNum='+cli_num)
    return with_lf_ow_num_of_policies.join(with_proposer_sal, 'cliNum', 'inner')
If I uncomment the "print(cur_df)" line, it works fine and doesn't give me an error. I find this behaviour weird. What am I missing here?
I've just updated Liquidsoap to 1.3.0 and now get_process_lines does not return anything.
def get_request() =
  # Get the URI
  lines = get_process_lines("curl http://localhost:3000/api/v1/liquidsoap/next/my-radio")
  log("liquidsoap curl returns #{lines}")
  uri = list.hd(default="",lines)
  log("liquidsoap will try and play #{uri}")
  # Create a request
  request.create(uri)
end
I read in the CHANGELOG:
- Moved get_process_lines and get_process_output to utils.liq, added optional env parameter
Does it mean I have to do something to use utils.liq in my script now?
The full script is as follows:
set("log.file",false)
set("log.stdout",true)
set("log.level",3)
def apply_metadata(m) =
title = m["title"]
artist = m["artist"]
log("Now playing: #{title} by #{artist}")
end
# Our custom request function
def get_request() =
# Get the URI
lines = get_process_lines("curl http://localhost:3000/api/v1/liquidsoap/next/my-radio")
log("liquidsoap curl returns #{lines}")
uri = list.hd(default="",lines)
log("liquidsoap will try and play #{uri}")
# Create a request
request.create(uri)
end
def my_safe(s) =
security = sine()
fallback(track_sensitive=false,[s,security])
end
s = request.dynamic(id="s",get_request)
s = on_metadata(apply_metadata,s)
s = crossfade(s)
s = my_safe(s)
# We output the stream to an icecast
# server, in ogg/vorbis format.
log("liquidsoap starting")
output.icecast(
%mp3(id3v2=true,bitrate=128,samplerate=44100),
host = "localhost",
port = 8000,
password = "PASSWORD",
mount = "myradio",
genre="various",
url="http://www.myradio.fr",
description="My Radio",
s
)
Of course, the API is working:
$ curl http://localhost:3000/api/v1/liquidsoap/next/my-radio
annotate:title="Chamakay",artist="Blood Orange",album="Cupid Deluxe":http://localhost/stream/3.mp3
A simpler example:
lines = get_process_lines("echo hi")
log("lines = #{lines}")
line = list.hd(default="",lines)
log("line = #{line}")
returns the following logs:
2017/05/05 15:24:42 [lang:3] lines = []
2017/05/05 15:24:42 [lang:3] line =
Many thanks in advance for your help!
geoffroy
The issue was fixed in Liquidsoap 1.3.1.
Fixed:
Fixed run_process, get_process_lines, get_process_output when compiling with OCaml <= 4.03 (#437, #439)
https://github.com/savonet/liquidsoap/blob/1.3.1/CHANGES#L12