Publish to Google Pub/Sub Lite from PySpark

The code is from the Google doc:
https://cloud.google.com/pubsub/lite/docs/write-messages-apache-spark
I'm trying to publish to a Pub/Sub Lite topic from PySpark.
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, create_map, col, lit, when
from pyspark.sql.types import BinaryType, StringType
import uuid
# TODO(developer):
project_number = xxx
location = "us-central1"
topic_id = "kosmin"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.google.cloud:pubsublite-spark-sql-streaming:0.4.1,com.google.cloud:google-cloud-pubsublite:1.6.1 pyspark-shell'
spark = SparkSession.builder.appName("write-app").getOrCreate()
# The linked sample builds sdf from a rate source first (1 row per second,
# columns: timestamp, value); without this line sdf is undefined.
sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

sdf = (
    sdf.withColumn("key", lit("example").cast(BinaryType()))
    .withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
    .withColumnRenamed("timestamp", "event_timestamp")
    # Populate the attributes field. For example, an even value will
    # have {"key1", [b"even"]}.
    .withColumn(
        "attributes",
        create_map(
            lit("key1"),
            array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
        ),
    )
    .drop("value")
)
query = (
    sdf.writeStream.format("pubsublite")
    .option(
        "pubsublite.topic",
        f"projects/{project_number}/locations/{location}/topics/{topic_id}",
    )
    # Required. Use a unique checkpoint location for each job.
    .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
    .outputMode("append")
    # .trigger(processingTime="1 second")
    .start()
)
# Wait 60 seconds to terminate the query.
query.awaitTermination(60)
query.stop()
but I'm getting:
22/07/29 19:09:38 ERROR Utils: Aborting task
com.google.api.gax.rpc.ApiException:
at com.google.cloud.pubsublite.internal.CheckedApiException.<init>(CheckedApiException.java:51)
at com.google.cloud.pubsublite.internal.CheckedApiException.<init>(CheckedApiException.java:55)
at com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical(ExtractStatus.java:53)
at com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions.newServiceClient(PslWriteDataSourceOptions.java:131)
......
Caused by: java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;CLjava/lang/Object;)V
at io.grpc.Metadata$Key.validateName(Metadata.java:754)
at io.grpc.Metadata$Key.<init>(Metadata.java:762)
at io.grpc.Metadata$Key.<init>(Metadata.java:671)
at io.grpc.Metadata$AsciiKey.<init>(Metadata.java:971)
at io.grpc.Metadata$AsciiKey.<init>(Metadata.java:966)
at io.grpc.Metadata$Key.of(Metadata.java:708)
at io.grpc.Metadata$Key.of(Metadata.java:704)
at com.google.api.gax.grpc.GrpcHeaderInterceptor.<init>(GrpcHeaderInterceptor.java:60)
....
What am I missing? The credentials are set; maybe I'm missing some package at submit time?

It worked with:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars=pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar pyspark-shell'
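For context, the NoSuchMethodError on com.google.common.base.Preconditions.checkArgument raised from io.grpc usually indicates that an older Guava ended up on the classpath when the connector is pulled in piecemeal via --packages; the shaded with-dependencies jar bundles a compatible Guava. A minimal sketch of the working setup (the jar path is a placeholder for wherever the downloaded jar lives):

import os
from pyspark.sql import SparkSession

# Placeholder path: point this at the downloaded shaded connector jar.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars=pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar pyspark-shell"
)

# PYSPARK_SUBMIT_ARGS is only honored if it is set before the first
# SparkSession/SparkContext is created in this process.
spark = SparkSession.builder.appName("write-app").getOrCreate()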

Related

Spark stream stops abruptly - "the specified path does not exist"

I am working with Spark Structured Streaming. My stream works fine, but after some time it just stops because of the issue below.
Any suggestions on what the reason could be and how to resolve this issue?
java.io.FileNotFoundException: Operation failed: "The specified path does not exist.", 404, GET, https://XXXXXXXX.dfs.core.windows.net/output?upn=false&resource=filesystem&maxResults=5000&directory=XXXXXXXX&timeout=90&recursive=true, PathNotFound, "The specified path does not exist. RequestId:d1b7c77f-e01f-0027-7f09-4646f7000000 Time:2022-04-01T20:47:30.1791444Z"
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.checkException(AzureBlobFileSystem.java:1290)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.listKeysWithPrefix(AzureBlobFileSystem.java:530)
at com.databricks.tahoe.store.EnhancedAzureBlobFileSystemUpgrade.listKeysWithPrefix(EnhancedFileSystem.scala:605)
at com.databricks.tahoe.store.EnhancedDatabricksFileSystemV2.$anonfun$listKeysWithPrefix$1(EnhancedFileSystem.scala:374)
at com.databricks.backend.daemon.data.client.DBFSV2.$anonfun$listKeysWithPrefix$1(DatabricksFileSystemV2.scala:247)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:395)
at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:484)
at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:504)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:266)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:261)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:258)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.withAttributionContext(DatabricksFileSystemV2.scala:510)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:305)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:297)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.withAttributionTags(DatabricksFileSystemV2.scala:510)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:479)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:404)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.recordOperationWithResultTags(DatabricksFileSystemV2.scala:510)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:395)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:367)
at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.recordOperation(DatabricksFileSystemV2.scala:510)
at com.databricks.backend.daemon.data.client.DBFSV2.listKeysWithPrefix(DatabricksFileSystemV2.scala:240)
at com.databricks.tahoe.store.EnhancedDatabricksFileSystemV2.listKeysWithPrefix(EnhancedFileSystem.scala:374)
at com.databricks.tahoe.store.AzureLogStore.listKeysWithPrefix(AzureLogStore.scala:54)
at com.databricks.tahoe.store.DelegatingLogStore.listKeysWithPrefix(DelegatingLogStore.scala:251)
at com.databricks.sql.fileNotification.autoIngest.FileEventBackfiller$.listFiles(FileEventWorkerThread.scala:967)
at com.databricks.sql.fileNotification.autoIngest.FileEventBackfiller.runInternal(FileEventWorkerThread.scala:876)
at com.databricks.sql.fileNotification.autoIngest.FileEventBackfiller.run(FileEventWorkerThread.scala:809)
Caused by: Operation failed: "The specified path does not exist.", 404, GET, https://XXXXXXXXXX.dfs.core.windows.net/output?upn=false&resource=filesystem&maxResults=5000&directory=XXXXXXXX&timeout=90&recursive=true, PathNotFound, "The specified path does not exist. RequestId:02ae07cf-901f-0001-080e-46dd43000000 Time:2022-04-01T21:21:40.2136657Z"
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsRestOperation.execute(AbfsRestOperation.java:241)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsClient.listPath(AbfsClient.java:235)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.listFiles(AzureBlobFileSystemStore.java:1112)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.access$200(AzureBlobFileSystemStore.java:143)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore$1.fetchMoreResults(AzureBlobFileSystemStore.java:1052)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore$1.<init>(AzureBlobFileSystemStore.java:1033)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.listKeysWithPrefix(AzureBlobFileSystemStore.java:1029)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.listKeysWithPrefix(AzureBlobFileSystem.java:527)
... 27 more
Below is my code:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StringType
from pyspark.sql import functions as F
from delta.tables import *
spark.sql("set spark.sql.files.ignoreMissingFiles=true")
filteredRawDF = ""
try:
    filteredRawDF = spark.readStream.format("cloudFiles") \
        .option("cloudFiles.format", "json") \
        .option("cloudFiles.schemaLocation", landingcheckPointFilePath) \
        .option("cloudFiles.inferColumnTypes", "true") \
        .load(landingFilePath) \
        .select(from_json('body', schema).alias('temp')) \
        .select(explode("temp.report.data").alias("details")) \
        .select("details",
                explode("details.breakdown").alias("inner_breakdown")) \
        .select("details", "inner_breakdown",
                explode("inner_breakdown.breakdown").alias("outer_breakdown")) \
        .select(to_timestamp(col("details.name"), "yyyy-MM-dd'T'HH:mm:ss+SSSS").alias('datetime'),
                col("details.year"),
                col("details.day"),
                col("details.hour"),
                col("details.minute"),
                col("inner_breakdown.name").alias("hotelName"),
                col("outer_breakdown.name").alias("checkindate"),
                col("outer_breakdown.counts")[0].cast("int").alias("HdpHits"))
except Exception as e:
    print(e)

query = filteredRawDF \
    .writeStream \
    .format("delta") \
    .option("mergeSchema", "true") \
    .outputMode("append") \
    .option("checkpointLocation", checkPointPath) \
    .trigger(processingTime='50 seconds') \
    .start(savePath)
Thanks

AWS GlueStudio RDS -> Redshift invalid timestamp format

I am trying to create an AWS Glue ETL job to move data from Aurora RDS to Redshift, but I cannot work out how to get the timestamp fields properly mapped. All stages of the job show a valid preview of the expected data, but the job always fails with the following error:
py4j.protocol.Py4JJavaError: An error occurred while calling o179.pyWriteDynamicFrame.
: java.sql.SQLException:
Error (code 1206) while loading data into Redshift: "Invalid timestamp format or value [YYYY-MM-DD HH24:MI:SS]"
Table name: public.stage_table_ae89e9dffe974b649bbf4852e49a4b12
Column name: updated_at
Column type: timestamp(0)
Raw line: 1234,5341,1121,0,2022-01-06 16:29:55.000000000,2022-01-06 16:29:55.000000000,1,1,Suzy
Raw field value: 0
I have tried applying a date format to remove the microseconds, and I have tried forcing quotes around the date fields; nothing works.
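For illustration only, a hedged sketch of the kind of date-format attempt described above (not the actual code from the job; column names are taken from the generated script below). The idea is to drop the fractional seconds in the Spark SQL node so the values match the 'YYYY-MM-DD HH24:MI:SS' format named in the error:

# Hypothetical variant of SqlQuery0: reformat the timestamps at second precision.
SqlQuery0 = """
select
    id,
    date_format(created_at, 'yyyy-MM-dd HH:mm:ss') as created_at,
    date_format(updated_at, 'yyyy-MM-dd HH:mm:ss') as updated_at
from
    apiData
"""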
Here is the generated script
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrameCollection
from awsglue.dynamicframe import DynamicFrame
from awsglue import DynamicFrame
# Script generated for node Custom transform
def CastIntsTransform(glueContext, dfc) -> DynamicFrameCollection:
    df = dfc.select(list(dfc.keys())[0])
    df_resolved = (
        df.resolveChoice(specs=[("id", "cast:bigint")])
        .resolveChoice(specs=[("user_id", "cast:bigint")])
        .resolveChoice(specs=[("connected_user_id", "cast:bigint")])
        .resolveChoice(specs=[("mg_id", "cast:bigint")])
        .resolveChoice(specs=[("access_level", "cast:tinyint")])
        .resolveChoice(specs=[("status", "cast:tinyint")])
    )
    return DynamicFrameCollection({"CustomTransform0": df_resolved}, glueContext)

def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node JDBC Connection
JDBCConnection_node1 = glueContext.create_dynamic_frame.from_catalog(
    database="ABC123",
    table_name="user_connections",
    transformation_ctx="JDBCConnection_node1",
)
# Script generated for node SQL
SqlQuery0 = """
select
id,
user_id,
connected_user_id,
COALESCE(mg_id, 0) mg_id,
created_at,
updated_at,
updated_at,
access_level,
status,
COALESCE(nickname, '') nickname
from
apiData
"""
SQL_node1647619002820 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"apiData": JDBCConnection_node1},
    transformation_ctx="SQL_node1647619002820",
)
# Script generated for node Custom transform
Customtransform_node1647612655336 = CastIntsTransform(
    glueContext,
    DynamicFrameCollection(
        {"SQL_node1647619002820": SQL_node1647619002820}, glueContext
    ),
)
# Script generated for node Select From Collection
SelectFromCollection_node1647613332516 = SelectFromCollection.apply(
    dfc=Customtransform_node1647612655336,
    key=list(Customtransform_node1647612655336.keys())[0],
    transformation_ctx="SelectFromCollection_node1647613332516",
)
# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=SelectFromCollection_node1647613332516,
    mappings=[
        ("id", "bigint", "id", "bigint"),
        ("user_id", "bigint", "user_id", "bigint"),
        ("connected_user_id", "bigint", "connected_user_id", "bigint"),
        ("mg_id", "bigint", "mg_id", "bigint"),
        ("created_at", "timestamp", "created_at", "timestamp"),
        ("updated_at", "timestamp", "updated_at", "timestamp"),
        ("access_level", "tinyint", "access_level", "tinyint"),
        ("status", "tinyint", "status", "tinyint"),
        ("nickname", "varchar", "nickname", "varchar"),
    ],
    transformation_ctx="ApplyMapping_node2",
)
# Script generated for node Amazon Redshift
pre_query = "drop table if exists public.stage_table_cd5d65739d334453938f090ea1cb2d6e;create table public.stage_table_cd5d65739d334453938f090ea1cb2d6e as select * from public.test_user_connections where 1=2;"
post_query = "begin;delete from public.test_user_connections using public.stage_table_cd5d65739d334453938f090ea1cb2d6e where public.stage_table_cd5d65739d334453938f090ea1cb2d6e.id = public.test_user_connections.id; insert into public.test_user_connections select * from public.stage_table_cd5d65739d334453938f090ea1cb2d6e; drop table public.stage_table_cd5d65739d334453938f090ea1cb2d6e; end;"
AmazonRedshift_node1647612972417 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=ApplyMapping_node2,
    catalog_connection="ABC123",
    connection_options={
        "database": "test",
        "dbtable": "public.stage_table_cd5d65739d334453938f090ea1cb2d6e",
        "preactions": pre_query,
        "postactions": post_query,
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="AmazonRedshift_node1647612972417",
)
job.commit()

MongoDB connection URL not working in AWS Glue Connection screen

I am trying to connect to MongoDB from the AWS Glue connection screen and am getting an error. I have tried various URLs:
mongodb://server1:2066,server2:2066,server3:2066/dbname.collection?authMechanism=PLAIN&authSource=&external
mongodb://server1:2066,server2:2066,server3:2066/?authMechanism=PLAIN&authSource=&external
Error in the AWS Glue connection output in CloudWatch:
Check that your connection definition references your Mongo database with correct URL syntax, username, and password.
Exiting with error code 30
It works in a Glue job with the same VPC/subnet/security groups:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext, SparkConf
from awsglue.context import GlueContext
from awsglue.job import Job
import time
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
mongo_uri = "mongodb://_SERVER_1:2066,_SERVER_2:2066,_SERVER_3:2066/?authMechanism=PLAIN&authSource=$external"
read_mongo_options = {
    "uri": mongo_uri,
    "database": "_DB_",
    # "authMechanism": "PLAIN",
    # "authSource": "$external",
    "collection": "_collection_",
    "username": "****",
    "password": "***",
    "partitioner": "MongoSamplePartitioner",
    "partitionerOptions.partitionSizeMB": "10",
    "partitionerOptions.partitionKey": "_id",
    "inferSchema": "false"
}
logger = glueContext.get_logger()
logger.info("Connecting...")
s3_location = 's3://_bucket_/output/'
dynamic_frame = glueContext.create_dynamic_frame.from_options(
    connection_type="mongodb",
    connection_options=read_mongo_options,
)
print("Connected!")

How to find occurrences of words in a log file with a PySpark RDD

I have a log file. I have read the file and converted it into an RDD. I want to count the number of times each server name is present in the file. My code:
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)
base_df = spark.read.text("/content/fsm-20210817.logs")
base_df_rdd = base_df.rdd
server_list = ['nginx-ingress-controller-5b6697898-zqxl4','cert-manager-5695c78d49-q9s9j']
for i in server_list:
    res = base_df.rdd.map(lambda x: x[0].split(' ').count(i)).sum()
    print(i, res)
I'm getting output as -
nginx-ingress-controller-5b6697898-zqxl4 0
cert-manager-5695c78d49-q9s9j 0
I have base_df_rdd as-
[Row(value='{"log":{"offset":5367960,"file":{"path":"/var/log/containers/cert-manager-5695c78d49-q9s9j_cert-manager_cert-manager-cf5af9cbaccdccd8f637d0ba7313996b1cfc0bab7b25fe9f157953918016ac84.log"}},"stream":"stderr","message":"E0817 00:00:00.144379 1 sync.go:183] cert-manager/controller/challenges \\"msg\\"=\\"propagation check failed\\" \\"error\\"=\\"wrong status code \'404\', expected \'200\'\\" \\"dnsName\\"=\\"prodapi.fsmbuddy.com\\" \\"resource_kind\\"=\\"Challenge\\" \\"resource_name\\"=\\"prodapi-fsmbuddy-tls-cert-vtvdq-1471208926-2451592135\\" \\"resource_namespace\\"=\\"default\\" \\"resource_version\\"=\\"v1\\" \\"type\\"=\\"HTTP-01\\" ","#timestamp":"2021-08-17T00:00:00.144Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","provider":"aws","availability_zone":"ap-south-1b","machine":{"type":"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"architecture":"x86_64","os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"},"hostname":"ip-192-168-18-105.ap-south-1.compute.internal","containerized":false,"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"cert-manager"},"labels":{"helm_sh/chart":"cert-manager-v1.0.1","app_kubernetes_io/component":"controller","app_kubernetes_io/managed-by":"Tiller","pod-template-hash":"5695c78d49","app_kubernetes_io/instance":"cert-manager","app_kubernetes_io/name":"cert-manager","app":"cert-manager"},"pod":{"uid":"0be2ef9e-f2ee-4d40-b74b-17734527d78c","name":"cert-manager-5695c78d49-q9s9j"},"replicaset":{"name":"cert-manager-5695c78d49"},"namespace":"cert-manager"}}'),
Row(value='{"log":{"offset":1946553,"file":{"path":"/var/log/containers/fsm-backend-cron-prod-6bd6459455-p9p49_default_fsm-backend-cron-prod-51838c21c82b0a19b713bf028da7418f9885fd29b606e5b9912c1c66f6c3046a.log"}},"stream":"stdout","message":"Inside user information updation cron: 2021-08-17T00:00:00.001Z","#timestamp":"2021-08-17T00:00:00.001Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","availability_zone":"ap-south-1b","provider":"aws","machine":{"type":"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"architecture":"x86_64","os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"},"hostname":"ip-192-168-18-105.ap-south-1.compute.internal","containerized":false,"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"fsm-backend-cron-prod"},"labels":{"pod-template-hash":"6bd6459455","app":"fsm-backend-cron-prod"},"pod":{"uid":"c356f207-bc60-4974-9f8b-02ecfe87eaa0","name":"fsm-backend-cron-prod-6bd6459455-p9p49"},"replicaset":{"name":"fsm-backend-cron-prod-6bd6459455"},"namespace":"default"}}'),
Row(value='{"log":{"offset":1946687,"file":{"path":"/var/log/containers/fsm-backend-cron-prod-6bd6459455-p9p49_default_fsm-backend-cron-prod-51838c21c82b0a19b713bf028da7418f9885fd29b606e5b9912c1c66f6c3046a.log"}},"stream":"stdout","message":"\\u001B[0;36mMongoose:\\u001B[0m orders.find({ orderStatus: \\u001B[32m\'success\'\\u001B[39m, orderDate: { \\u001B[32m\'$gte\'\\u001B[39m: new Date(\\"Mon, 16 Aug 2021 00:00:00 GMT\\") }}, { projection: {} })","#timestamp":"2021-08-17T00:00:00.002Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","availability_zone":"ap-south-1b","provider":"aws","machine":{"type":"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"},"name":"ip-192-168-18-105.ap-south-1.compute.internal","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","containerized":false,"architecture":"x86_64"},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"fsm-backend-cron-prod"},"labels":{"pod-template-hash":"6bd6459455","app":"fsm-backend-cron-prod"},"pod":{"uid":"c356f207-bc60-4974-9f8b-02ecfe87eaa0","name":"fsm-backend-cron-prod-6bd6459455-p9p49"},"replicaset":{"name":"fsm-backend-cron-prod-6bd6459455"},"namespace":"default"}}'),
Row(value='{"log":{"offset":1946955,"file":{"path":"/var/log/containers/fsm-backend-cron-prod-6bd6459455-p9p49_default_fsm-backend-cron-prod-51838c21c82b0a19b713bf028da7418f9885fd29b606e5b9912c1c66f6c3046a.log"}},"stream":"stdout","message":"******","#timestamp":"2021-08-17T00:00:00.003Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","availability_zone":"ap-south-1b","provider":"aws","machine":{"type":"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"architecture":"x86_64","name":"ip-192-168-18-105.ap-south-1.compute.internal","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"},"containerized":false},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"fsm-backend-cron-prod"},"labels":{"pod-template-hash":"6bd6459455","app":"fsm-backend-cron-prod"},"pod":{"uid":"c356f207-bc60-4974-9f8b-02ecfe87eaa0","name":"fsm-backend-cron-prod-6bd6459455-p9p49"},"replicaset":{"name":"fsm-backend-cron-prod-6bd6459455"},"namespace":"default"}}'),
Row(value='{"log":{"offset":1947032,"file":{"path":"/var/log/containers/fsm-backend-cron-prod-6bd6459455-p9p49_default_fsm-backend-cron-prod-51838c21c82b0a19b713bf028da7418f9885fd29b606e5b9912c1c66f6c3046a.log"}},"stream":"stdout","message":"\\u001B[0;36mMongoose:\\u001B[0m enrollments.find({ createdAt: { \\u001B[32m\'$gte\'\\u001B[39m: new Date(\\"Mon, 16 Aug 2021 23:00:00 GMT\\") }, isActive: { \\u001B[32m\'$ne\'\\u001B[39m: \\u001B[33mfalse\\u001B[39m }}, { projection: {} })","#timestamp":"2021-08-17T00:00:00.004Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","availability_zone":"ap-south-1b","provider":"aws","machine":{"type":"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"architecture":"x86_64","name":"ip-192-168-18-105.ap-south-1.compute.internal","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"},"containerized":false},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"fsm-backend-cron-prod"},"labels":{"pod-template-hash":"6bd6459455","app":"fsm-backend-cron-prod"},"pod":{"uid":"c356f207-bc60-4974-9f8b-02ecfe87eaa0","name":"fsm-backend-cron-prod-6bd6459455-p9p49"},"replicaset":{"name":"fsm-backend-cron-prod-6bd6459455"},"namespace":"default"}}'),
Row(value='{"log":{"offset":1947329,"file":{"path":"/var/log/containers/fsm-backend-cron-prod-6bd6459455-p9p49_default_fsm-backend-cron-prod-51838c21c82b0a19b713bf028da7418f9885fd29b606e5b9912c1c66f6c3046a.log"}},"stream":"stdout","message":"Currency data:{\\"result\\":\\"success\\",\\"documentation\\":\\"https://www.exchangerate-api.com/docs\\",\\"terms_of_use\\":\\"https://www.exchangerate-api.com/terms\\",\\"time_zone\\":\\"UTC\\",\\"time_last_update\\":1629072001,\\"time_next_update\\":1629158521,\\"base\\":\\"INR\\",\\"conversion_rates\\":{\\"INR\\":1,\\"AED\\":0.04946,\\"AFN\\":1.075,\\"ALL\\":1.3897,\\"AMD\\":6.63,\\"ANG\\":0.0241,\\"AOA\\":8.6508,\\"ARS\\":1.3046,\\"AUD\\":0.01829,\\"AWG\\":0.0241,\\"AZN\\":0.02285,\\"BAM\\":0.02239,\\"BBD\\":0.02693,\\"BDT\\":1.1405,\\"BGN\\":0.02239,\\"BHD\\":0.005063,\\"BIF\\":26.6861,\\"BMD\\":0.01347,\\"BND\\":0.0183,\\"BOB\\":0.09274,\\"BRL\\":0.07057,\\"BSD\\":0.01347,\\"BTN\\":1,\\"BWP\\":0.1497,\\"BYN\\":0.03361,\\"BZD\\":0.02693,\\"CAD\\":0.01688,\\"CDF\\":26.7409,\\"CHF\\":0.0124,\\"CLP\\":10.4188,\\"CNY\\":0.08741,\\"COP\\":51.9025,\\"CRC\\":8.3535,\\"CUC\\":0.01347,\\"CUP\\":0.3468,\\"CVE\\":1.2625,\\"CZK\\":0.2931,\\"DJF\\":2.3933,\\"DKK\\":0.08542,\\"DOP\\":0.7673,\\"DZD\\":1.8193,\\"EGP\\":0.2112,\\"ERN\\":0.202,\\"ETB\\":0.6076,\\"EUR\\":0.01145,\\"FJD\\":0.02801,\\"FKP\\":0.009749,\\"FOK\\":0.08542,\\"GBP\\":0.009749,\\"GEL\\":0.04202,\\"GGP\\":0.009749,\\"GHS\\":0.08076,\\"GIP\\":0.009749,\\"GMD\\":0.6996,\\"GNF\\":131.3706,\\"GTQ\\":0.1042,\\"GYD\\":2.8153,\\"HKD\\":0.1049,\\"HNL\\":0.3193,\\"HRK\\":0.08627,\\"HTG\\":1.2853,\\"HUF\\":4.086,\\"IDR\\":193.9741,\\"ILS\\":0.04382,\\"IMP\\":0.009749,\\"IQD\\":19.6346,\\"IRR\\":564.6901,\\"ISK\\":1.6955,\\"JMD\\":2.08,\\"JOD\\":0.009548,\\"JPY\\":1.4838,\\"KES\\":1.4694,\\"KGS\\":1.1411,\\"KHR\\":54.8746,\\"KID\\":0.01829,\\"KMF\\":5.6327,\\"KRW\\":15.6725,\\"KWD\\":0.004035,\\"KYD\\":0.01122,\\"KZT\\":5.7203,\\"LAK\\":129.023,\\"LBP\\":20.3005,\\"LKR\\":2.6874,\\"LRD\\":2.3085,\\"LSL\\":0.1992,\\"LYD\\":0.06092,\\"MAD\\":0.1208,\\"MDL\\":0.2391,\\"MGA\\":52.5738,\\"MKD\\":0.7039,\\"MMK\\":22.1546,\\"MNT\\":38.2846,\\"MOP\\":0.1081,\\"MRU\\":0.486,\\"MUR\\":0.5712,\\"MVR\\":0.2062,\\"MWK\\":10.9455,\\"MXN\\":0.2685,\\"MYR\\":0.05709,\\"MZN\\":0.8614,\\"NAD\\":0.1992,\\"NGN\\":5.5967,\\"NIO\\":0.4725,\\"NOK\\":0.1188,\\"NPR\\":1.6,\\"NZD\\":0.01915,\\"OMR\\":0.005178,\\"PAB\\":0.01347,\\"PEN\\":0.05494,\\"PGK\\":0.04721,\\"PHP\\":0.6817,\\"PKR\\":2.2107,\\"PLN\\":0.05259,\\"PYG\\":93.2322,\\"QAR\\":0.04902,\\"RON\\":0.05625,\\"RSD\\":1.3457,\\"RUB\\":0.988,\\"RWF\\":13.5649,\\"SAR\\":0.0505,\\"SBD\\":0.1072,\\"SCR\\":0.1964,\\"SDG\\":5.9863,\\"SEK\\":0.1167,\\"SGD\\":0.0183,\\"SHP\\":0.009749,\\"SLL\\":138.6306,\\"SOS\\":7.7848,\\"SRD\\":0.2883,\\"SSP\\":2.3945,\\"STN\\":0.2805,\\"SYP\\":16.9261,\\"SZL\\":0.1992,\\"THB\\":0.4511,\\"TJS\\":0.1521,\\"TMT\\":0.04714,\\"TND\\":0.03747,\\"TOP\\":0.03017,\\"TRY\\":0.1152,\\"TTD\\":0.09132,\\"TVD\\":0.01829,\\"TWD\\":0.3739,\\"TZS\\":31.209,\\"UAH\\":0.3591,\\"UGX\\":47.6167,\\"USD\\":0.01347,\\"UYU\\":0.5869,\\"UZS\\":144.1304,\\"VES\\":55690.0545,\\"VND\\":308.1789,\\"VUV\\":1.5055,\\"WST\\":0.03438,\\"XAF\\":7.5103,\\"XCD\\":0.03636,\\"XDR\\":0.009481,\\"XOF\\":7.5103,\\"XPF\\":1.3663,\\"YER\\":3.3625,\\"ZAR\\":0.1992,\\"ZMW\\":0.2599}}","#timestamp":"2021-08-17T00:00:00.813Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","provider":"aws","availability_zone":"ap-south-1b","machine":{"type":
"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"architecture":"x86_64","name":"ip-192-168-18-105.ap-south-1.compute.internal","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"},"containerized":false},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"fsm-backend-cron-prod"},"labels":{"pod-template-hash":"6bd6459455","app":"fsm-backend-cron-prod"},"pod":{"uid":"c356f207-bc60-4974-9f8b-02ecfe87eaa0","name":"fsm-backend-cron-prod-6bd6459455-p9p49"},"replicaset":{"name":"fsm-backend-cron-prod-6bd6459455"},"namespace":"default"}}'),
Row(value='{"log":{"offset":1950159,"file":{"path":"/var/log/containers/fsm-backend-cron-prod-6bd6459455-p9p49_default_fsm-backend-cron-prod-51838c21c82b0a19b713bf028da7418f9885fd29b606e5b9912c1c66f6c3046a.log"}},"stream":"stdout","message":"\\u001B[0;36mMongoose:\\u001B[0m countries.find({ currency_code: \\u001B[32m\'INR\'\\u001B[39m }, { projection: {} })","#timestamp":"2021-08-17T00:00:01.026Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","provider":"aws","availability_zone":"ap-south-1b","machine":{"type":"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"architecture":"x86_64","name":"ip-192-168-18-105.ap-south-1.compute.internal","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","containerized":false,"os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"}},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"fsm-backend-cron-prod"},"labels":{"pod-template-hash":"6bd6459455","app":"fsm-backend-cron-prod"},"pod":{"uid":"c356f207-bc60-4974-9f8b-02ecfe87eaa0","name":"fsm-backend-cron-prod-6bd6459455-p9p49"},"replicaset":{"name":"fsm-backend-cron-prod-6bd6459455"},"namespace":"default"}}'),
Row(value='{"log":{"offset":1950341,"file":{"path":"/var/log/containers/fsm-backend-cron-prod-6bd6459455-p9p49_default_fsm-backend-cron-prod-51838c21c82b0a19b713bf028da7418f9885fd29b606e5b9912c1c66f6c3046a.log"}},"stream":"stdout","message":"\\u001B[0;36mMongoose:\\u001B[0m countries.find({ currency_code: \\u001B[32m\'AED\'\\u001B[39m }, { projection: {} })","#timestamp":"2021-08-17T00:00:01.027Z","ecs":{"version":"1.0.0"},"cloud":{"instance":{"id":"i-06c596f469bcf9b4a"},"region":"ap-south-1","availability_zone":"ap-south-1b","provider":"aws","machine":{"type":"t3a.large"}},"input":{"type":"container"},"#version":"1","host":{"architecture":"x86_64","os":{"codename":"Core","version":"7 (Core)","name":"CentOS Linux","kernel":"4.14.186-146.268.amzn2.x86_64","platform":"centos","family":"redhat"},"hostname":"ip-192-168-18-105.ap-south-1.compute.internal","containerized":false,"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"tags":["beats_input_codec_plain_applied","_grokparsefailure"],"agent":{"version":"7.2.0","type":"filebeat","ephemeral_id":"af246a38-d99d-43ab-849b-cf25288dd6c1","hostname":"ip-192-168-18-105.ap-south-1.compute.internal","id":"9631683c-b8fc-40d8-9e28-f80f5fa3cc2c"},"kubernetes":{"node":{"name":"ip-192-168-18-105.ap-south-1.compute.internal"},"container":{"name":"fsm-backend-cron-prod"},"labels":{"pod-template-hash":"6bd6459455","app":"fsm-backend-cron-prod"},"pod":{"uid":"c356f207-bc60-4974-9f8b-02ecfe87eaa0","name":"fsm-backend-cron-prod-6bd6459455-p9p49"},"replicaset":{"name":"fsm-backend-cron-prod-6bd6459455"},"namespace":"default"}}')],
The server names are there in the log file, but I'm getting the count as 0. Please help.
Here is a solution using PySpark SQL functions:
Split each string on the substring you are trying to count (the server name, in your case); the length of the resulting array minus 1 is the number of times that substring appears in the row. To get the occurrences across the whole file, use the sum function.
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import sum, col, size, split
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)
base_df = spark.read.text("/content/fsm-20210817.logs")
server_list = ['nginx-ingress-controller-5b6697898-zqxl4','cert-manager-5695c78d49-q9s9j']
for i in server_list:
    # spark.read.text puts each line into a single column named "value"
    base_df_with_count = base_df.withColumn(i + "count", size(split(col("value"), i)) - 1)
    res = base_df_with_count.select(sum(i + "count")).collect()[0][0]
    print(i, res)
PS: I have not tested the code, but it should work.
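Since the question asks specifically about RDDs, here is a hedged alternative sketch (also untested) that counts substring occurrences per line with plain Python string methods; it avoids the zero counts that come from splitting on spaces, because the server names are embedded inside JSON strings rather than appearing as space-separated tokens:

# Each row produced by spark.read.text has a single string column named "value";
# str.count() counts non-overlapping occurrences of the server name in that line.
for name in server_list:
    total = base_df.rdd.map(lambda row: row.value.count(name)).sum()
    print(name, total)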

Parameterize the find() method in Python using MongoDB

The files to upload will be like WFSIV0101202001.318.tar.gz, WFSIV0101202001.2624.tar.gz, etc.
INPUT_FILE_PATH = 'C:\Files to upload'
try:
    import os
    from google.cloud import storage
    import sys
    import pymongo
    import pymongo.errors
    from pymongo import MongoClient
    from pymongo.errors import ConnectionFailure
except:
    print("missing modules")
try:
    mongo_client = MongoClient(host="xyz.com", port=27017)
    Db = mongo_client['abcd']
    coll = Db['shopper_journey_sitedata']
except ConnectionFailure:
    print("Connection failed")
date = []
# Thirdpartyid=[]
input_files = os.listdir(INPUT_FILE_PATH)
# looping through input files
for input_file in input_files:
    x = input_file.split(".")
    date.append(x[0][5:13])
    tp_site_id = x[1]
    # print(tp_site_id)
    cur = coll.find({"third_party_site_id": tp_site_id})
    for doc in cur:
        print(doc)
Now I want to parameterize the find() method for every id, so that on each iteration I get the documents for that tp_site_id.
I tried the code above, but it is giving an error: "Datas: name error".
You can do one thing:
coll.find({"third_party_site_id": {"$in": [318, 2624, 2621, 2622, 102, 78]}})
If Tid is an array, you could replace 318 in your query with Tid[i].
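A hedged sketch of the parameterized version (assuming third_party_site_id is stored with the same type as the value parsed from the file name, i.e. a string here; wrap the ids with int(...) if the field is numeric in MongoDB):

# Collect every site id from the upload directory, then issue a single $in query.
site_ids = [f.split(".")[1] for f in os.listdir(INPUT_FILE_PATH)]
for doc in coll.find({"third_party_site_id": {"$in": site_ids}}):
    print(doc)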