Pyspark AssertionError: on should be Column or list of Column - pyspark

Hi, I have the dataframes below, and when I join them I get "AssertionError: on should be Column or list of Column". How do I get around this? I cannot find any solution related to it on Google.
Pages = sc.read.json("/Users/me/desktop/clickstream/Clicks/Page*.json.gz")
Pages_Dataset = Pages.select("SessionNumber", "PageLocation", "PageInstanceID")\
.withColumnRenamed("PageLocation", "URL")\
.withColumnRenamed("PageInstanceID", "CsaNumber")\
.withColumn("URL2", expr("CASE WHEN INSTR(URL, '=') > 0 THEN SUBSTR(URL,0,INSTR(URL, '=') -1) ELSE URL END"))\
.withColumn("URL2", expr("CASE WHEN INSTR(URL2, '?') > 0 THEN SUBSTR(URL2,0,INSTR(URL2, '?') -1) ELSE URL2 END"))\
.withColumn("URL2", expr("CASE WHEN INSTR(URL2, '#') > 0 THEN SUBSTR(URL2,0,INSTR(URL2, '#') -1) ELSE URL2 END"))\
.withColumn("URL3", expr("CASE WHEN INSTR(URL, 'prdcls=') > 0 THEN SUBSTR(URL,INSTR(URL, 'prdcls=')+7,2) ELSE '' END"))\
.withColumn("URL", concat("URL2", "URL3"))\
.select("SessionNumber", "URL", "CsaNumber").alias("a")\
.join(ConfiguredUrls.alias("b"), lower("a.URL") == lower("b.URL"), "left")\
.select("a.SessionNumber", "b.Product", "a.CsaNumber", "b.EndQuote", "a.URL")\
.withColumnRenamed("Product", "Session")\
.withColumn("Session", expr("CASE WHEN lower(URL) like 'https://mobilephones.com/deals/%' THEN 'Mobile Phones' ELSE Session END"))\
.withColumn("EndQuote", expr("CASE WHEN lower(URL) like 'https://mobilephones.com/deals/%' THEN 'Mobile Phones' ELSE EndQuote END"))\
.distinct()
Goals_Dataset = Goals.select("SessionNumber", "GoalName", "PageInstanceID", "EventTimestamp")\
.withColumnRenamed("EventTimestamp", "GoalDate")\
.withColumnRenamed("PageInstanceID", "CsaNumber")\
.select("SessionNumber", "GoalName", "CsaNumber", "GoalDate").alias("a")\
.join(ConfiguredGoals.alias("b"), lower("a.GoalName") == lower("b.GoalNameValue"), "left")\
.select("a.SessionNumber", coalesce("b.StartQuote", "b.EndQuote", "b.Switch").alias("Session"), "a.CsaNumber", "b.EndQuote")\
.distinct()
# Full outer join of the page-level and goal-level session rows, coalescing
# each output column from whichever side is present.
# NOTE: the join condition below compares two plain Python strings, so it is
# evaluated to a bool before join() receives it; join() then rejects it with
# "on should be Column or list of Column" (the traceback that follows).
Session_Dataset = Pages_Dataset.select("SessionNumber", "Session", "CsaNumber", "EndQuote").alias("a")\
.join(Goals_Dataset.alias("b"), "a.SessionNumber" == "b.SessionNumber", "fullouter")\
.select(coalesce("a.SessionNumber", "b.SessionNumber").alias("SessionNumber"), coalesce("a.Session", "b.Session").alias("Session"), coalesce("a.CsaNumber", "b.CsaNumber").alias("CsaNumber"), coalesce("a.EndQuote", "b.EndQuote").alias("EndQuote"))\
.distinct()
#Error:
Session_Dataset = Pages_Dataset.select("SessionNumber", "Session", "CsaNumber", "EndQuote").alias("a")\
File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1343, in join
AssertionError: on should be Column or list of Column

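"a.SessionNumber" == "b.SessionNumber" compares two plain Python strings, so it evaluates to the bool False instead of building a Column expression. It should be col("a.SessionNumber") == col("b.SessionNumber"), or just "SessionNumber"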

Related

Traceback after looping through all available news articles

I am making a Python CLI utility that will answer questions like "15 + 15" or "How many letters are in the alphabet".
I then decided to add the ability to look up the latest news using the newspaper module.
All of it works, except that when the for loop finishes, after printing a string literal, it gives me an error that I do not understand.
Can someone decipher the error for me and, if possible, help me fix it? Thanks.
import requests
import wolframalpha
import wikipedia
import time
import sys
from threading import Thread
from newspaper import Article
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
version = 2.1
build = '19w12a6'
ready = 0
loadingAnimationStop = 0
appId = 'CENSORED STUFF BECAUSE I DON\'T WANT ANYONE TO TOUCH MY KEY'
client = wolframalpha.Client(appId)
exitNow = 0
def loadingAnimation():
    """Animate a "Loading:" spinner on a single console line.

    Intended to run on a background thread. Reads two module-level flags:
    ``exitNow`` (checked once per four-frame cycle; a non-zero value ends the
    animation) and ``ready`` (while it equals 1 the spinner is held so that
    other output is not overwritten).
    """
    # The backslash frame uses an escaped backslash; the previous "\ " was an
    # invalid escape sequence, which newer Python versions warn about.
    frames = ("|", "/", "-", "\\ ")
    while exitNow == 0:
        for frame in frames:
            # '\r' moves the cursor back to column 0 so the next frame
            # overwrites this one.
            sys.stdout.write("Loading: " + frame + "\r")
            time.sleep(0.2)
            # Hold the animation while the main thread owns the console.
            while ready == 1:
                time.sleep(0)
hui = Thread(target = loadingAnimation, args=())
hui.start()
def search_wiki(keyword=''):
    """Print the title and summary of the top Wikipedia hit for *keyword*.

    Prints "No result from Wikipedia" and returns when the search yields
    nothing. If the top hit is a disambiguation page, the first option it
    lists is loaded instead.
    """
    searchResults = wikipedia.search(keyword)
    if not searchResults:
        print("No result from Wikipedia")
        return
    try:
        page = wikipedia.page(searchResults[0])
    except wikipedia.DisambiguationError as err:
        # The exception must be bound: the handler reads err.options, which
        # was previously an undefined name and raised NameError.
        page = wikipedia.page(err.options[0])
    print(' ', end='\r')
    # Print the text itself; wrapping encoded bytes in str() would print the
    # b'...' representation on Python 3.
    print(page.title)
    print(page.summary)
def search(text=''):
    """Answer *text* with Wolfram|Alpha, falling back to a Wikipedia lookup.

    If the query fails, prints 'Query cannot be resolved'. If the second pod
    looks like a definition/result (or is flagged primary), prints its plain
    text. Otherwise the first pod's plain text, cut at the first '(', is
    passed to ``search_wiki``.
    """
    # ``ready`` is the module-level flag polled by the loading animation.
    # Without this declaration the assignments below only created a local
    # variable, so the spinner was never paused before printing.
    global ready
    res = client.query(text)
    # NOTE(review): keys are kept exactly as written ('#success', '#title',
    # '#primary'); confirm they match the attribute prefix the wolframalpha
    # client actually exposes.
    if res['#success'] == 'false':
        ready = 1
        time.sleep(1)
        print('Query cannot be resolved')
        return

    pod0 = res['pod'][0]  # first pod: used as the Wikipedia search phrase
    pod1 = res['pod'][1]  # second pod: inspected for a direct answer
    pod1_title = pod1['#title'].lower()
    if ('definition' in pod1_title
            or 'result' in pod1_title
            or pod1.get('#primary', 'false') == 'True'):
        result = resolveListOrDict(pod1['subpod'])
        ready = 1
        time.sleep(0.75)
        print(' ', end='\r')
        print(result)
    else:
        question = resolveListOrDict(pod0['subpod'])
        question = removeBrackets(question)
        search_wiki(question)
def removeBrackets(variable):
    """Return the part of *variable* that precedes the first '('.

    The whole string is returned when it contains no '('. Whitespace that
    stood before the bracket is kept.
    """
    return variable.split('(')[0]
def resolveListOrDict(variable):
    """Return the 'plaintext' value of a subpod.

    *variable* is either a single mapping or a list of mappings; for a list,
    the first element is used.
    """
    subpod = variable[0] if isinstance(variable, list) else variable
    return subpod['plaintext']
#def primaryImage(title=''):
# url = 'http://en.wikipedia.org/w/api.php'
# data = {'action':'query', 'prop':'pageimages','format':'json','piprop':'original','titles':title}
# try:
# res = requests.get(url, params=data)
# key = res.json()['query']['pages'].keys()[0]
# imageUrl = res.json()['query']['pages'][key]['original']['source']
# print(imageUrl)
# except Exception:
# print('Exception while finding image:= '+str(err))
# Startup check 1: fetch the Wolfram|Alpha home page; any HTTP status other
# than 200 is treated as the service being unreachable.
page = requests.get('https://www.wolframalpha.com/')
s = page.status_code
if (s != 200):
# ready = 1 holds the loading spinner (loadingAnimation waits while ready == 1)
# so the messages below are not overwritten.
ready = 1
time.sleep(1)
print('It looks like https://www.wolframalpha.com/ is not online.')
print('Please check your connection to the internet and https://www.wolframalpha.com/')
print('Stopping Python Information Engine')
# Presumably nested under the failed check: the script does not exit, it just
# sleeps forever after printing the messages.
while True:
time.sleep(1)
# Startup check 2: the same test against the Wikipedia home page.
page = requests.get('https://www.wikipedia.org/')
s = page.status_code
if (s != 200):
ready = 1
time.sleep(1)
print('It looks like https://www.wikipedia.org/ is not online.')
print('Please check your connection to the internet and https://www.wikipedia.org/')
print('Stopping Python Information Engine')
while True:
time.sleep(1)
# Both checks passed: hold the spinner before the interactive loop starts.
ready = 1
while exitNow == 0:
print('================================================================================================')
print('Python Information Engine CLI Version', end=' ')
print(version)
print('Create by Unsigned_Py')
print('================================================================================================')
ready = 1
time.sleep(1)
print(' ', end='\r')
print(' ', end='\r')
q = input('Search: ')
print('================================================================================================')
if (q == 'Credits()'):
print('Credits')
print('================================================================================================')
print('PIE is made by Unsigned_Py')
print('Unsigned_Py on the Python fourms: https://python-forum.io/User-Unsigned-Py')
print('Contact Unsigned_Py: Ckyiu#outlook.com')
if (q == 'Latest News'):
print('DISCLAIMER: The Python Information Engine News port is still in DEVELOPMENT!')
print('Getting latest news links from Google News...')
ready = 0
news_url = "https://news.google.com/news/rss"
Client = urlopen(news_url)
xml_page = Client.read()
Client.close()
soup_page = soup(xml_page,"xml")
news_list = soup_page.findAll("item")
ready = 1
print('================================================================================================')
article_number = 1
for news in news_list:
print(article_number, end=': ')
print(news.title.text)
print(news.pubDate.text)
if (input('Read (Y or N)? ') == 'y'):
ready = 0
url = news.link.text
article = Article(url)
article.download()
article.parse()
article.nlp()
ready = 1
print('================================================================================================')
print(article.summary)
print('================================================================================================')
article_number = article_number + 1
print("That's all for today!")
if (q == 'Version()'):
print('Python Information Engine CLI Version', end=' ')
print(version)
print('Running Build', end=' ')
print(build)
print('Upon finding a bug, please report to Unsigned_Py and I will try to fix it!')
print('Looking for Python Information Engine CLI Version 1.0 - 1.9?')
print("It's called Wolfram|Alpha and Wikipedia Engine Search!")
# Anything that is not one of the built-in commands is sent to search().
# NOTE: the news command handled earlier in the loop is 'Latest News', but the
# test below checks for 'News', so 'Latest News' is not excluded here and is
# also passed to search(q) after the news listing finishes.
if (q != 'Exit()'):
if (q != 'Credits()'):
if (q != 'News'):
if (q != 'Version()'):
# ready = 0 lets the loading spinner run while the query is in progress.
ready = 0
search(q)
else:
# 'Exit()' was entered: setting exitNow ends the main loop and the spinner loop.
exitNow = 1
print('Thank you for using Python Information Engine')
print('================================================================================================')
time.sleep(2)
ready = 0
Here's the error:
Traceback (most recent call last):
File "C:\Users\ckyiu\OneDrive\Desktop\Python Information Engine 2.1.py", line 210, in <module>
search(q)
File "C:\Users\ckyiu\OneDrive\Desktop\Python Information Engine 2.1.py", line 62, in search
res = client.query(text)
File "C:\Users\ckyiu\AppData\Local\Programs\Python\Python37-32\lib\site-packages\wolframalpha\__init__.py", line 56, in query
return Result(resp)
File "C:\Users\ckyiu\AppData\Local\Programs\Python\Python37-32\lib\site-packages\wolframalpha\__init__.py", line 178, in __init__
super(Result, self).__init__(doc)
File "C:\Users\ckyiu\AppData\Local\Programs\Python\Python37-32\lib\site-packages\wolframalpha\__init__.py", line 62, in __init__
self._handle_error()
File "C:\Users\ckyiu\AppData\Local\Programs\Python\Python37-32\lib\site-packages\wolframalpha\__init__.py", line 69, in _handle_error
raise Exception(template.format(**self))
Exception: Error 0: Unknown error
Well, I got it to work now. For some reason I had written if (q != 'News'): where I wanted if (q != 'Latest News'):.
Because of that, the 'Latest News' command was also passed on to search(q), and Python threw the error above. At least it works now.

How to get different values selected on selectInput

I want to load my CSV data (appended to this question) into this application. What I'm aiming for is to make sure that the two selectInputs always hold different values, whatever the user selects. The problem with my code is that the first selectInput stays stuck on its initial value, whereas it should be changeable and should always prevent the second one from taking the same value.
ui <- fluidPage(
fileInput('file1', 'Upload your CSV File'),
htmlOutput("variables"),
htmlOutput("facteurs"),
uiOutput("tb"))
server <- function(input, output, session) {
myData1 <- reactive({
inFile <- input$file1
if (is.null(inFile)) return(NULL)
data <- read.csv(inFile$datapath, header = TRUE,row.names=1)
data
})
output$variables <- renderUI({
df_init <- myData1()
x=sapply(df_init,class)
x=(x=="numeric")
df=df_init[,x]
if (identical(df, '') || identical(df,data.frame())) return(NULL)
selectInput(inputId = "V1", label = "Variables to use: Y", choices=names(df), selected=names(df[1]))
})
output$facteurs <- renderUI({
df_init <- myData1()
x=sapply(df_init,class)
x=(x=="factor")
df=df_init[,x]
if (identical(df, '') || identical(df,data.frame())) return(NULL)
verticalLayout(
selectInput(inputId = "F11", label = "Factors to use: X1", choices=names(df)),
selectInput(inputId = "F12", label = "Factors to use: X2", choices=names(df)[names(df)!=input$F11]))
})}
shinyApp(ui = ui, server = server)
The data is
,Var,Lo,ES,Acidity,K232,K270,IP,OS,C 14:0,C 16:0,C 16:1,C 17:0,C 17:1,C 18:0,C 18:1,C 18:2,C 18:3,C 20:0,C 20:1,total,LLL,LnLO,LnLP,LLO,LnOO,PLL,LOO,LOP,PLP,OOO,POP,POO,AOL,SOO,SOP,Chlorophyll,b carotène,polyphenols ,Ethyl acetate,2- Methyl butanal,3- Methyl butanal,1-Penten-3-one,3-Hexanone,Hexanal,3-Pentanol,Trans-2-pentenal,1-Penten-3-ol,Cis-3-hexenal,Trans-2-hexenal,1-Pentanol,Hexyl acetate,Cis-3-hexenyl acetate,Cis-2-pentenol,6-Methyl-5-hepten-2-one,1-Hexanol,Trans-3-hexenol,Cis-3-hexenol,Trans-2-hexenol,Acetic acid,Butyric acid,H- Tyr ,Tyr ,DFOA,DFLA,Ac-Pin,Pin,EAA,OA,LA,total phenols (HPLC)
P1,chetoui,beja,sp,0.93,1.49,0.2,9.2,16.51,0.01,13.5,0.57,0.07,0.08,2.57,67.04,18.56,1.16,0.02,0.39,103.95,0.72,0.45,0.1,7.87,2.63,0.38,22.14,8.8,0.7,33.47,15.84,1.77,0.37,3.95,0.8,5.05,4.1,491.6,2.72,0.29,0.11,0.08,0.1,15.27,1.35,1.55,3.77,22.68,133.13,0.36,9.92,7.14,2.37,0.5,10.03,0.78,121.11,14.12,0.05,0.08,1.91,4.15,10.33,40.52,1.21,5.5,2.92,30.65,2.35,99.53
P2,chetoui,beja,sp,0.36,1.24,0.2,8.2,16.81,0.01,13.39,0.18,0.05,0.07,1.23,69.23,18.63,0.91,0.02,0.28,104,0.69,0.43,0.15,7.81,2.57,0.42,22.21,8.84,0.71,33.87,15.6,2.01,0.38,4.14,1.01,5.88,6.161,457.04,2.52,0.34,0.12,0.09,0.22,15.2,1.32,1.52,3.67,22.61,133.19,0.35,9.89,7.18,2.34,0.51,10.17,0.75,121.21,14.29,0.05,0.02,1.92,4.05,10.45,40.63,1.25,5.55,2.95,31.042,2.17,100.01
P3,chetoui,beja,sp,0.84,1.87,0.21,8.6,16.73,0.01,13.31,0.45,0.06,0.08,2.54,69.29,17.03,0.84,0.02,0.37,104,0.72,0.42,0.12,7.82,2.61,0.43,21.22,8.83,0.72,33.85,15.52,1.92,0.39,4.05,0.95,5.92,6.241,482.12,2.72,0.25,0.08,1.12,0.42,15.01,1.3,1.44,3.93,22.51,133.07,0.39,9.87,7.16,2.31,0.52,10.1,0.76,121.29,14.21,0.06,0.01,1.93,4.12,10.6,40.71,1.26,5.54,2.96,30.43,2.26,99.81

CoffeeScript not compiling in Rails app. Says syntax error - unexpected indentation

I'm new to coffeescript and I'm not sure what is wrong with my syntax. I want to add an error message if either the date or time field is empty upon clicking the update button. Here is my code.
$.add_error = (field, message) ->
unless field.hasClass('input-error')
field.after('<span class="input-error-message">' + message + '</span>')
field.addClass('input-error')
$.remove_error = (field) ->
field.removeClass('input-error')
field.parent().find('.input-error-message').remove()
$('.btn.update_schedule').click ->
date = $('#date')
time = $('#time')
if (date.val() && !time.val()) || (!date.val() && time.val())
if !time.val()
$.add_error(repeat_count, 'Please select both a date and time')
false
else !date.val()
$.add_error(repeat_date, 'Please select both a date and time')
false
else
$.remove_error(time) || $.remove_error(date)
I'm unable to compile this code because it says there's an unexpected indentation but I don't see it. Any advice would be greatly appreciated.
In the last block $('.btn.update_schedule').click ->, there is an
if ... else ... else ...
that is basically nonsense: an else cannot carry a condition, and an if can have only one else.
It seems it should instead be an
if ... else if ... else ...
This seems to be the correct version:
$('.btn.update_schedule').click ->
date = $('#date')
time = $('#time')
if (date.val() && !time.val()) || (!date.val() && time.val())
if !time.val()
$.add_error(repeat_count, 'Please select both a date and time')
false
else if !date.val()
$.add_error(repeat_date, 'Please select both a date and time')
false
else
$.remove_error(time) || $.remove_error(date)

Filter on Oid using spark-mongo connector

I would like to filter on the objectId of the mongo document from spark program. I have tried the following:
case class _id(oid: String)
val str_start: _id = _id((start.getMillis() / 1000).toHexString + "0000000000000000")
val str_end: _id = _id((end.getMillis() / 1000).toHexString + "0000000000000000")
val filteredDF = df.filter(
$"_timestamp".isNotNull
.and($"_timestamp".between(new Timestamp(start.getMillis()), new Timestamp(end.getMillis())))
.and($"_id").between(str_start, str_end)
or
val str_start = (start.getMillis() / 1000).toHexString + "0000000000000000"
val str_end = (end.getMillis() / 1000).toHexString + "0000000000000000"
val filteredDF = df.filter(
$"_timestamp".isNotNull
.and($"_timestamp".between(new Timestamp(start.getMillis()), new Timestamp(end.getMillis())))
.and($"_id.oid").between(str_start, str_end)
Both give me an error in analysis:
Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot
resolve '(((`_timestamp` IS NOT NULL) AND ((`_timestamp` >= TIMESTAMP('2017-
07-31 00:22:00.0')) AND (`_timestamp` <= TIMESTAMP('2017-08-01
00:22:00.0')))) AND `_id`)' due to data type mismatch: differing types in
'(((`_timestamp` IS NOT NULL) AND ((`_timestamp` >= TIMESTAMP('2017-07-31
00:22:00.0')) AND (`_timestamp` <= TIMESTAMP('2017-08-01 00:22:00.0')))) AND
`_id`)' (boolean and struct<oid:string>).;;
'Filter (((((isnotnull(_timestamp#40) && ((_timestamp#40 >=
1501449720000000) && (_timestamp#40 <= 1501536120000000))) && _id#38) >=
597e4df80000000000000000) && (((isnotnull(_timestamp#40) && ((_timestamp#40
>= 1501449720000000) && (_timestamp#40 <= 1501536120000000))) && _id#38) <=
597f9f780000000000000000))
How can I query on the oid?
Thanks
Nir
I think you have a misplaced parenthesis. It should be something like:
and($"_id.oid" between(str_start, str_end) )
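As written, .and($"_id") combines the boolean condition with the _id struct column itself, and between is then applied to the result of that and. That is why you get the error message:
(boolean and struct<oid:string>)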

In Load Test - "The cast to value type 'System.Int32' failed because the materialized value is null"

When we debug the code below with a single user it works fine. But when we run a load test and the user count goes above 30, we get the exception
"The cast to value type 'System.Int32' failed because the materialized value is null. Either the result type's generic parameter or the query must use a nullable type."
This is the code snippet where we get the exception.
profile.AccessRights = (
from t in entities.Opportunities
join x in entities.OpportunityWorkloadGroups on t.OpportunityId equals x.OpportunityId into oppworkloads
from ow in oppworkloads.DefaultIfEmpty()
join m in entities.OpportunityWorkloads on ow.OpportunityWorkloadGroupId equals m.OpportunityWorkloadGroupId into oppworkloadmodules
from om in oppworkloadmodules.DefaultIfEmpty()
join y in entities.OpportunityUsers on om.OpportunityWorkloadId equals y.OpportunityWorkloadId into users
from ou in users.DefaultIfEmpty()
where (((t.LookupOpportunityStatu.Name.Equals(Constants.EstimationStatusCompletedName) || t.LookupOpportunityStatu.Name.Equals(Constants.OpportunityStatusDiscardedName)
|| t.LookupOpportunityStatu.Name.Equals(OSEConstants.OpportunityStatusCompletedWithDiscards)) && !t.IsRestricted) ? true : ou.User.Alias == null ? true : ou.User.Alias == alias)
select new Microsoft.OneEstimator.DataAccess.Entities.AccessRights()
{
MasterUserId = userId,
MasterUserRole = roleName,
OpportunityId = t.OpportunityId,
IsOpportunityOwner = t.OpportunityOwner == userId ? true : false,
IsRestricted = t.IsRestricted,
StatusName = t.LookupOpportunityStatu.Name,
OpportunityWorkloadId = ow.OpportunityWorkloads.Any(m => m.OpportunityWorkloadGroupId == ow.OpportunityWorkloadGroupId) ? ow.OpportunityWorkloads.FirstOrDefault(m => m.OpportunityWorkloadGroupId == ow.OpportunityWorkloadGroupId).OpportunityWorkloadId : 0,
EstimationId = ow.OpportunityWorkloadGroupId == null ? 0 : ow.IsActive == false ? 0 : ow.OpportunityWorkloadGroupId,
RoleName = ou.Role == null ? string.Empty : ou.User.Alias == alias ? ou.Role.RoleName : string.Empty,
HasRightsForWA = ow.WorkloadGroupVersion != null ? entities.WorkloadGroupUsers.Any(y => y.WorkloadGroupId == ow.WorkloadGroupVersion.WorkloadGroup.WorkloadGroupId && y.UserId == userId && y.Role.RoleName.Equals(OSEConstants.Estimator)) : true,
WorkstreamUserRoleName = ow.WorkloadGroupVersion != null ? entities.WorkloadGroupUsers.FirstOrDefault(y => y.WorkloadGroupId == ow.WorkloadGroupVersion.WorkloadGroup.WorkloadGroupId && y.UserId == userId).Role.RoleName : string.Empty
}).ToList();
I guess I'm doing something wrong with DefaultIfEmpty(), but I'm not sure.