Jupyter with Redash Dynamic Query - jupyter

This is my demo of Redash data, and here is my Jupyter demo code:
from redash_dynamic_query import RedashDynamicQuery
import pandas as pd
import json
redash = RedashDynamicQuery(
    endpoint='http://demo.redash.io/',
    apikey='ejsbcH7u2gqCzKjgjltaJhBfrRqkSuTCy1pi5Lur',
    # Set the data source ID you want to query
    data_source_id=1
)
# Set the query ID you want to fetch results for
query_id = 8174
bind = {
    "start_date": '2017-01-01T00:00:00',
    "end_date": '2019-12-31T23:59:59',
}
# Fetch the Redash query result
result = redash.query(query_id, bind)
res = result['query_result']['data']
# Pretty-print the JSON
res_format_json = json.dumps(res, indent=4, separators=(',', ': '))
print(res_format_json)
But I got this error:
ExceptionTraceback (most recent call last)
<ipython-input-19-6bc390f7a5bf> in <module>
15
16 # Fetch the Redash query result
---> 17 result = redash.query(query_id, bind)
18 res = result['query_result']['data']
19
/opt/conda/lib/python3.7/site-packages/redash_dynamic_query/__init__.py in query(self, query_id, bind, as_csv)
26
27 # post query result
---> 28 response = self._api_query_results(self._build_query(query_id, query_body, query_data_source_id))
29 if 'query_result' in response:
30 return response
/opt/conda/lib/python3.7/site-packages/redash_dynamic_query/__init__.py in _api_query_results(self, query_string)
90 )
91 if response.status_code != 200:
---> 92 raise Exception('query_results failed. [%d]' % response.status_code)
93
94 return response.json()
Exception: query_results failed. [404]
Am I setting something wrong? I just followed the original documentation of the redash_dynamic_query library.
Thank you for reading, and please give me a hand if you have the answer.

I had the same problem.
The API key is incorrect because you are using the query API key.
The correct one is the user API key (see edit profile page).
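For reference, a minimal sketch of the corrected setup, assuming the user API key has been copied from the Edit Profile page (the value below is a placeholder, not a real key):
redash = RedashDynamicQuery(
    endpoint='http://demo.redash.io/',
    apikey='YOUR_USER_API_KEY',  # user API key from your profile page, not the per-query API key
    data_source_id=1
)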

Related

JSONDecodeError when connecting to Dataverse with Databricks

When executing the code listed below to access an Azure Dataverse endpoint with Databricks, I get the following error:
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The code is as follows:
list_data = call_dataverse_endpoint(f'https://{my_dv_orgName}.crm11.dynamics.com/api/data/v9.2/{my_dv_entity}')
The full error message is as follows:
JSONDecodeError Traceback (most recent call last)
<command-2234217625858083> in <module>
----> 1 list_data = call_dataverse_endpoint(f'https://{my_dv_orgName}.crm11.dynamics.com/api/data/v9.2/{my_dv_entity}')
<command-3028378077557710> in call_dataverse_endpoint(endpoint)
16
17 # Loop through the responses until odata.nextLink is gone.
---> 18 while "@odata.nextLink" in response.json():
19 # Append the data returned by the endpoint to the list
20 data.extend(response.json()["value"])
/databricks/python/lib/python3.8/site-packages/requests/models.py in json(self, **kwargs)
898 # used.
899 pass
--> 900 return complexjson.loads(self.text, **kwargs)
901
902 @property
/usr/lib/python3.8/json/__init__.py in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
355 parse_int is None and parse_float is None and
356 parse_constant is None and object_pairs_hook is None and not kw):
--> 357 return _default_decoder.decode(s)
358 if cls is None:
359 cls = JSONDecoder
/usr/lib/python3.8/json/decoder.py in decode(self, s, _w)
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
/usr/lib/python3.8/json/decoder.py in raw_decode(self, s, idx)
353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
--> 355 raise JSONDecodeError("Expecting value", s, err.value) from None
356 return obj, end
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The function that is being called is as follows:
def call_dataverse_endpoint(endpoint):
    # Empty list
    data = []
    # Headers
    headers = {
        "Authorization": f"Bearer {my_dv_accessToken}",
        "Accept": "application/json",
        "Content-Type": "application/json; charset=utf-8"
    }
    # Initial request
    response = requests.get(endpoint, headers=headers)
    #print(print(f''' Fetching from initial url: {endpoint} ''' ))
    # Loop through the responses until odata.nextLink is gone.
    while "@odata.nextLink" in response.json():
        # Append the data returned by the endpoint to the list
        data.extend(response.json()["value"])
        # Request the odata.nextLink URL
        response = requests.get(response.json()["@odata.nextLink"], headers=headers)
        #print(f''' Fetching from successive url: {response.json()["@odata.nextLink"]} ''' )
    # Append nextLink response data
    data.extend(response.json()["value"])
    # Return
    return data
Any thoughts?
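One thing worth checking: a JSONDecodeError at line 1 column 1 usually means the response body is not JSON at all (for example an HTML error page or an empty body from an authentication or routing failure). A minimal diagnostic sketch, reusing the endpoint and headers from the question, just to inspect the raw response before calling .json():
response = requests.get(endpoint, headers=headers)
print(response.status_code)                    # anything other than 200 points at an auth or URL problem
print(response.headers.get("Content-Type"))    # a JSON endpoint should return application/json
print(response.text[:500])                     # the start of the raw body, often an HTML error page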

Error Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

I have the following code, taken directly from here with some pretty minor modifications:
import pandas as pd
import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch import cuda
df = pd.read_pickle('df_final.pkl')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = 'cuda' if cuda.is_available() else 'cpu'
text = ''.join(df[(df['col1'] == 'type') & (df['col2'] == 2)].col3.to_list())
preprocess_text = text.strip().replace("\n","")
t5_prepared_Text = "summarize: "+preprocess_text
#print ("original text preprocessed: \n", preprocess_text)
tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt", max_length = 500000).to(device)
# summarize
summary_ids = model.generate(tokenized_text,
                             num_beams=4,
                             no_repeat_ngram_size=2,
                             min_length=30,
                             max_length=100,
                             early_stopping=True)
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print ("\n\nSummarized text: \n",output)
When executing the model.generate() part I get an error like this:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-12-e8e9819a85dc> in <module>
12 min_length=30,
13 max_length=100,
---> 14 early_stopping=True).to(device)
15
16 output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
~\Anaconda3\lib\site-packages\torch\autograd\grad_mode.py in decorate_no_grad(*args, **kwargs)
47 def decorate_no_grad(*args, **kwargs):
48 with self:
---> 49 return func(*args, **kwargs)
50 return decorate_no_grad
51
~\Anaconda3\lib\site-packages\transformers\generation_utils.py in generate(self, input_ids, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, num_return_sequences, attention_mask, decoder_start_token_id, use_cache, **model_specific_kwargs)
383 encoder = self.get_encoder()
384
--> 385 encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask)
386
387 # Expand input ids if num_beams > 1 or num_return_sequences > 1
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\transformers\modeling_t5.py in forward(self, input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask, inputs_embeds, head_mask, past_key_value_states, use_cache, output_attentions, output_hidden_states, return_dict)
701 if inputs_embeds is None:
702 assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
--> 703 inputs_embeds = self.embed_tokens(input_ids)
704
705 batch_size, seq_length = input_shape
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\sparse.py in forward(self, input)
112 return F.embedding(
113 input, self.weight, self.padding_idx, self.max_norm,
--> 114 self.norm_type, self.scale_grad_by_freq, self.sparse)
115
116 def extra_repr(self):
~\Anaconda3\lib\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1482 # remove once script supports set_grad_enabled
1483 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1484 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
1485
1486
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select
I've searched for this error and found some other threads, like this one and this one, but they didn't help me much since their cases seem to be completely different. In my case there are no custom instances or classes created, so I don't know how to fix this or where the error comes from.
Could you please tell me where the error is coming from and how I could fix it?
Thank you very much in advance.
Try explicitly moving your model to the GPU. The tokenized input is already sent to the device with .to(device), but the model's weights stay on the CPU unless the model itself is moved as well:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

Using pretrained models from sparknlp on Databricks

I am trying to follow the official examples from John Snow Labs, but every time I get a TypeError: 'JavaPackage' object is not callable error. I followed all of the steps in the Databricks install documentation, but no matter which walkthrough I try, either this one or this one, it fails.
An example of the first (after doing the installs):
import sparknlp
from sparknlp.pretrained import *
pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
recognize_entities_dl download started this may take some time.
TypeError: 'JavaPackage' object is not callable
TypeError Traceback (most recent call last)
<command-937510457011238> in <module>
----> 1 pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
2
3 # ner_bert = NerDLModel.pretrained('ner_dl_bert')
4
5 # pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
89 def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
90 if not disk_location:
---> 91 self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
92 else:
93 self.model = PipelineModel.load(disk_location)
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in downloadPipeline(name, language, remote_loc)
49 def downloadPipeline(name, language, remote_loc=None):
50 print(name + " download started this may take some time.")
---> 51 file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
52 if file_size == "-1":
53 print("Can not find the model to download please check the name!")
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, name, language, remote_loc)
190 def __init__(self, name, language, remote_loc):
191 super(_GetResourceSize, self).__init__(
--> 192 "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize", name, language, remote_loc)
193
194
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, java_obj, *args)
127 super(ExtendedJavaWrapper, self).__init__(java_obj)
128 self.sc = SparkContext._active_spark_context
--> 129 self._java_obj = self.new_java_obj(java_obj, *args)
130 self.java_obj = self._java_obj
131
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in new_java_obj(self, java_class, *args)
137
138 def new_java_obj(self, java_class, *args):
--> 139 return self._new_java_obj(java_class, *args)
140
141 def new_java_array(self, pylist, java_class):
/databricks/spark/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69 @staticmethod
TypeError: 'JavaPackage' object is not callable
I get a similar, if not identical, error if I try:
pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
I also get the same error for the second example. The Databricks Runtime Version is: 6.5 (includes Apache Spark 2.4.5, Scala 2.11), which is on the list of approved runtimes.
I'm not sure what the error messages mean or how to resolve them.
I found out that 'JavaPackage' object is not callable is caused by the spark-nlp assembly jars being missing. So I made sure that these jars were downloaded and then placed in BOTH the executor and driver. E.g. when building the Spark docker image, do something like:
RUN cd /opt/spark/jars && \
    wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-nlp-assembly-2.6.4.jar
and also on the driver image/machine, make sure the jar exists in the local directory. Then set:
conf.set("spark.driver.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
conf.set("spark.executor.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
The solution for Databricks might be a bit different, so instead of baking in the jars you may need to host them on S3 and refer to them that way, e.g. as sketched below.
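For example, a hypothetical sketch of how the jar could be referenced once it is hosted in a bucket you control (the path below is a placeholder; on Databricks such properties are typically set in the cluster's Spark config rather than in notebook code):
conf.set("spark.jars", "s3://your-bucket/jars/spark-nlp-assembly-2.6.4.jar")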

Databricks UDF calling an external web service cannot be serialised (PicklingError)

I am using Databricks and have a column in a dataframe that I need to update for every record with an external web service call. In this case it is using the Azure Machine Learning service SDK and does a service call. This code works fine when not run as a UDF in Spark (i.e. just Python), however it throws a serialization error when I try to call it as a UDF. The same happens if I use a lambda and a map with an RDD.
The model uses fastText and can be invoked fine from Postman or Python via a normal HTTP call, or by using the WebService SDK from AMLS - it's just when it is a UDF that it fails with this message:
TypeError: can't pickle _thread._local objects
The only workaround I can think of is to loop through each record in the dataframe sequentially and update it with a call, however this is not very efficient. I don't know if this is a Spark error or whether it happens because the service is loading a fastText model. When I use the UDF and mock a return value, it works though.
Error at the bottom...
from azureml.core.webservice import Webservice, AciWebservice
from azureml.core import Workspace

def predictModelValue2(summary, modelName, modelLabel):
    raw_data = '[{"label": "' + modelLabel + '", "model": "' + modelName + '", "as_full_account": "' + summary + '"}]'
    prediction = service.run(raw_data)
    return prediction

from pyspark.sql.types import FloatType
from pyspark.sql.functions import udf

predictModelValueUDF = udf(predictModelValue2)
DVIRCRAMFItemsDFScored1 = DVIRCRAMFItemsDF.withColumn("Result", predictModelValueUDF("Summary", "ModelName", "ModelLabel"))
TypeError: can't pickle _thread._local objects
During handling of the above exception, another exception occurred:
PicklingError Traceback (most recent call last)
in
----> 2 x = df.withColumn("Result", predictModelValueUDF("Summary", "ModelName", "ModelLabel"))
/databricks/spark/python/pyspark/sql/udf.py in wrapper(*args)
194 @functools.wraps(self.func, assigned=assignments)
195 def wrapper(*args):
--> 196 return self(*args)
197
198 wrapper.__name__ = self._name
/databricks/spark/python/pyspark/sql/udf.py in __call__(self, *cols)
172
173 def __call__(self, *cols):
--> 174 judf = self._judf
175 sc = SparkContext._active_spark_context
176 return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
/databricks/spark/python/pyspark/sql/udf.py in _judf(self)
156 # and should have a minimal performance impact.
157 if self._judf_placeholder is None:
--> 158 self._judf_placeholder = self._create_judf()
159 return self._judf_placeholder
160
/databricks/spark/python/pyspark/sql/udf.py in _create_judf(self)
165 sc = spark.sparkContext
166
--> 167 wrapped_func = _wrap_function(sc, self.func, self.returnType)
168 jdt = spark._jsparkSession.parseDataType(self.returnType.json())
169 judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
/databricks/spark/python/pyspark/sql/udf.py in _wrap_function(sc, func, returnType)
33 def _wrap_function(sc, func, returnType):
34 command = (func, returnType)
---> 35 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
36 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
37 sc.pythonVer, broadcast_vars, sc._javaAccumulator)
/databricks/spark/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
2461 # the serialized command will be compressed by broadcast
2462 ser = CloudPickleSerializer()
-> 2463 pickled_command = ser.dumps(command)
2464 if len(pickled_command) > sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc): # Default 1M
2465 # The broadcast will have same life cycle as created PythonRDD
/databricks/spark/python/pyspark/serializers.py in dumps(self, obj)
709 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
710 cloudpickle.print_exec(sys.stderr)
--> 711 raise pickle.PicklingError(msg)
712
713
PicklingError: Could not serialize object: TypeError: can't pickle _thread._local objects
I am not an expert in Databricks or Spark, but pickling functions from the local notebook context is always problematic when you are touching complex objects like the service object. In this particular case, I would recommend removing the dependency on the Azure ML service object and just using requests to call the service.
Pull the key from the service:
# retrieve the API keys. two keys were generated.
key1, key2 = service.get_keys()
scoring_uri = service.scoring_uri
You should be able to use these strings in the UDF directly without pickling issues -- here is an example of how you would call the service with just requests, applied to your UDF:
import requests, json

def predictModelValue2(summary, modelName, modelLabel):
    # adjust the field names/shape to whatever the service expects
    input_data = json.dumps({"summary": summary, "modelName": modelName, "modelLabel": modelLabel})
    headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + key1}
    # call the service for scoring
    resp = requests.post(scoring_uri, input_data, headers=headers)
    return resp.text[1]
On a side note, though: your UDF will be called for each row in your data frame, and each time it will make a network call -- that will be very slow. I would recommend looking for ways to batch the execution; a rough sketch follows below. As you can see from your constructed JSON, service.run will accept an array of items, so you should call it in batches of 100 or so.
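A minimal sketch of one way to batch the calls, reusing the scoring_uri and key1 strings from above and the payload format from the original question; the column handling and the assumption that the service returns one prediction per input item are illustrative, not a tested implementation:
import json
import requests

BATCH_SIZE = 100  # rows per request; tune to what the service can handle

def score_partition(rows):
    headers = {"Content-Type": "application/json", "Authorization": "Bearer " + key1}
    rows = list(rows)
    for start in range(0, len(rows), BATCH_SIZE):
        chunk = rows[start:start + BATCH_SIZE]
        # one HTTP call per chunk instead of one per row
        payload = json.dumps([
            {"label": r["ModelLabel"], "model": r["ModelName"], "as_full_account": r["Summary"]}
            for r in chunk
        ])
        resp = requests.post(scoring_uri, payload, headers=headers)
        predictions = json.loads(resp.text)  # assumed: a list with one prediction per input item
        for r, pred in zip(chunk, predictions):
            yield (r["Summary"], r["ModelName"], r["ModelLabel"], pred)

scored = (DVIRCRAMFItemsDF
          .select("Summary", "ModelName", "ModelLabel")
          .rdd
          .mapPartitions(score_partition)
          .toDF(["Summary", "ModelName", "ModelLabel", "Result"]))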

NameError: name 're' is not defined... already imported re in the code and built in function

I keep getting "NameError: name 're' is not defined", even though I have already imported re in my code AND it is imported inside the provided function pat_count() defined in library_s19_week2.py. I tried all the possible places to import re, but none seemed to work. Please help!
My code:
import re
hash_pat = re.compile(r'#\w+')
hash_counter = pat_count(hash_pat)
tweet_table['hash_count'] = tweet_table.apply(lambda row: hash_counter(row['tweet']), axis=1)
Traceback for the error:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-93-1880eb903ae9> in <module>()
10
11 hash_pat = re.compile(r'#\w+')
---> 12 hash_counter = pat_count(hash_pat)
13 tweet_table['hash_count'] = tweet_table.apply(lambda row: hash_counter(row['tweet']), axis=1)
14
/content/library_s19_week2.py in pat_count(pattern)
95 def pat_count(pattern):
96 import re
---> 97
98 pat = re.compile(pattern)
99
NameError: name 're' is not defined
I found my bug:
hash_pat = re.compile(r'#\w+') should be hash_pat = r'#\w+'.
As seen in the function pat_count() in the traceback, hash_pat is passed as an input to re.compile() inside pat_count(), so it should be the raw pattern string rather than an already compiled pattern.
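A minimal sketch of the corrected cell, assuming pat_count() compiles the pattern itself as the traceback shows:
hash_pat = r'#\w+'  # pass the raw pattern string; pat_count() calls re.compile() on it
hash_counter = pat_count(hash_pat)
tweet_table['hash_count'] = tweet_table.apply(lambda row: hash_counter(row['tweet']), axis=1)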