psdf.show() does not work, although the DataFrame appears to have been created. I wonder what the cause of this is.
The environment is:
PySpark: 3.2.1-hadoop3.2
Hadoop: 3.2.1
JDK: 18.0.1.1
Mode: local
The code is below:
import pyspark.pandas as ps
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('test') \
    .getOrCreate()

from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

psdf = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
psdf.show()
and the error message is:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Input In [7], in <cell line: 1>()
----> 1 psdf.show()
File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\sql\dataframe.py:494, in DataFrame.show(self, n, truncate, vertical)
491 raise TypeError("Parameter 'vertical' must be a bool")
493 if isinstance(truncate, bool) and truncate:
--> 494 print(self._jdf.showString(n, 20, vertical))
495 else:
496 try:
File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\sql\utils.py:111, in capture_sql_exception.<locals>.deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o43.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (XXXX.XXX.co.jp executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:188)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:108)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:121)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:162)
.......
If you know the solution, could you give me advice on how to solve this problem?
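For reference, a commonly suggested workaround for "Python worker failed to connect back" on Windows is to point Spark at the driver's Python interpreter explicitly before creating the session; the sketch below assumes the worker simply cannot locate the right interpreter, which is not confirmed here. Note also that Spark 3.2 officially supports Java 8 and 11, so JDK 18 itself may be a factor.

import os
import sys

# Hypothetical workaround: make Spark launch its Python workers with the
# same interpreter as the driver. Set these before getOrCreate() runs.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable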
Related
I am fitting a RandomForestClassifier to the dataset (a DecisionTreeClassifier works fine on the same dataset), and it fails with "Py4JError: An error occurred while calling o85.fit". The complete traceback is given below:
/usr/local/lib/python3.8/dist-packages/pyspark/ml/wrapper.py in _fit(self, dataset)
381
382 def _fit(self, dataset: DataFrame) -> JM:
--> 383 java_model = self._fit_java(dataset)
384 model = self._create_model(java_model)
385 return self._copyValues(model)
/usr/local/lib/python3.8/dist-packages/pyspark/ml/wrapper.py in _fit_java(self, dataset)
378
379 self._transfer_params_to_java()
--> 380 return self._java_obj.fit(dataset._jdf)
381
382 def _fit(self, dataset: DataFrame) -> JM:
/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
/usr/local/lib/python3.8/dist-packages/pyspark/sql/utils.py in deco(*a, **kw)
188 def deco(*a: Any, **kw: Any) -> Any:
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
192 converted = convert_exception(e.java_exception)
/usr/local/lib/python3.8/dist-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
332 format(target_id, ".", name, value))
333 else:
--> 334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling o85.fit
I am creating a random forest model in PySpark on the Expedia dataset and am facing this issue.
The code leading to this error is given here:
va = VectorAssembler(inputCols = x_train.columns, outputCol='features')
va_df = va.transform(x_train)
va_df = va_df.select(['features', 'hotel_cluster'])
rf = RandomForestClassifier(featuresCol="features", labelCol="hotel_cluster", numTrees=10)
dt = rf.fit(va_df) # error here
Any help in identifying the issue will be highly appreciated!
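As a hedged diagnostic sketch (assuming va_df from the code above): a bare Py4JError with no attached Java traceback often means the JVM died mid-call, and a forest over the many hotel_cluster classes needs far more memory than a single decision tree, so checking the label cardinality and materializing the frame before fitting can help localize the failure.

# Hypothetical pre-flight checks; va_df comes from the question's code.
va_df.printSchema()  # 'features' should be a vector, 'hotel_cluster' numeric
n_classes = va_df.select('hotel_cluster').distinct().count()
print('distinct labels:', n_classes)  # the forest trains over all classes at once
va_df.cache()
print('rows:', va_df.count())  # forces evaluation before the expensive fit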
I am working on a sentiment analysis project using PySpark. After preprocessing the data, I use TextBlob to determine the sentiment of each tweet. I get the result and convert it to a DataFrame like this:
# Convert RDD Back to DataFrame
File_new_df = sqlContext.createDataFrame(File_rdd_new)
File_new_df.show(5)
+--------------------+------------------+--------------------+---------+
| tweet_text| Subjectivity| Polarity|Sentiment|
+--------------------+------------------+--------------------+---------+
| tweettext| 0.0| 0.0| Neutral|
|woman faces lashe...| 0.625| 0.125| Positive|
| worldcup | 0.0| 0.0| Neutral|
|going expose leak...|0.6386363636363637|-0.03257575757575757| Negative|
|qatar whose autho...|0.8333333333333334| 0.5| Positive|
+--------------------+------------------+--------------------+---------+
only showing top 5 rows
Now I want to do the following (Sentiment is the column that holds Positive, Negative, or Neutral):
File_new_df.groupBy("Sentiment").count().show(3)
However, when I call the .count() method on the DataFrame, it throws the error below. How can I fix this?
Py4JJavaError Traceback (most recent call last)
<ipython-input-44-59bd5bd510b2> in <module>
----> 1 File_new_df.groupBy("Sentiment").count().show(3)
C:\spark\spark\python\pyspark\sql\dataframe.py in show(self, n, truncate, vertical)
482 """
483 if isinstance(truncate, bool) and truncate:
--> 484 print(self._jdf.showString(n, 20, vertical))
485 else:
486 print(self._jdf.showString(n, int(truncate), vertical))
C:\spark\spark\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
C:\spark\spark\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o705.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 81.0 failed 1 times, most recent failure: Lost task 0.0 in stage 81.0 (TID 504) (DESKTOP-95B8MQL executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 604, in main
File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 596, in process
File "C:\spark\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 259, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "C:\spark\spark\python\lib\pyspark.zip\pyspark\util.py", line 73, in wrapper
return f(*args, **kwargs)
File "<ipython-input-32-244ea6f47285>", line 21, in <lambda>
File "<ipython-input-32-244ea6f47285>", line 8, in rowwise_function
File "<ipython-input-31-d55b51a92547>", line 2, in getSubjectivity
File "C:\ProgramData\Anaconda3\lib\site-packages\textblob\blob.py", line 384, in __init__
raise TypeError('The `text` argument passed to `__init__(text)` '
TypeError: The `text` argument passed to `__init__(text)` must be a string, not <class 'NoneType'>
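The last frame shows TextBlob being handed None, which suggests some tweet_text values are still null by the time getSubjectivity runs. Below is a minimal null-safe sketch; the function body is an assumption reconstructed from the traceback, not the asker's actual code:

from textblob import TextBlob

def getSubjectivity(text):
    # TextBlob raises TypeError when given None, so guard against null tweets
    if text is None:
        return 0.0
    return TextBlob(text).sentiment.subjectivity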
I tried excluding all null values and making my label start from '0' instead of '1'; neither solved the error. The error says it failed to execute OneHotEncoder, but I am not using OneHotEncoder in my code. The following are my code and the error I got.
This is my DataFrame's schema; I dropped all null values.
low_ask_sizes:integer
ask_price:integer
collaboration:integer
released_months:integer
brand:string
cate_color:string
label:integer
Split the data into train and test
train, validation, test = data.randomSplit([0.7, 0.2, 0.1], 1234)
Convert nominal column to numerical column
from pyspark.ml.feature import StringIndexer
categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]
numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('int')]
indexers = [StringIndexer(inputCol=column,
                          outputCol='{0}_index'.format(column),
                          handleInvalid='keep')
            for column in categorical_columns]
Assemble feature columns into one feature vector column
from pyspark.ml.feature import VectorAssembler
featuresCreator = VectorAssembler(
inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns,
outputCol='features')
layers = [len(featuresCreator.getInputCols()), 4, 2, 2]
Select features (vector column) and label column
prepared_data = output.select('features', 'label')
This is the structure of prepared_data:
features:udt
label:integer
Classifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
classifier = MultilayerPerceptronClassifier(labelCol='label',
featuresCol='features',
maxIter=100,
layers=layers,
blockSize=128,
seed=1234)
Build the pipeline and start training the model
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])
model = pipeline.fit(train)
It gives me this error on the last line:
Py4JJavaError Traceback (most recent call last)
<command-2394554338211359> in <module>
2
3 pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])
----> 4 model = pipeline.fit(train)
/databricks/spark/python/pyspark/ml/base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/databricks/spark/python/pyspark/ml/pipeline.py in _fit(self, dataset)
112 dataset = stage.transform(dataset)
113 else: # must be an Estimator
--> 114 model = stage.fit(dataset)
115 transformers.append(model)
116 if i < indexOfLastEstimator:
/databricks/spark/python/pyspark/ml/base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/databricks/spark/python/pyspark/ml/wrapper.py in _fit(self, dataset)
333
334 def _fit(self, dataset):
--> 335 java_model = self._fit_java(dataset)
336 model = self._create_model(java_model)
337 return self._copyValues(model)
/databricks/spark/python/pyspark/ml/wrapper.py in _fit_java(self, dataset)
330 """
331 self._transfer_params_to_java()
--> 332 return self._java_obj.fit(dataset._jdf)
333
334 def _fit(self, dataset):
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
108 def deco(*a, **kw):
109 try:
--> 110 return f(*a, **kw)
111 except py4j.protocol.Py4JJavaError as e:
112 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o3766.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 773.0 failed 1 times, most recent failure: Lost task 0.0 in stage 773.0 (TID 1010) (ip-10-172-226-106.us-west-2.compute.internal executor driver): org.apache.spark.SparkException: Failed to execute user defined function(OneHotEncoderModel$$Lambda$6652/897874937: (double, int) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
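For context, MultilayerPerceptronClassifier encodes the label as a one-hot vector internally, which is presumably why OneHotEncoderModel appears in the trace even though it is never used explicitly; that encoding fails when a label value falls outside the range implied by the output layer. A hedged sanity check, assuming data and layers from the code above:

# Hypothetical check: the last entry of `layers` must equal the number of
# distinct label values, and labels must be integers in [0, n_classes).
n_classes = data.select('label').distinct().count()
print('distinct labels:', n_classes, '| output layer size:', layers[-1])
data.selectExpr('min(label)', 'max(label)').show()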
I'm trying to use PySpark locally on my Windows PC. To test it, I tried:
rdd = spark.sparkContext.parallelize(range(10))
rdd.collect()
But I get the following error. I hope you can help me. I have already tried installing Java 8 and Java 11, which did not work, and I also added the path to the environment variables, which did not help either. Or is it not possible at all to use PySpark locally?
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
C:\Anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
C:\Anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
Py4JJavaError: An error occurred while calling o20354.fit.
: java.lang.IllegalArgumentException: Unsupported class file major version 55
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:166)
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:148)
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:136)
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:237)
at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:49)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:517)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:500)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:236)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:134)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(ClosureCleaner.scala:500)
at org.apache.xbean.asm6.ClassReader.readCode(ClassReader.java:2175)
at org.apache.xbean.asm6.ClassReader.readMethod(ClassReader.java:1238)
at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:631)
at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:355)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:307)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:306)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:306)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2100)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:370)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:370)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.PairRDDFunctions.countByKey(PairRDDFunctions.scala:369)
at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1259)
at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1259)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.countByValue(RDD.scala:1258)
at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:140)
at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:109)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:834)
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
<ipython-input-46-451f361a414e> in <module>
2
3 indexer = StringIndexer(inputCol='class', outputCol='classIndex')
----> 4 indexed = indexer.fit(df).transform(df)
5
6 indexed.show()
C:\Anaconda3\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\Anaconda3\lib\site-packages\pyspark\ml\wrapper.py in _fit(self, dataset)
293
294 def _fit(self, dataset):
--> 295 java_model = self._fit_java(dataset)
296 model = self._create_model(java_model)
297 return self._copyValues(model)
C:\Anaconda3\lib\site-packages\pyspark\ml\wrapper.py in _fit_java(self, dataset)
290 """
291 self._transfer_params_to_java()
--> 292 return self._java_obj.fit(dataset._jdf)
293
294 def _fit(self, dataset):
C:\Anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
C:\Anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: 'Unsupported class file major version 55'
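"Unsupported class file major version 55" indicates Java 11 bytecode, and the org.apache.xbean.asm6 frames in the trace point to Spark 2.x, which supports Java 8 only. Below is a sketch of the commonly suggested workaround, assuming a Java 8 JDK is installed; the path is hypothetical and must be adjusted:

import os

# Hypothetical workaround: point Spark at a Java 8 install before the
# SparkSession/SparkContext is created (adjust the path to your JDK 8).
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk1.8.0_281"
os.environ["PATH"] = os.environ["JAVA_HOME"] + r"\bin;" + os.environ["PATH"]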
---------------------------------------------------------------------------------------
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

hdfs_options = {
    "hdfs_host": "...",
    "hdfs_port": 50070,
    "hdfs_user": "..."
}
opts = PipelineOptions(**hdfs_options)

token = run_shell_cmd('curl -s --negotiate -u : "http://nn:50070/webhdfs/v1/?op=GETDELEGATIONTOKEN"')

p = beam.Pipeline(options=opts)
p.apply(
    beam.io.ReadFromText(f"hdfs:///my_path/*.md?delegation={token}")  # does not work
);
I have the token and a delegation token file, but I am unable to authenticate with either. The error is:
Match operation failed with exceptions {'hdfs:///my_path/*.md?delegation=...': BeamIOError("List operation failed with exceptions {'hdfs:///my_path': HdfsError('Authentication failure. Check your credentials.')}")}
Stacktrace
---------------------------------------------------------------------------
BeamIOError Traceback (most recent call last)
<ipython-input-251-127e501adfaa> in <module>()
2
3 p.apply(
----> 4 beam.io.ReadFromText(f"hdfs:///my_path/*.md?delegation={token}")
5 );
/root/miniconda3/lib/python3.7/site-packages/apache_beam/io/textio.py in __init__(self, file_pattern, min_bundle_size, compression_type, strip_trailing_newlines, coder, validate, skip_header_lines, **kwargs)
540 file_pattern, min_bundle_size, compression_type,
541 strip_trailing_newlines, coder, validate=validate,
--> 542 skip_header_lines=skip_header_lines)
543
544 def expand(self, pvalue):
/root/miniconda3/lib/python3.7/site-packages/apache_beam/io/textio.py in __init__(self, file_pattern, min_bundle_size, compression_type, strip_trailing_newlines, coder, buffer_size, validate, skip_header_lines, header_processor_fns)
124 super(_TextSource, self).__init__(file_pattern, min_bundle_size,
125 compression_type=compression_type,
--> 126 validate=validate)
127
128 self._strip_trailing_newlines = strip_trailing_newlines
/root/miniconda3/lib/python3.7/site-packages/apache_beam/io/filebasedsource.py in __init__(self, file_pattern, min_bundle_size, compression_type, splittable, validate)
123 self._splittable = splittable
124 if validate and file_pattern.is_accessible():
--> 125 self._validate()
126
127 def display_data(self):
/root/miniconda3/lib/python3.7/site-packages/apache_beam/options/value_provider.py in _f(self, *args, **kwargs)
138 if not obj.is_accessible():
139 raise error.RuntimeValueProviderError('%s not accessible' % obj)
--> 140 return fnc(self, *args, **kwargs)
141 return _f
142 return _check_accessible
/root/miniconda3/lib/python3.7/site-packages/apache_beam/io/filebasedsource.py in _validate(self)
181
182 # Limit the responses as we only want to check if something exists
--> 183 match_result = FileSystems.match([pattern], limits=[1])[0]
184 if len(match_result.metadata_list) <= 0:
185 raise IOError(
/root/miniconda3/lib/python3.7/site-packages/apache_beam/io/filesystems.py in match(patterns, limits)
198 return []
199 filesystem = FileSystems.get_filesystem(patterns[0])
--> 200 return filesystem.match(patterns, limits)
201
202 #staticmethod
/root/miniconda3/lib/python3.7/site-packages/apache_beam/io/filesystem.py in match(self, patterns, limits)
718
719 if exceptions:
--> 720 raise BeamIOError("Match operation failed", exceptions)
721 return result
722
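For reference, Beam's HDFS filesystem is built on the hdfs (HdfsCLI) Python package, which also provides a TokenClient that authenticates WebHDFS calls with a delegation token directly. The sketch below only verifies the token outside Beam (the nn host and /my_path are the question's placeholders); whether Beam's ReadFromText can be made to use a TokenClient is not confirmed here:

from hdfs import TokenClient

# TokenClient appends ?delegation=<token> to every WebHDFS request,
# so no Kerberos negotiation is needed once the token is valid.
client = TokenClient('http://nn:50070', token)  # `token` from the curl call above
print(client.list('/my_path'))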